From 4b239f458c229de044d6905c2b0f9fe16ed9e01e Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Fri, 17 Dec 2010 16:58:28 -0800 Subject: [PATCH 001/706] x86-64, mm: Put early page table high While debugging kdump, I found that the current kernel has a problem with crashkernel=512M. It turns out that the initial mapping goes up to 512M, so the later mapping up to 4G (actually 2040M on my platform) puts its page tables near 512M, and the mapping up to 128g then puts its page tables near 2g. before this patch: [ 0.000000] initial memory mapped : 0 - 20000000 [ 0.000000] init_memory_mapping: [0x00000000000000-0x0000007f74ffff] [ 0.000000] 0000000000 - 007f600000 page 2M [ 0.000000] 007f600000 - 007f750000 page 4k [ 0.000000] kernel direct mapping tables up to 7f750000 @ [0x1fffc000-0x1fffffff] [ 0.000000] memblock_x86_reserve_range: [0x1fffc000-0x1fffdfff] PGTABLE [ 0.000000] init_memory_mapping: [0x00000100000000-0x0000207fffffff] [ 0.000000] 0100000000 - 2080000000 page 2M [ 0.000000] kernel direct mapping tables up to 2080000000 @ [0x7bc01000-0x7bc83fff] [ 0.000000] memblock_x86_reserve_range: [0x7bc01000-0x7bc7efff] PGTABLE [ 0.000000] RAMDISK: 7bc84000 - 7f745000 [ 0.000000] crashkernel reservation failed - No suitable area found. after patch: [ 0.000000] initial memory mapped : 0 - 20000000 [ 0.000000] init_memory_mapping: [0x00000000000000-0x0000007f74ffff] [ 0.000000] 0000000000 - 007f600000 page 2M [ 0.000000] 007f600000 - 007f750000 page 4k [ 0.000000] kernel direct mapping tables up to 7f750000 @ [0x7f74c000-0x7f74ffff] [ 0.000000] memblock_x86_reserve_range: [0x7f74c000-0x7f74dfff] PGTABLE [ 0.000000] init_memory_mapping: [0x00000100000000-0x0000207fffffff] [ 0.000000] 0100000000 - 2080000000 page 2M [ 0.000000] kernel direct mapping tables up to 2080000000 @ [0x207ff7d000-0x207fffffff] [ 0.000000] memblock_x86_reserve_range: [0x207ff7d000-0x207fffafff] PGTABLE [ 0.000000] RAMDISK: 7bc84000 - 7f745000 [ 0.000000] memblock_x86_reserve_range: [0x17000000-0x36ffffff] CRASH KERNEL [ 0.000000] Reserving 512MB of memory at 368MB for crashkernel (System RAM: 133120MB) With the patch, the page tables for [0, 2g) are placed near 2g instead of under 512M, and the page tables for [4g, 128g) are placed near 128g instead of under 2g. That is good: on systems with lots of memory above 4g, like 1024g, 2048g or 16T, the related page tables no longer land under 2g, where they would otherwise have a chance to fill up the space below 2g when 1G or 2M pages are not used. The code change adds map_low_page() and updates unmap_low_page() for 64bit, and uses them to access the corresponding high memory when setting up the page tables.
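As a rough illustration of the placement policy, here is a minimal userspace sketch (not the kernel code: the helper names are made up, and the 4k-page tail is ignored, so the total comes out a little smaller than the 16K actually reserved in the log above). It sizes the tables the same way find_early_table_space() does and then carves them out of the top of the range being mapped instead of low memory:

	#include <stdint.h>
	#include <stdio.h>

	#define PAGE_SIZE 4096ULL
	#define PMD_SIZE  (2ULL << 20)		/* one PMD entry maps 2M */
	#define PUD_SIZE  (1ULL << 30)		/* one PUD entry maps 1G */

	/* Upper bound for the tables mapping [0, end) with 2M pages:
	 * 8 bytes per PUD/PMD entry, rounded up to whole pages --
	 * roughly 0.5KB of table per GB mapped. */
	static uint64_t table_space(uint64_t end)
	{
		uint64_t puds = (end + PUD_SIZE - 1) / PUD_SIZE;
		uint64_t pmds = (end + PMD_SIZE - 1) / PMD_SIZE;
		uint64_t tables = 0;

		tables += ((puds * 8 + PAGE_SIZE - 1) / PAGE_SIZE) * PAGE_SIZE;
		tables += ((pmds * 8 + PAGE_SIZE - 1) / PAGE_SIZE) * PAGE_SIZE;
		return tables;
	}

	/* Top-down placement: highest page-aligned block below good_end,
	 * standing in for memblock_find_in_range(start, good_end, ...). */
	static uint64_t place_tables(uint64_t good_end, uint64_t tables)
	{
		return (good_end - tables) & ~(PAGE_SIZE - 1);
	}

	int main(void)
	{
		uint64_t end  = 0x7f750000ULL;	/* ~2040M, as in the log */
		uint64_t need = table_space(end);

		printf("%llu KB of tables at %#llx\n",
		       (unsigned long long)(need >> 10),
		       (unsigned long long)place_tables(end, need));
		return 0;
	}

Compiled and run, this prints 12 KB of tables at 0x7f74d000 -- just under the end of the range being mapped, which is the behavior the "after patch" log shows.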
Signed-off-by: Yinghai Lu LKML-Reference: <4D0C0734.7060900@kernel.org> Signed-off-by: H. Peter Anvin --- arch/x86/mm/init.c | 9 +++---- arch/x86/mm/init_64.c | 63 +++++++++++++++++++------------------ 2 files changed, 30 insertions(+), 42 deletions(-) diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c index c0e28a13de7d..5863950ebe0c 100644 --- a/arch/x86/mm/init.c +++ b/arch/x86/mm/init.c @@ -33,7 +33,7 @@ int direct_gbpages static void __init find_early_table_space(unsigned long end, int use_pse, int use_gbpages) { - unsigned long puds, pmds, ptes, tables, start; + unsigned long puds, pmds, ptes, tables, start = 0, good_end = end; phys_addr_t base; puds = (end + PUD_SIZE - 1) >> PUD_SHIFT; @@ -73,12 +73,9 @@ static void __init find_early_table_space(unsigned long end, int use_pse, * need roughly 0.5KB per GB. */ #ifdef CONFIG_X86_32 - start = 0x7000; -#else - start = 0x8000; + good_end = max_pfn_mapped << PAGE_SHIFT; #endif - base = memblock_find_in_range(start, max_pfn_mapped<<PAGE_SHIFT, tables, PAGE_SIZE); + base = memblock_find_in_range(start, good_end, tables, PAGE_SIZE); From: Yinghai Lu Date: Fri, 17 Dec 2010 16:58:40 -0800 Subject: [PATCH 002/706] x86-64, gart: Fix allocation with memblock While converting alloc_bootmem to memblock with real top-down allocation, I found one old system where the conversion fails: [ 0.000000] Node 0: aperture @ ac000000 size 64 MB [ 0.000000] Aperture pointing to e820 RAM. Ignoring. [ 0.000000] Your BIOS doesn't leave a aperture memory hole [ 0.000000] Please enable the IOMMU option in the BIOS setup [ 0.000000] This costs you 64 MB of RAM [ 0.000000] memblock_x86_reserve_range: [0x2020000000-0x2023ffffff] aperture64 [ 0.000000] Cannot allocate aperture memory hole (ffff882020000000,65536K) [ 0.000000] memblock_x86_free_range: [0x2020000000-0x2023ffffff] [ 0.000000] Kernel panic - not syncing: Not enough memory for aperture [ 0.000000] Pid: 0, comm: swapper Not tainted 2.6.37-rc5-tip-yh-06229-gb792dc2-dirty #331 [ 0.000000] Call Trace: [ 0.000000] [] ? panic+0x91/0x1a3 [ 0.000000] [] ? gart_iommu_hole_init+0x3d7/0x4a3 [ 0.000000] [] ? _etext+0x0/0x3 [ 0.000000] [] ? pci_iommu_alloc+0x47/0x71 [ 0.000000] [] ? mem_init+0x19/0xec [ 0.000000] [] ? start_kernel+0x20a/0x3e8 [ 0.000000] [] ? x86_64_start_reservations+0x9c/0xa0 [ 0.000000] [] ? x86_64_start_kernel+0x114/0x11b This means __alloc_bootmem_nopanic() returns an address too high for the aperture. Use memblock_find_in_range() with the limit directly.
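The shape of the fix, as a self-contained sketch; find_in_range() below is a toy stand-in for memblock_find_in_range(), assumed here to return the highest suitably aligned block in the range or a sentinel. Unlike bootmem's NULL pointer, failure comes back as MEMBLOCK_ERROR, and the caller re-checks the 32-bit bound itself because the GART aperture base register is only 32 bits wide:

	#include <stdint.h>
	#include <stdio.h>

	#define MEMBLOCK_ERROR 0ULL	/* failure sentinel, not a NULL pointer */

	/* Toy stand-in: highest block of 'size' bytes below 'end',
	 * aligned to 'align', or MEMBLOCK_ERROR if it falls under 'start'. */
	static uint64_t find_in_range(uint64_t start, uint64_t end,
				      uint64_t size, uint64_t align)
	{
		uint64_t addr = (end - size) & ~(align - 1);
		return addr >= start ? addr : MEMBLOCK_ERROR;
	}

	int main(void)
	{
		uint32_t aper_size = 64U << 20;

		/* As in the patch: search below 4G with 512M alignment, which
		 * also keeps the hole out of the low 512M used by the kernel. */
		uint64_t addr = find_in_range(0, 1ULL << 32, aper_size,
					      512ULL << 20);

		if (addr == MEMBLOCK_ERROR || addr + aper_size > 0xffffffffULL) {
			fprintf(stderr, "Cannot allocate aperture memory hole\n");
			return 1;
		}
		/* the real code reserves [addr, addr + aper_size) here */
		printf("aperture at %#llx\n", (unsigned long long)addr);
		return 0;
	}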
Signed-off-by: Yinghai Lu LKML-Reference: <4D0C0740.90104@kernel.org> Signed-off-by: H. Peter Anvin --- arch/x86/kernel/aperture_64.c | 33 ++++++++++++++++----------------- 1 file changed, 16 insertions(+), 17 deletions(-) diff --git a/arch/x86/kernel/aperture_64.c b/arch/x86/kernel/aperture_64.c index dcd7c83e1659..85f66b4f4fee 100644 --- a/arch/x86/kernel/aperture_64.c +++ b/arch/x86/kernel/aperture_64.c @@ -13,7 +13,7 @@ #include #include #include -#include <linux/bootmem.h> +#include <linux/memblock.h> #include #include #include @@ -69,7 +69,7 @@ static void __init insert_aperture_resource(u32 aper_base, u32 aper_size) static u32 __init allocate_aperture(void) { u32 aper_size; - void *p; + unsigned long addr; /* aper_size should <= 1G */ if (fallback_aper_order > 5) @@ -95,27 +95,26 @@ static u32 __init allocate_aperture(void) * so don't use 512M below as gart iommu, leave the space for kernel * code for safe */ - p = __alloc_bootmem_nopanic(aper_size, aper_size, 512ULL<<20); + addr = memblock_find_in_range(0, 1ULL<<32, aper_size, 512ULL<<20); + if (addr == MEMBLOCK_ERROR || addr + aper_size > 0xffffffff) { + printk(KERN_ERR + "Cannot allocate aperture memory hole (%lx,%uK)\n", + addr, aper_size>>10); + return 0; + } + memblock_x86_reserve_range(addr, addr + aper_size, "aperture64"); /* * Kmemleak should not scan this block as it may not be mapped via the * kernel direct mapping. */ - kmemleak_ignore(p); - if (!p || __pa(p)+aper_size > 0xffffffff) { - printk(KERN_ERR - "Cannot allocate aperture memory hole (%p,%uK)\n", - p, aper_size>>10); - if (p) - free_bootmem(__pa(p), aper_size); - return 0; - } + kmemleak_ignore(phys_to_virt(addr)); printk(KERN_INFO "Mapping aperture over %d KB of RAM @ %lx\n", - aper_size >> 10, __pa(p)); - insert_aperture_resource((u32)__pa(p), aper_size); - register_nosave_region((u32)__pa(p) >> PAGE_SHIFT, - (u32)__pa(p+aper_size) >> PAGE_SHIFT); + aper_size >> 10, addr); + insert_aperture_resource((u32)addr, aper_size); + register_nosave_region(addr >> PAGE_SHIFT, + (addr+aper_size) >> PAGE_SHIFT); - return (u32)__pa(p); + return (u32)addr; } From 1a4a678b12c84db9ae5dce424e0e97f0559bb57c Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Fri, 17 Dec 2010 16:59:07 -0800 Subject: [PATCH 003/706] memblock: Make find_memory_core_early() find from top-down find_memory_core_early() is used to find RAM inside a node for bootmem-type allocations. Make it search top-down, so that it is consistent with memblock_find and avoids allocating potentially valuable low memory before we actually need it.
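The direction change in miniature, as a stand-alone sketch with a made-up range table standing in for early_node_map[]: walking the table backwards means the first fit found is also the highest one, matching memblock's top-down behavior.

	#include <stdint.h>
	#include <stdio.h>

	struct range { uint64_t start, end; int nid; };

	/* Hypothetical stand-in for early_node_map[]. */
	static const struct range map[] = {
		{ 0x00100000ULL,  0x7f750000ULL,   0 },
		{ 0x100000000ULL, 0x1080000000ULL, 0 },
		{ 0x1080000000ULL, 0x2080000000ULL, 1 },
	};

	/* Scan from the last entry downward and take the highest range on
	 * node 'nid' that can hold 'size' bytes -- the reverse of the old
	 * bottom-up for_each_active_range_index_in_nid() walk. */
	static uint64_t find_top_down(int nid, uint64_t size)
	{
		for (int i = (int)(sizeof(map) / sizeof(map[0])) - 1; i >= 0; i--) {
			if (map[i].nid != nid || map[i].end - map[i].start < size)
				continue;
			return map[i].end - size;  /* place at the top of the range */
		}
		return 0;	/* nothing found */
	}

	int main(void)
	{
		printf("node 0: %#llx\n",
		       (unsigned long long)find_top_down(0, 1ULL << 20));
		return 0;
	}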
Suggested-by: Jeremy Fitzhardinge Signed-off-by: Yinghai Lu LKML-Reference: <4D0C075B.3040501@kernel.org> Signed-off-by: H. Peter Anvin --- mm/page_alloc.c | 34 +++++++++++++++++++++++++++++++++- 1 file changed, 33 insertions(+), 1 deletion(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 07a654486f75..19413bfdef92 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -3555,6 +3555,34 @@ static int __meminit next_active_region_index_in_nid(int index, int nid) return -1; } +/* + * Basic iterator support. Return the last range of PFNs for a node + * Note: nid == MAX_NUMNODES returns last region regardless of node + */ +static int __meminit last_active_region_index_in_nid(int nid) +{ + int i; + + for (i = nr_nodemap_entries - 1; i >= 0; i--) + if (nid == MAX_NUMNODES || early_node_map[i].nid == nid) + return i; + + return -1; +} + +/* + * Basic iterator support. Return the previous active range of PFNs for a node + * Note: nid == MAX_NUMNODES returns previous region regardless of node + */ +static int __meminit previous_active_region_index_in_nid(int index, int nid) +{ + for (index = index - 1; index >= 0; index--) + if (nid == MAX_NUMNODES || early_node_map[index].nid == nid) + return index; + + return -1; +} + #ifndef CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID /* * Required by SPARSEMEM. Given a PFN, return what node the PFN is on. @@ -3606,6 +3634,10 @@ bool __meminit early_pfn_in_nid(unsigned long pfn, int node) for (i = first_active_region_index_in_nid(nid); i != -1; \ i = next_active_region_index_in_nid(i, nid)) +#define for_each_active_range_index_in_nid_reverse(i, nid) \ + for (i = last_active_region_index_in_nid(nid); i != -1; \ + i = previous_active_region_index_in_nid(i, nid)) + /** * free_bootmem_with_active_regions - Call free_bootmem_node for each active range * @nid: The node to free memory on. If MAX_NUMNODES, all nodes are freed. @@ -3644,7 +3676,7 @@ u64 __init find_memory_core_early(int nid, u64 size, u64 align, int i; /* Need to go over early_node_map to find out good range for node */ - for_each_active_range_index_in_nid(i, nid) { + for_each_active_range_index_in_nid_reverse(i, nid) { u64 addr; u64 ei_start, ei_last; u64 final_start, final_end; From 45635ab5e41bcde94a82f9a05d660ef77fe38c1b Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Mon, 27 Dec 2010 16:47:54 -0800 Subject: [PATCH 004/706] x86: Change get_max_mapped() to inline Move it into a header file, to prepare for using it in other files. [ hpa: added missing and changed type to phys_addr_t. ] Signed-off-by: Yinghai Lu LKML-Reference: <4D1933BA.8000508@kernel.org> Signed-off-by: H. Peter Anvin --- arch/x86/include/asm/page_types.h | 6 ++++++ arch/x86/kernel/setup.c | 9 --------- 2 files changed, 6 insertions(+), 9 deletions(-) diff --git a/arch/x86/include/asm/page_types.h b/arch/x86/include/asm/page_types.h index 1df66211fd1b..93626e699679 100644 --- a/arch/x86/include/asm/page_types.h +++ b/arch/x86/include/asm/page_types.h @@ -2,6 +2,7 @@ #define _ASM_X86_PAGE_DEFS_H #include +#include /* PAGE_SHIFT determines the page size */ #define PAGE_SHIFT 12 @@ -45,6 +46,11 @@ extern int devmem_is_allowed(unsigned long pagenr); extern unsigned long max_low_pfn_mapped; extern unsigned long max_pfn_mapped; +static inline phys_addr_t get_max_mapped(void) +{ + return (phys_addr_t)max_pfn_mapped << PAGE_SHIFT; +} + extern unsigned long init_memory_mapping(unsigned long start, unsigned long end); diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index df172c1e8238..3def8c9a5dc9 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -669,15 +669,6 @@ static int __init parse_reservelow(char *p) early_param("reservelow", parse_reservelow); -static u64 __init get_max_mapped(void) -{ - u64 end = max_pfn_mapped; - - end <<= PAGE_SHIFT; - - return end; -} - /* * Determine if we were loaded by an EFI loader. If so, then we have also been * passed the efi memmap, systab, etc., so we should use these data structures From dbef7b56d2fc5115f26f72a0b080283bbf972cab Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Mon, 27 Dec 2010 16:48:08 -0800 Subject: [PATCH 005/706] x86-64, numa: Allocate memnodemap under max_pfn_mapped We need to access the memnodemap right away, so make sure it is mapped already. This prepares for putting the page tables on the local node, since the nodemap is used before that happens. Signed-off-by: Yinghai Lu LKML-Reference: <4D1933C8.7060105@kernel.org> Signed-off-by: H. Peter Anvin --- arch/x86/mm/numa_64.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/mm/numa_64.c b/arch/x86/mm/numa_64.c index 7762a517d69d..02d36ff85ebd 100644 --- a/arch/x86/mm/numa_64.c +++ b/arch/x86/mm/numa_64.c @@ -87,7 +87,7 @@ static int __init allocate_cachealigned_memnodemap(void) addr = 0x8000; nodemap_size = roundup(sizeof(s16) * memnodemapsize, L1_CACHE_BYTES); - nodemap_addr = memblock_find_in_range(addr, max_pfn<<PAGE_SHIFT, nodemap_size, L1_CACHE_BYTES); + nodemap_addr = memblock_find_in_range(addr, get_max_mapped(), nodemap_size, L1_CACHE_BYTES); From: Yinghai Lu Date: Mon, 27 Dec 2010 16:48:17 -0800 Subject: [PATCH 006/706] x86-64, numa: Put pgtable to local node memory Introduce init_memory_mapping_high(), and use it on 64bit. For every memory segment above 4g, it creates the page tables within the memory range itself. Before this patch, all page tables were on one node; with this patch, one RED-PEN is killed. Debug output for an 8-socket system after the patch: [ 0.000000] initial memory mapped : 0 - 20000000 [ 0.000000] init_memory_mapping: [0x00000000000000-0x0000007f74ffff] [ 0.000000] 0000000000 - 007f600000 page 2M [ 0.000000] 007f600000 - 007f750000 page 4k [ 0.000000] kernel direct mapping tables up to 7f750000 @ [0x7f74c000-0x7f74ffff] [ 0.000000] RAMDISK: 7bc84000 - 7f745000 ....
[ 0.000000] Adding active range (0, 0x10, 0x95) 0 entries of 3200 used [ 0.000000] Adding active range (0, 0x100, 0x7f750) 1 entries of 3200 used [ 0.000000] Adding active range (0, 0x100000, 0x1080000) 2 entries of 3200 used [ 0.000000] Adding active range (1, 0x1080000, 0x2080000) 3 entries of 3200 used [ 0.000000] Adding active range (2, 0x2080000, 0x3080000) 4 entries of 3200 used [ 0.000000] Adding active range (3, 0x3080000, 0x4080000) 5 entries of 3200 used [ 0.000000] Adding active range (4, 0x4080000, 0x5080000) 6 entries of 3200 used [ 0.000000] Adding active range (5, 0x5080000, 0x6080000) 7 entries of 3200 used [ 0.000000] Adding active range (6, 0x6080000, 0x7080000) 8 entries of 3200 used [ 0.000000] Adding active range (7, 0x7080000, 0x8080000) 9 entries of 3200 used [ 0.000000] init_memory_mapping: [0x00000100000000-0x0000107fffffff] [ 0.000000] 0100000000 - 1080000000 page 2M [ 0.000000] kernel direct mapping tables up to 1080000000 @ [0x107ffbd000-0x107fffffff] [ 0.000000] memblock_x86_reserve_range: [0x107ffc2000-0x107fffffff] PGTABLE [ 0.000000] init_memory_mapping: [0x00001080000000-0x0000207fffffff] [ 0.000000] 1080000000 - 2080000000 page 2M [ 0.000000] kernel direct mapping tables up to 2080000000 @ [0x207ff7d000-0x207fffffff] [ 0.000000] memblock_x86_reserve_range: [0x207ffc0000-0x207fffffff] PGTABLE [ 0.000000] init_memory_mapping: [0x00002080000000-0x0000307fffffff] [ 0.000000] 2080000000 - 3080000000 page 2M [ 0.000000] kernel direct mapping tables up to 3080000000 @ [0x307ff3d000-0x307fffffff] [ 0.000000] memblock_x86_reserve_range: [0x307ffc0000-0x307fffffff] PGTABLE [ 0.000000] init_memory_mapping: [0x00003080000000-0x0000407fffffff] [ 0.000000] 3080000000 - 4080000000 page 2M [ 0.000000] kernel direct mapping tables up to 4080000000 @ [0x407fefd000-0x407fffffff] [ 0.000000] memblock_x86_reserve_range: [0x407ffc0000-0x407fffffff] PGTABLE [ 0.000000] init_memory_mapping: [0x00004080000000-0x0000507fffffff] [ 0.000000] 4080000000 - 5080000000 page 2M [ 0.000000] kernel direct mapping tables up to 5080000000 @ [0x507febd000-0x507fffffff] [ 0.000000] memblock_x86_reserve_range: [0x507ffc0000-0x507fffffff] PGTABLE [ 0.000000] init_memory_mapping: [0x00005080000000-0x0000607fffffff] [ 0.000000] 5080000000 - 6080000000 page 2M [ 0.000000] kernel direct mapping tables up to 6080000000 @ [0x607fe7d000-0x607fffffff] [ 0.000000] memblock_x86_reserve_range: [0x607ffc0000-0x607fffffff] PGTABLE [ 0.000000] init_memory_mapping: [0x00006080000000-0x0000707fffffff] [ 0.000000] 6080000000 - 7080000000 page 2M [ 0.000000] kernel direct mapping tables up to 7080000000 @ [0x707fe3d000-0x707fffffff] [ 0.000000] memblock_x86_reserve_range: [0x707ffc0000-0x707fffffff] PGTABLE [ 0.000000] init_memory_mapping: [0x00007080000000-0x0000807fffffff] [ 0.000000] 7080000000 - 8080000000 page 2M [ 0.000000] kernel direct mapping tables up to 8080000000 @ [0x807fdfc000-0x807fffffff] [ 0.000000] memblock_x86_reserve_range: [0x807ffbf000-0x807fffffff] PGTABLE [ 0.000000] Initmem setup node 0 [0000000000000000-000000107fffffff] [ 0.000000] NODE_DATA [0x0000107ffbd000-0x0000107ffc1fff] [ 0.000000] Initmem setup node 1 [0000001080000000-000000207fffffff] [ 0.000000] NODE_DATA [0x0000207ffbb000-0x0000207ffbffff] [ 0.000000] Initmem setup node 2 [0000002080000000-000000307fffffff] [ 0.000000] NODE_DATA [0x0000307ffbb000-0x0000307ffbffff] [ 0.000000] Initmem setup node 3 [0000003080000000-000000407fffffff] [ 0.000000] NODE_DATA [0x0000407ffbb000-0x0000407ffbffff] [ 0.000000] Initmem setup node 4 
[0000004080000000-000000507fffffff] [ 0.000000] NODE_DATA [0x0000507ffbb000-0x0000507ffbffff] [ 0.000000] Initmem setup node 5 [0000005080000000-000000607fffffff] [ 0.000000] NODE_DATA [0x0000607ffbb000-0x0000607ffbffff] [ 0.000000] Initmem setup node 6 [0000006080000000-000000707fffffff] [ 0.000000] NODE_DATA [0x0000707ffbb000-0x0000707ffbffff] [ 0.000000] Initmem setup node 7 [0000007080000000-000000807fffffff] [ 0.000000] NODE_DATA [0x0000807ffba000-0x0000807ffbefff] Signed-off-by: Yinghai Lu LKML-Reference: <4D1933D1.9020609@kernel.org> Signed-off-by: H. Peter Anvin --- arch/x86/include/asm/page_types.h | 2 ++ arch/x86/kernel/setup.c | 8 ----- arch/x86/mm/amdtopology_64.c | 8 +++-- arch/x86/mm/init.c | 8 +---- arch/x86/mm/init_64.c | 54 +++++++++++++++++++++++++++++++ arch/x86/mm/numa_64.c | 6 ++-- arch/x86/mm/srat_64.c | 2 ++ 7 files changed, 68 insertions(+), 20 deletions(-) diff --git a/arch/x86/include/asm/page_types.h b/arch/x86/include/asm/page_types.h index 93626e699679..731d211a1b20 100644 --- a/arch/x86/include/asm/page_types.h +++ b/arch/x86/include/asm/page_types.h @@ -54,6 +54,8 @@ static inline phys_addr_t get_max_mapped(void) extern unsigned long init_memory_mapping(unsigned long start, unsigned long end); +void init_memory_mapping_high(void); + extern void initmem_init(unsigned long start_pfn, unsigned long end_pfn, int acpi, int k8); extern void free_initmem(void); diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 3def8c9a5dc9..fc0fe743f3a1 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -931,14 +931,6 @@ void __init setup_arch(char **cmdline_p) max_low_pfn_mapped = init_memory_mapping(0, max_low_pfn<<PAGE_SHIFT); max_pfn_mapped = max_low_pfn_mapped; -#ifdef CONFIG_X86_64 - if (max_pfn > max_low_pfn) { - max_pfn_mapped = init_memory_mapping(1UL<<32, - max_pfn<<PAGE_SHIFT); - /* can we preseve max_low_pfn ?*/ - max_low_pfn = max_pfn; - } -#endif diff --git a/arch/x86/mm/amdtopology_64.c b/arch/x86/mm/amdtopology_64.c --- a/arch/x86/mm/amdtopology_64.c +++ b/arch/x86/mm/amdtopology_64.c + memblock_x86_register_active_regions(i, nodes[i].start >> PAGE_SHIFT, nodes[i].end >> PAGE_SHIFT); + init_memory_mapping_high(); + for_each_node_mask(i, node_possible_map) { + int j; + for (j = apicid_base; j < cores + apicid_base; j++) apicid_to_node[(i << bits) + j] = i; setup_node_bootmem(i, nodes[i].start, nodes[i].end); diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c index 5863950ebe0c..fa6fe756d912 100644 --- a/arch/x86/mm/init.c +++ b/arch/x86/mm/init.c @@ -65,16 +65,10 @@ static void __init find_early_table_space(unsigned long end, int use_pse, #ifdef CONFIG_X86_32 /* for fixmap */ tables += roundup(__end_of_fixed_addresses * sizeof(pte_t), PAGE_SIZE); -#endif - /* - * RED-PEN putting page tables only on node 0 could - * cause a hotspot and fill up ZONE_DMA. The page tables - * need roughly 0.5KB per GB. 
- */ -#ifdef CONFIG_X86_32 good_end = max_pfn_mapped << PAGE_SHIFT; #endif + base = memblock_find_in_range(start, good_end, tables, PAGE_SIZE); if (base == MEMBLOCK_ERROR) panic("Cannot find space for the kernel page tables"); diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index 024847dc81ab..194f2732ab77 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -607,9 +607,63 @@ void __init initmem_init(unsigned long start_pfn, unsigned long end_pfn, int acpi, int k8) { memblock_x86_register_active_regions(0, start_pfn, end_pfn); + init_memory_mapping_high(); } #endif +struct mapping_work_data { + unsigned long start; + unsigned long end; + unsigned long pfn_mapped; +}; + +static int __init_refok +mapping_work_fn(unsigned long start_pfn, unsigned long end_pfn, void *datax) +{ + struct mapping_work_data *data = datax; + unsigned long pfn_mapped; + unsigned long final_start, final_end; + + final_start = max_t(unsigned long, start_pfn<<PAGE_SHIFT, data->start); + final_end = min_t(unsigned long, end_pfn<<PAGE_SHIFT, data->end); + + if (final_end <= final_start) + return 0; + + pfn_mapped = init_memory_mapping(final_start, final_end); + + if (pfn_mapped > data->pfn_mapped) + data->pfn_mapped = pfn_mapped; + + return 0; +} + +static unsigned long __init_refok +init_memory_mapping_active_regions(unsigned long start, unsigned long end) +{ + struct mapping_work_data data; + + data.start = start; + data.end = end; + data.pfn_mapped = 0; + + work_with_active_regions(MAX_NUMNODES, mapping_work_fn, &data); + + return data.pfn_mapped; +} + +void __init_refok init_memory_mapping_high(void) +{ + if (max_pfn > max_low_pfn) { + max_pfn_mapped = init_memory_mapping_active_regions(1UL<<32, + max_pfn<<PAGE_SHIFT); + max_low_pfn = max_pfn; + } +} diff --git a/arch/x86/mm/numa_64.c b/arch/x86/mm/numa_64.c --- a/arch/x86/mm/numa_64.c +++ b/arch/x86/mm/numa_64.c - memblock_x86_register_active_regions(i, nodes[i].start >> PAGE_SHIFT, nodes[i].end >> PAGE_SHIFT); + init_memory_mapping_high(); + for_each_node_mask(i, node_possible_map) setup_node_bootmem(i, nodes[i].start, nodes[i].end); - } acpi_fake_nodes(nodes, num_nodes); numa_init_array(); return 0; @@ -645,6 +646,7 @@ void __init initmem_init(unsigned long start_pfn, unsigned long last_pfn, for (i = 0; i < nr_cpu_ids; i++) numa_set_node(i, 0); memblock_x86_register_active_regions(0, start_pfn, last_pfn); + init_memory_mapping_high(); setup_node_bootmem(0, start_pfn << PAGE_SHIFT, last_pfn << PAGE_SHIFT); } diff --git a/arch/x86/mm/srat_64.c b/arch/x86/mm/srat_64.c index a35cb9d8b060..0b961c8bffb4 100644 --- a/arch/x86/mm/srat_64.c +++ b/arch/x86/mm/srat_64.c @@ -433,6 +433,8 @@ int __init acpi_scan_nodes(unsigned long start, unsigned long end) return -1; } + init_memory_mapping_high(); + /* Account for nodes with cpus and no memory */ nodes_or(node_possible_map, nodes_parsed, cpu_nodes_parsed); From f005fe12b90c5b9fe180a09209a893e09affa8aa Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Mon, 27 Dec 2010 16:48:32 -0800 Subject: [PATCH 007/706] x86-64: Move out cleanup higmap [_brk_end, _end) out of init_memory_mapping() This cleanup is not related to init_memory_mapping(), and init_memory_mapping() keeps getting bigger. So split it out into a separate function and call it from reserve_brk(), which is the point where _brk_end is final.
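To see why a separate pass is needed at all, consider the 2M granularity (a hedged userspace sketch; the symbol addresses below are made up): cleanup_highmap() can only trim mappings beyond _end, so whenever the finalized _brk_end and _end fall on different 2M pages, the PMD entries covering (_brk_end, _end] stay live until something clears them explicitly.

	#include <stdint.h>
	#include <stdio.h>

	#define PMD_SIZE  (2ULL << 20)		/* one PMD entry maps 2M */
	#define PMD_MASK  (~(PMD_SIZE - 1))

	int main(void)
	{
		/* Hypothetical values for the link-time symbols. */
		uint64_t brk_end = 0xffffffff81a3c000ULL;
		uint64_t end     = 0xffffffff81c5d000ULL;

		if ((brk_end & PMD_MASK) != ((end - 1) & PMD_MASK))
			printf("[_brk_end, _end) spans %llu extra 2M page(s); "
			       "their PMDs must be cleared\n",
			       (unsigned long long)(((end - 1) & PMD_MASK)
						    - (brk_end & PMD_MASK))
						   / PMD_SIZE);
		return 0;
	}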
Signed-off-by: Yinghai Lu LKML-Reference: <4D1933E0.7090305@kernel.org> Signed-off-by: H. Peter Anvin --- arch/x86/kernel/setup.c | 24 ++++++++++++++++++++++++ arch/x86/mm/init.c | 19 ------------------- 2 files changed, 24 insertions(+), 19 deletions(-) diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index fc0fe743f3a1..635185ba4435 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -293,10 +293,32 @@ static void __init init_gbpages(void) else direct_gbpages = 0; } + +static void __init cleanup_highmap_brk_end(void) +{ + pud_t *pud; + pmd_t *pmd; + + mmu_cr4_features = read_cr4(); + + /* + * _brk_end cannot change anymore, but it and _end may be + * located on different 2M pages. cleanup_highmap(), however, + * can only consider _end when it runs, so destroy any + * mappings beyond _brk_end here. + */ + pud = pud_offset(pgd_offset_k(_brk_end), _brk_end); + pmd = pmd_offset(pud, _brk_end - 1); + while (++pmd <= pmd_offset(pud, (unsigned long)_end - 1)) + pmd_clear(pmd); +} #else static inline void init_gbpages(void) { } +static inline void cleanup_highmap_brk_end(void) +{ +} #endif static void __init reserve_brk(void) @@ -307,6 +329,8 @@ static void __init reserve_brk(void) /* Mark brk area as locked down and no longer taking any new allocations */ _brk_start = 0; + + cleanup_highmap_brk_end(); } #ifdef CONFIG_BLK_DEV_INITRD diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c index fa6fe756d912..35ee75d9061a 100644 --- a/arch/x86/mm/init.c +++ b/arch/x86/mm/init.c @@ -270,25 +270,6 @@ unsigned long __init_refok init_memory_mapping(unsigned long start, load_cr3(swapper_pg_dir); #endif -#ifdef CONFIG_X86_64 - if (!after_bootmem && !start) { - pud_t *pud; - pmd_t *pmd; - - mmu_cr4_features = read_cr4(); - - /* - * _brk_end cannot change anymore, but it and _end may be - * located on different 2M pages. cleanup_highmap(), however, - * can only consider _end when it runs, so destroy any - * mappings beyond _brk_end here. - */ - pud = pud_offset(pgd_offset_k(_brk_end), _brk_end); - pmd = pmd_offset(pud, _brk_end - 1); - while (++pmd <= pmd_offset(pud, (unsigned long)_end - 1)) - pmd_clear(pmd); - } -#endif __flush_tlb_all(); if (!after_bootmem && e820_table_end > e820_table_start) From c9e358dfc4a8cb2227172ef77908c2e0ee17bcb9 Mon Sep 17 00:00:00 2001 From: Grant Likely Date: Fri, 21 Jan 2011 09:24:48 -0700 Subject: [PATCH 008/706] driver-core: remove conditionals around devicetree pointers Having conditionals around the of_match_table and the of_node pointers turns out to make driver code use ugly #ifdef blocks. Drop the conditionals and remove the #ifdef blocks from the affected drivers. Also tidy up minor whitespace issues within the same hunks.
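With the conditionals gone, a minimal OF-aware platform driver reduces to the pattern below. This is an illustrative sketch in the post-patch style, with made-up "acme" names; the point is that .of_match_table and dev.of_node are simply always available, with of_node being NULL when the device was not instantiated from the device tree.

	#include <linux/module.h>
	#include <linux/platform_device.h>
	#include <linux/of.h>

	static int acme_probe(struct platform_device *pdev)
	{
		/* pdev->dev.of_node exists unconditionally now. */
		if (pdev->dev.of_node)
			dev_info(&pdev->dev, "probed from device tree\n");
		return 0;
	}

	static const struct of_device_id acme_match[] = {
		{ .compatible = "acme,example", },
		{},
	};
	MODULE_DEVICE_TABLE(of, acme_match);

	static struct platform_driver acme_driver = {
		.probe = acme_probe,
		.driver = {
			.name = "acme-example",
			.owner = THIS_MODULE,
			.of_match_table = acme_match,	/* no #ifdef CONFIG_OF */
		},
	};

	static int __init acme_init(void)
	{
		return platform_driver_register(&acme_driver);
	}
	module_init(acme_init);

	static void __exit acme_exit(void)
	{
		platform_driver_unregister(&acme_driver);
	}
	module_exit(acme_exit);

	MODULE_LICENSE("GPL");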
Signed-off-by: Grant Likely Acked-by: Greg Kroah-Hartman --- drivers/i2c/busses/i2c-ocores.c | 14 +++----------- drivers/i2c/i2c-core.c | 2 -- drivers/mmc/host/mmc_spi.c | 4 ---- drivers/net/ethoc.c | 8 +------- drivers/spi/pxa2xx_spi.c | 2 -- drivers/spi/pxa2xx_spi_pci.c | 2 -- drivers/spi/xilinx_spi.c | 6 ------ include/linux/device.h | 7 ++----- include/linux/i2c.h | 2 -- include/linux/of.h | 4 ++-- 10 files changed, 8 insertions(+), 43 deletions(-) diff --git a/drivers/i2c/busses/i2c-ocores.c b/drivers/i2c/busses/i2c-ocores.c index ef3bcb1ce864..c2ef5ce1f1ff 100644 --- a/drivers/i2c/busses/i2c-ocores.c +++ b/drivers/i2c/busses/i2c-ocores.c @@ -330,9 +330,7 @@ static int __devinit ocores_i2c_probe(struct platform_device *pdev) i2c->adap = ocores_adapter; i2c_set_adapdata(&i2c->adap, i2c); i2c->adap.dev.parent = &pdev->dev; -#ifdef CONFIG_OF i2c->adap.dev.of_node = pdev->dev.of_node; -#endif /* add i2c adapter to i2c tree */ ret = i2c_add_adapter(&i2c->adap); @@ -390,15 +388,11 @@ static int ocores_i2c_resume(struct platform_device *pdev) #define ocores_i2c_resume NULL #endif -#ifdef CONFIG_OF static struct of_device_id ocores_i2c_match[] = { - { - .compatible = "opencores,i2c-ocores", - }, - {}, + { .compatible = "opencores,i2c-ocores", }, + {}, }; MODULE_DEVICE_TABLE(of, ocores_i2c_match); -#endif /* work with hotplug and coldplug */ MODULE_ALIAS("platform:ocores-i2c"); @@ -411,9 +405,7 @@ static struct platform_driver ocores_i2c_driver = { .driver = { .owner = THIS_MODULE, .name = "ocores-i2c", -#ifdef CONFIG_OF - .of_match_table = ocores_i2c_match, -#endif + .of_match_table = ocores_i2c_match, }, }; diff --git a/drivers/i2c/i2c-core.c b/drivers/i2c/i2c-core.c index f0bd5bcdf563..045ba6efea48 100644 --- a/drivers/i2c/i2c-core.c +++ b/drivers/i2c/i2c-core.c @@ -537,9 +537,7 @@ i2c_new_device(struct i2c_adapter *adap, struct i2c_board_info const *info) client->dev.parent = &client->adapter->dev; client->dev.bus = &i2c_bus_type; client->dev.type = &i2c_client_type; -#ifdef CONFIG_OF client->dev.of_node = info->of_node; -#endif dev_set_name(&client->dev, "%d-%04x", i2c_adapter_id(adap), client->addr); diff --git a/drivers/mmc/host/mmc_spi.c b/drivers/mmc/host/mmc_spi.c index fd877f633dd2..2f7fc0c5146f 100644 --- a/drivers/mmc/host/mmc_spi.c +++ b/drivers/mmc/host/mmc_spi.c @@ -1516,21 +1516,17 @@ static int __devexit mmc_spi_remove(struct spi_device *spi) return 0; } -#if defined(CONFIG_OF) static struct of_device_id mmc_spi_of_match_table[] __devinitdata = { { .compatible = "mmc-spi-slot", }, {}, }; -#endif static struct spi_driver mmc_spi_driver = { .driver = { .name = "mmc_spi", .bus = &spi_bus_type, .owner = THIS_MODULE, -#if defined(CONFIG_OF) .of_match_table = mmc_spi_of_match_table, -#endif }, .probe = mmc_spi_probe, .remove = __devexit_p(mmc_spi_remove), diff --git a/drivers/net/ethoc.c b/drivers/net/ethoc.c index b79d7e1555d5..db0290f05bdf 100644 --- a/drivers/net/ethoc.c +++ b/drivers/net/ethoc.c @@ -1163,15 +1163,11 @@ static int ethoc_resume(struct platform_device *pdev) # define ethoc_resume NULL #endif -#ifdef CONFIG_OF static struct of_device_id ethoc_match[] = { - { - .compatible = "opencores,ethoc", - }, + { .compatible = "opencores,ethoc", }, {}, }; MODULE_DEVICE_TABLE(of, ethoc_match); -#endif static struct platform_driver ethoc_driver = { .probe = ethoc_probe, @@ -1181,9 +1177,7 @@ static struct platform_driver ethoc_driver = { .driver = { .name = "ethoc", .owner = THIS_MODULE, -#ifdef CONFIG_OF .of_match_table = ethoc_match, -#endif }, }; diff --git 
a/drivers/spi/pxa2xx_spi.c b/drivers/spi/pxa2xx_spi.c index 95928833855b..a429b01d0285 100644 --- a/drivers/spi/pxa2xx_spi.c +++ b/drivers/spi/pxa2xx_spi.c @@ -1557,9 +1557,7 @@ static int __devinit pxa2xx_spi_probe(struct platform_device *pdev) drv_data->ssp = ssp; master->dev.parent = &pdev->dev; -#ifdef CONFIG_OF master->dev.of_node = pdev->dev.of_node; -#endif /* the spi->mode bits understood by this driver: */ master->mode_bits = SPI_CPOL | SPI_CPHA | SPI_CS_HIGH; diff --git a/drivers/spi/pxa2xx_spi_pci.c b/drivers/spi/pxa2xx_spi_pci.c index 351d8a375b57..b6589bb3a6c3 100644 --- a/drivers/spi/pxa2xx_spi_pci.c +++ b/drivers/spi/pxa2xx_spi_pci.c @@ -98,9 +98,7 @@ static int __devinit ce4100_spi_probe(struct pci_dev *dev, pdev->dev.parent = &dev->dev; pdev->dev.platform_data = &spi_info->spi_pdata; -#ifdef CONFIG_OF pdev->dev.of_node = dev->dev.of_node; -#endif pdev->dev.release = plat_dev_release; spi_pdata->num_chipselect = dev->devfn; diff --git a/drivers/spi/xilinx_spi.c b/drivers/spi/xilinx_spi.c index 7adaef62a991..4d2c75df886c 100644 --- a/drivers/spi/xilinx_spi.c +++ b/drivers/spi/xilinx_spi.c @@ -351,14 +351,12 @@ static irqreturn_t xilinx_spi_irq(int irq, void *dev_id) return IRQ_HANDLED; } -#ifdef CONFIG_OF static const struct of_device_id xilinx_spi_of_match[] = { { .compatible = "xlnx,xps-spi-2.00.a", }, { .compatible = "xlnx,xps-spi-2.00.b", }, {} }; MODULE_DEVICE_TABLE(of, xilinx_spi_of_match); -#endif struct spi_master *xilinx_spi_init(struct device *dev, struct resource *mem, u32 irq, s16 bus_num, int num_cs, int little_endian, int bits_per_word) @@ -394,9 +392,7 @@ struct spi_master *xilinx_spi_init(struct device *dev, struct resource *mem, master->bus_num = bus_num; master->num_chipselect = num_cs; -#ifdef CONFIG_OF master->dev.of_node = dev->of_node; -#endif xspi->mem = *mem; xspi->irq = irq; @@ -539,9 +535,7 @@ static struct platform_driver xilinx_spi_driver = { .driver = { .name = XILINX_SPI_NAME, .owner = THIS_MODULE, -#ifdef CONFIG_OF .of_match_table = xilinx_spi_of_match, -#endif }, }; diff --git a/include/linux/device.h b/include/linux/device.h index 1bf5cf0b4513..ca5d25225aab 100644 --- a/include/linux/device.h +++ b/include/linux/device.h @@ -128,9 +128,7 @@ struct device_driver { bool suppress_bind_attrs; /* disables bind/unbind via sysfs */ -#if defined(CONFIG_OF) const struct of_device_id *of_match_table; -#endif int (*probe) (struct device *dev); int (*remove) (struct device *dev); @@ -441,9 +439,8 @@ struct device { override */ /* arch specific additions */ struct dev_archdata archdata; -#ifdef CONFIG_OF - struct device_node *of_node; -#endif + + struct device_node *of_node; /* associated device tree node */ dev_t devt; /* dev_t, creates the sysfs "dev" */ diff --git a/include/linux/i2c.h b/include/linux/i2c.h index 903576df88dc..06a8d9c7de98 100644 --- a/include/linux/i2c.h +++ b/include/linux/i2c.h @@ -258,9 +258,7 @@ struct i2c_board_info { unsigned short addr; void *platform_data; struct dev_archdata *archdata; -#ifdef CONFIG_OF struct device_node *of_node; -#endif int irq; }; diff --git a/include/linux/of.h b/include/linux/of.h index cad7cf0ab278..d9dd664a6a9c 100644 --- a/include/linux/of.h +++ b/include/linux/of.h @@ -23,8 +23,6 @@ #include -#ifdef CONFIG_OF - typedef u32 phandle; typedef u32 ihandle; @@ -65,6 +63,8 @@ struct device_node { #endif }; +#ifdef CONFIG_OF + /* Pointer for first entry in chain of all nodes. 
*/ extern struct device_node *allnodes; extern struct device_node *of_chosen; From cd7eab44e9946c28d595abe3e9a43e945bc49141 Mon Sep 17 00:00:00 2001 From: Ben Hutchings Date: Wed, 19 Jan 2011 21:01:44 +0000 Subject: [PATCH 009/706] genirq: Add IRQ affinity notifiers When initiating I/O on a multiqueue and multi-IRQ device, we may want to select a queue for which the response will be handled on the same or a nearby CPU. This requires a reverse-map of IRQ affinity. Add a notification mechanism to support this. This is based closely on work by Thomas Gleixner.
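A sketch of how a driver might consume this interface (hypothetical acme_* names): the caller supplies only the two callbacks, since the irq, kref and work fields are initialised by irq_set_affinity_notifier() itself, and the release callback is where the last reference is dropped.

	#include <linux/interrupt.h>
	#include <linux/slab.h>

	/* Driver-private wrapper: embed the notify context so the callbacks
	 * can recover driver state with container_of(). */
	struct acme_irq_map {
		struct irq_affinity_notify notify;
		/* e.g. a cpu -> queue reverse map would live here */
	};

	static void acme_affinity_changed(struct irq_affinity_notify *notify,
					  const cpumask_t *mask)
	{
		struct acme_irq_map *map =
			container_of(notify, struct acme_irq_map, notify);
		/* runs in process context: rebuild the reverse map for 'mask' */
		(void)map;
	}

	static void acme_affinity_release(struct kref *ref)
	{
		struct irq_affinity_notify *notify =
			container_of(ref, struct irq_affinity_notify, kref);
		kfree(container_of(notify, struct acme_irq_map, notify));
	}

	static int acme_watch_irq(unsigned int irq)
	{
		struct acme_irq_map *map = kzalloc(sizeof(*map), GFP_KERNEL);

		if (!map)
			return -ENOMEM;
		map->notify.notify = acme_affinity_changed;
		map->notify.release = acme_affinity_release;
		/* irq, kref and work are filled in by the core */
		return irq_set_affinity_notifier(irq, &map->notify);
	}

Teardown would be irq_set_affinity_notifier(irq, NULL), then irq_run_affinity_notifiers(), before the final free_irq(); at that point release() frees the wrapper.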
Signed-off-by: Ben Hutchings Cc: linux-net-drivers@solarflare.com Cc: Tom Herbert Cc: David Miller LKML-Reference: <1295470904.11126.84.camel@bwh-desktop> Signed-off-by: Thomas Gleixner --- include/linux/interrupt.h | 33 +++++++++++++++- include/linux/irqdesc.h | 3 ++ kernel/irq/manage.c | 82 +++++++++++++++++++++++++++++++++++++++ 3 files changed, 117 insertions(+), 1 deletion(-) diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h index 55e0d4253e49..63c5ad78e37c 100644 --- a/include/linux/interrupt.h +++ b/include/linux/interrupt.h @@ -14,6 +14,8 @@ #include #include #include +#include <linux/kref.h> +#include <linux/workqueue.h> #include #include @@ -240,6 +242,35 @@ extern int irq_can_set_affinity(unsigned int irq); extern int irq_select_affinity(unsigned int irq); extern int irq_set_affinity_hint(unsigned int irq, const struct cpumask *m); + +/** + * struct irq_affinity_notify - context for notification of IRQ affinity changes + * @irq: Interrupt to which notification applies + * @kref: Reference count, for internal use + * @work: Work item, for internal use + * @notify: Function to be called on change. This will be + * called in process context. + * @release: Function to be called on release. This will be + * called in process context. Once registered, the + * structure must only be freed when this function is + * called or later. + */ +struct irq_affinity_notify { + unsigned int irq; + struct kref kref; + struct work_struct work; + void (*notify)(struct irq_affinity_notify *, const cpumask_t *mask); + void (*release)(struct kref *ref); +}; + +extern int +irq_set_affinity_notifier(unsigned int irq, struct irq_affinity_notify *notify); + +static inline void irq_run_affinity_notifiers(void) +{ + flush_scheduled_work(); +} + #else /* CONFIG_SMP */ static inline int irq_set_affinity(unsigned int irq, const struct cpumask *m) @@ -255,7 +286,7 @@ static inline int irq_can_set_affinity(unsigned int irq) static inline int irq_select_affinity(unsigned int irq) { return 0; } static inline int irq_set_affinity_hint(unsigned int irq, - const struct cpumask *m) + const struct cpumask *m) { return -EINVAL; } diff --git a/include/linux/irqdesc.h b/include/linux/irqdesc.h index c1a95b7b58de..bfef56dadddb 100644 --- a/include/linux/irqdesc.h +++ b/include/linux/irqdesc.h @@ -8,6 +8,7 @@ * For now it's included from */ +struct irq_affinity_notify; struct proc_dir_entry; struct timer_rand_state; /** @@ -24,6 +25,7 @@ struct timer_rand_state; * @last_unhandled: aging timer for unhandled count * @irqs_unhandled: stats field for spurious unhandled interrupts * @lock: locking for SMP + * @affinity_notify: context for notification of affinity changes * @pending_mask: pending rebalanced interrupts * @threads_active: number of irqaction threads currently running * @wait_for_threads: wait queue for sync_irq to wait for threaded handlers @@ -70,6 +72,7 @@ struct irq_desc { raw_spinlock_t lock; #ifdef CONFIG_SMP const struct cpumask *affinity_hint; + struct irq_affinity_notify *affinity_notify; #ifdef CONFIG_GENERIC_PENDING_IRQ cpumask_var_t pending_mask; #endif diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 0caa59f747dd..0587c5ceaed8 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -134,6 +134,10 @@ int irq_set_affinity(unsigned int irq, const struct cpumask *cpumask) irq_set_thread_affinity(desc); } #endif + if (desc->affinity_notify) { + kref_get(&desc->affinity_notify->kref); + schedule_work(&desc->affinity_notify->work); + } desc->status |= IRQ_AFFINITY_SET; raw_spin_unlock_irqrestore(&desc->lock, flags); return 0; @@ -155,6 +159,79 @@ int irq_set_affinity_hint(unsigned int irq, const struct cpumask *m) } EXPORT_SYMBOL_GPL(irq_set_affinity_hint); +static void irq_affinity_notify(struct work_struct *work) +{ + struct irq_affinity_notify *notify = + container_of(work, struct irq_affinity_notify, work); + struct irq_desc *desc = irq_to_desc(notify->irq); + cpumask_var_t cpumask; + unsigned long flags; + + if (!desc) + goto out; + + if (!alloc_cpumask_var(&cpumask, GFP_KERNEL)) + goto out; + + raw_spin_lock_irqsave(&desc->lock, flags); +#ifdef CONFIG_GENERIC_PENDING_IRQ + if (desc->status & IRQ_MOVE_PENDING) + cpumask_copy(cpumask, desc->pending_mask); + else +#endif + cpumask_copy(cpumask, desc->affinity); + raw_spin_unlock_irqrestore(&desc->lock, flags); + + notify->notify(notify, cpumask); + + free_cpumask_var(cpumask); +out: + kref_put(&notify->kref, notify->release); +} + +/** + * irq_set_affinity_notifier - control notification of IRQ affinity changes + * @irq: Interrupt for which to enable/disable notification + * @notify: Context for notification, or %NULL to disable + * notification. Function pointers must be initialised; + * the other fields will be initialised by this function. + * + * Must be called in process context. 
Notification may only be enabled + * after the IRQ is allocated and must be disabled before the IRQ is + * freed using free_irq(). + */ +int +irq_set_affinity_notifier(unsigned int irq, struct irq_affinity_notify *notify) +{ + struct irq_desc *desc = irq_to_desc(irq); + struct irq_affinity_notify *old_notify; + unsigned long flags; + + /* The release function is promised process context */ + might_sleep(); + + if (!desc) + return -EINVAL; + + /* Complete initialisation of *notify */ + if (notify) { + notify->irq = irq; + kref_init(&notify->kref); + INIT_WORK(&notify->work, irq_affinity_notify); + } + + raw_spin_lock_irqsave(&desc->lock, flags); + old_notify = desc->affinity_notify; + desc->affinity_notify = notify; + raw_spin_unlock_irqrestore(&desc->lock, flags); + + if (old_notify) + kref_put(&old_notify->kref, old_notify->release); + + return 0; +} +EXPORT_SYMBOL_GPL(irq_set_affinity_notifier); + #ifndef CONFIG_AUTO_IRQ_AFFINITY /* * Generic version of the affinity autoselector. @@ -1004,6 +1081,11 @@ void free_irq(unsigned int irq, void *dev_id) if (!desc) return; +#ifdef CONFIG_SMP + if (WARN_ON(desc->affinity_notify)) + desc->affinity_notify = NULL; +#endif + chip_bus_lock(desc); kfree(__free_irq(irq, dev_id)); chip_bus_sync_unlock(desc); From 361c99a661a78ed22264649440e87fe4fe8da1f2 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Tue, 11 Jan 2011 20:56:53 -0200 Subject: [PATCH 010/706] perf evsel: Introduce perf_evlist Killing two more perf-wide global variables: nr_counters and evsel_list as a list_head. There are more operations that will need more fields in perf_evlist, like the pollfd for polling all the fds in a list of evsel instances. Use option->value to pass the evsel_list to parse_{events,filters}. LKML-Reference: Cc: Frederic Weisbecker Cc: Ingo Molnar Cc: Mike Galbraith Cc: Paul Mackerras Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Tom Zanussi Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/Makefile | 2 + tools/perf/builtin-record.c | 47 +++++++++++++--------- tools/perf/builtin-stat.c | 34 +++++++++------- tools/perf/builtin-top.c | 60 ++++++++++++++++------------ tools/perf/util/evlist.c | 53 ++++++++++++++++++++++++ tools/perf/util/evlist.h | 19 +++++++++ tools/perf/util/header.c | 17 ++++---- tools/perf/util/header.h | 7 +++- tools/perf/util/include/linux/list.h | 1 + tools/perf/util/parse-events.c | 51 +++++------------------ tools/perf/util/parse-events.h | 7 ---- 11 files changed, 180 insertions(+), 118 deletions(-) create mode 100644 tools/perf/util/evlist.c create mode 100644 tools/perf/util/evlist.h diff --git a/tools/perf/Makefile b/tools/perf/Makefile index 7141c42e1469..f20bc6f85611 100644 --- a/tools/perf/Makefile +++ b/tools/perf/Makefile @@ -402,6 +402,7 @@ LIB_H += util/debug.h LIB_H += util/debugfs.h LIB_H += util/event.h LIB_H += util/evsel.h +LIB_H += util/evlist.h LIB_H += util/exec_cmd.h LIB_H += util/types.h LIB_H += util/levenshtein.h @@ -440,6 +441,7 @@ LIB_OBJS += $(OUTPUT)util/ctype.o LIB_OBJS += $(OUTPUT)util/debugfs.o LIB_OBJS += $(OUTPUT)util/environment.o LIB_OBJS += $(OUTPUT)util/event.o +LIB_OBJS += $(OUTPUT)util/evlist.o LIB_OBJS += $(OUTPUT)util/evsel.o LIB_OBJS += $(OUTPUT)util/exec_cmd.o LIB_OBJS += $(OUTPUT)util/help.o diff --git a/tools/perf/builtin-record.c b/tools/perf/builtin-record.c index b2f729fdb317..252ace873d32 100644 --- a/tools/perf/builtin-record.c +++ b/tools/perf/builtin-record.c @@ -18,6 +18,7 @@ #include "util/header.h" #include "util/event.h" +#include "util/evlist.h" #include "util/evsel.h" #include 
"util/debug.h" #include "util/session.h" @@ -66,6 +67,7 @@ static bool sample_address = false; static bool sample_time = false; static bool no_buildid = false; static bool no_buildid_cache = false; +static struct perf_evlist *evsel_list; static long samples = 0; static u64 bytes_written = 0; @@ -229,7 +231,8 @@ static struct perf_header_attr *get_header_attr(struct perf_event_attr *a, int n return h_attr; } -static void create_counter(struct perf_evsel *evsel, int cpu) +static void create_counter(struct perf_evlist *evlist, + struct perf_evsel *evsel, int cpu) { char *filter = evsel->filter; struct perf_event_attr *attr = &evsel->attr; @@ -263,7 +266,7 @@ static void create_counter(struct perf_evsel *evsel, int cpu) attr->sample_type |= PERF_SAMPLE_IP | PERF_SAMPLE_TID; - if (nr_counters > 1) + if (evlist->nr_entries > 1) attr->sample_type |= PERF_SAMPLE_ID; /* @@ -410,7 +413,7 @@ try_again: if (evsel->idx || thread_index) { struct perf_evsel *first; - first = list_entry(evsel_list.next, struct perf_evsel, node); + first = list_entry(evlist->entries.next, struct perf_evsel, node); ret = ioctl(FD(evsel, nr_cpu, thread_index), PERF_EVENT_IOC_SET_OUTPUT, FD(first, nr_cpu, 0)); @@ -449,14 +452,14 @@ try_again: sample_type = attr->sample_type; } -static void open_counters(int cpu) +static void open_counters(struct perf_evlist *evlist, int cpu) { struct perf_evsel *pos; group_fd = -1; - list_for_each_entry(pos, &evsel_list, node) - create_counter(pos, cpu); + list_for_each_entry(pos, &evlist->entries, node) + create_counter(evlist, pos, cpu); nr_cpu++; } @@ -481,9 +484,9 @@ static void atexit_header(void) if (!no_buildid) process_buildids(); - perf_header__write(&session->header, output, true); + perf_header__write(&session->header, evsel_list, output, true); perf_session__delete(session); - perf_evsel_list__delete(); + perf_evlist__delete(evsel_list); symbol__exit(); } } @@ -611,7 +614,7 @@ static int __cmd_record(int argc, const char **argv) goto out_delete_session; } - if (have_tracepoints(&evsel_list)) + if (have_tracepoints(&evsel_list->entries)) perf_header__set_feat(&session->header, HEADER_TRACE_INFO); /* @@ -674,10 +677,10 @@ static int __cmd_record(int argc, const char **argv) } if (!system_wide && no_inherit && !cpu_list) { - open_counters(-1); + open_counters(evsel_list, -1); } else { for (i = 0; i < cpus->nr; i++) - open_counters(cpus->map[i]); + open_counters(evsel_list, cpus->map[i]); } perf_session__set_sample_type(session, sample_type); @@ -687,7 +690,8 @@ static int __cmd_record(int argc, const char **argv) if (err < 0) return err; } else if (file_new) { - err = perf_header__write(&session->header, output, false); + err = perf_header__write(&session->header, evsel_list, + output, false); if (err < 0) return err; } @@ -712,7 +716,7 @@ static int __cmd_record(int argc, const char **argv) return err; } - if (have_tracepoints(&evsel_list)) { + if (have_tracepoints(&evsel_list->entries)) { /* * FIXME err <= 0 here actually means that * there were no tracepoints so its not really @@ -721,7 +725,7 @@ static int __cmd_record(int argc, const char **argv) * return this more properly and also * propagate errors that now are calling die() */ - err = event__synthesize_tracing_data(output, &evsel_list, + err = event__synthesize_tracing_data(output, evsel_list, process_synthesized_event, session); if (err <= 0) { @@ -797,7 +801,7 @@ static int __cmd_record(int argc, const char **argv) for (i = 0; i < nr_cpu; i++) { struct perf_evsel *pos; - list_for_each_entry(pos, &evsel_list, node) { + 
list_for_each_entry(pos, &evsel_list->entries, node) { for (thread = 0; thread < threads->nr; thread++) @@ -838,10 +842,10 @@ static const char * const record_usage[] = { static bool force, append_file; const struct option record_options[] = { - OPT_CALLBACK('e', "event", NULL, "event", + OPT_CALLBACK('e', "event", &evsel_list, "event", "event selector. use 'perf list' to list available events", parse_events), - OPT_CALLBACK(0, "filter", NULL, "filter", + OPT_CALLBACK(0, "filter", &evsel_list, "filter", "event filter", parse_filter), OPT_INTEGER('p', "pid", &target_pid, "record events on existing process id"), @@ -892,6 +896,10 @@ int cmd_record(int argc, const char **argv, const char *prefix __used) int err = -ENOMEM; struct perf_evsel *pos; + evsel_list = perf_evlist__new(); + if (evsel_list == NULL) + return -ENOMEM; + argc = parse_options(argc, argv, record_options, record_usage, PARSE_OPT_STOP_AT_NON_OPTION); if (!argc && target_pid == -1 && target_tid == -1 && @@ -913,7 +921,8 @@ int cmd_record(int argc, const char **argv, const char *prefix __used) if (no_buildid_cache || no_buildid) disable_buildid_cache(); - if (list_empty(&evsel_list) && perf_evsel_list__create_default() < 0) { + if (evsel_list->nr_entries == 0 && + perf_evlist__add_default(evsel_list) < 0) { pr_err("Not enough memory for event selector list\n"); goto out_symbol_exit; } @@ -933,7 +942,7 @@ int cmd_record(int argc, const char **argv, const char *prefix __used) return -1; } - list_for_each_entry(pos, &evsel_list, node) { + list_for_each_entry(pos, &evsel_list->entries, node) { if (perf_evsel__alloc_fd(pos, cpus->nr, threads->nr) < 0) goto out_free_fd; if (perf_header__push_event(pos->attr.config, event_name(pos))) diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c index a482a191a0ca..da9090245934 100644 --- a/tools/perf/builtin-stat.c +++ b/tools/perf/builtin-stat.c @@ -43,6 +43,7 @@ #include "util/parse-options.h" #include "util/parse-events.h" #include "util/event.h" +#include "util/evlist.h" #include "util/evsel.h" #include "util/debug.h" #include "util/header.h" @@ -71,6 +72,8 @@ static struct perf_event_attr default_attrs[] = { }; +struct perf_evlist *evsel_list; + static bool system_wide = false; static struct cpu_map *cpus; static int run_idx = 0; @@ -309,7 +312,7 @@ static int run_perf_stat(int argc __used, const char **argv) close(child_ready_pipe[0]); } - list_for_each_entry(counter, &evsel_list, node) { + list_for_each_entry(counter, &evsel_list->entries, node) { if (create_perf_stat_counter(counter) < 0) { if (errno == -EPERM || errno == -EACCES) { error("You may not have permission to collect %sstats.\n" @@ -347,12 +350,12 @@ static int run_perf_stat(int argc __used, const char **argv) update_stats(&walltime_nsecs_stats, t1 - t0); if (no_aggr) { - list_for_each_entry(counter, &evsel_list, node) { + list_for_each_entry(counter, &evsel_list->entries, node) { read_counter(counter); perf_evsel__close_fd(counter, cpus->nr, 1); } } else { - list_for_each_entry(counter, &evsel_list, node) { + list_for_each_entry(counter, &evsel_list->entries, node) { read_counter_aggr(counter); perf_evsel__close_fd(counter, cpus->nr, threads->nr); } @@ -555,10 +558,10 @@ static void print_stat(int argc, const char **argv) } if (no_aggr) { - list_for_each_entry(counter, &evsel_list, node) + list_for_each_entry(counter, &evsel_list->entries, node) print_counter(counter); } else { - list_for_each_entry(counter, &evsel_list, node) + list_for_each_entry(counter, &evsel_list->entries, node) 
print_counter_aggr(counter); } @@ -610,7 +613,7 @@ static int stat__set_big_num(const struct option *opt __used, } static const struct option options[] = { - OPT_CALLBACK('e', "event", NULL, "event", + OPT_CALLBACK('e', "event", &evsel_list, "event", "event selector. use 'perf list' to list available events", parse_events), OPT_BOOLEAN('i', "no-inherit", &no_inherit, @@ -648,6 +651,10 @@ int cmd_stat(int argc, const char **argv, const char *prefix __used) setlocale(LC_ALL, ""); + evsel_list = perf_evlist__new(); + if (evsel_list == NULL) + return -ENOMEM; + argc = parse_options(argc, argv, options, stat_usage, PARSE_OPT_STOP_AT_NON_OPTION); @@ -679,17 +686,14 @@ int cmd_stat(int argc, const char **argv, const char *prefix __used) usage_with_options(stat_usage, options); /* Set attrs and nr_counters if no event is selected and !null_run */ - if (!null_run && !nr_counters) { + if (!null_run && !evsel_list->nr_entries) { size_t c; - nr_counters = ARRAY_SIZE(default_attrs); - for (c = 0; c < ARRAY_SIZE(default_attrs); ++c) { - pos = perf_evsel__new(&default_attrs[c], - nr_counters); + pos = perf_evsel__new(&default_attrs[c], c); if (pos == NULL) goto out; - list_add(&pos->node, &evsel_list); + perf_evlist__add(evsel_list, pos); } } @@ -713,7 +717,7 @@ int cmd_stat(int argc, const char **argv, const char *prefix __used) return -1; } - list_for_each_entry(pos, &evsel_list, node) { + list_for_each_entry(pos, &evsel_list->entries, node) { if (perf_evsel__alloc_stat_priv(pos) < 0 || perf_evsel__alloc_counts(pos, cpus->nr) < 0 || perf_evsel__alloc_fd(pos, cpus->nr, threads->nr) < 0) @@ -741,9 +745,9 @@ int cmd_stat(int argc, const char **argv, const char *prefix __used) if (status != -1) print_stat(argc, argv); out_free_fd: - list_for_each_entry(pos, &evsel_list, node) + list_for_each_entry(pos, &evsel_list->entries, node) perf_evsel__free_stat_priv(pos); - perf_evsel_list__delete(); + perf_evlist__delete(evsel_list); out: thread_map__delete(threads); threads = NULL; diff --git a/tools/perf/builtin-top.c b/tools/perf/builtin-top.c index b6998e055767..216b62ed4b89 100644 --- a/tools/perf/builtin-top.c +++ b/tools/perf/builtin-top.c @@ -21,6 +21,7 @@ #include "perf.h" #include "util/color.h" +#include "util/evlist.h" #include "util/evsel.h" #include "util/session.h" #include "util/symbol.h" @@ -60,6 +61,8 @@ #define FD(e, x, y) (*(int *)xyarray__entry(e->fd, x, y)) +struct perf_evlist *evsel_list; + static bool system_wide = false; static int default_interval = 0; @@ -267,7 +270,7 @@ static void __zero_source_counters(struct sym_entry *syme) line = syme->src->lines; while (line) { - for (i = 0; i < nr_counters; i++) + for (i = 0; i < evsel_list->nr_entries; i++) line->count[i] = 0; line = line->next; } @@ -414,7 +417,7 @@ static double sym_weight(const struct sym_entry *sym) if (!display_weighted) return weight; - for (counter = 1; counter < nr_counters-1; counter++) + for (counter = 1; counter < evsel_list->nr_entries - 1; counter++) weight *= sym->count[counter]; weight /= (sym->count[counter] + 1); @@ -501,7 +504,7 @@ static void print_sym_table(void) rb_insert_active_sym(&tmp, syme); sum_ksamples += syme->snap_count; - for (j = 0; j < nr_counters; j++) + for (j = 0; j < evsel_list->nr_entries; j++) syme->count[j] = zero ? 
0 : syme->count[j] * 7 / 8; } else list_remove_active_sym(syme); @@ -535,9 +538,9 @@ static void print_sym_table(void) esamples_percent); } - if (nr_counters == 1 || !display_weighted) { + if (evsel_list->nr_entries == 1 || !display_weighted) { struct perf_evsel *first; - first = list_entry(evsel_list.next, struct perf_evsel, node); + first = list_entry(evsel_list->entries.next, struct perf_evsel, node); printf("%" PRIu64, (uint64_t)first->attr.sample_period); if (freq) printf("Hz "); @@ -547,7 +550,7 @@ static void print_sym_table(void) if (!display_weighted) printf("%s", event_name(sym_evsel)); - else list_for_each_entry(counter, &evsel_list, node) { + else list_for_each_entry(counter, &evsel_list->entries, node) { if (counter->idx) printf("/"); @@ -606,7 +609,7 @@ static void print_sym_table(void) sym_width = winsize.ws_col - dso_width - 29; } putchar('\n'); - if (nr_counters == 1) + if (evsel_list->nr_entries == 1) printf(" samples pcnt"); else printf(" weight samples pcnt"); @@ -615,7 +618,7 @@ static void print_sym_table(void) printf(" RIP "); printf(" %-*.*s DSO\n", sym_width, sym_width, "function"); printf(" %s _______ _____", - nr_counters == 1 ? " " : "______"); + evsel_list->nr_entries == 1 ? " " : "______"); if (verbose) printf(" ________________"); printf(" %-*.*s", sym_width, sym_width, graph_line); @@ -634,7 +637,7 @@ static void print_sym_table(void) pcnt = 100.0 - (100.0 * ((sum_ksamples - syme->snap_count) / sum_ksamples)); - if (nr_counters == 1 || !display_weighted) + if (evsel_list->nr_entries == 1 || !display_weighted) printf("%20.2f ", syme->weight); else printf("%9.1f %10ld ", syme->weight, syme->snap_count); @@ -744,7 +747,7 @@ static void print_mapped_keys(void) fprintf(stdout, "\t[d] display refresh delay. \t(%d)\n", delay_secs); fprintf(stdout, "\t[e] display entries (lines). \t(%d)\n", print_entries); - if (nr_counters > 1) + if (evsel_list->nr_entries > 1) fprintf(stdout, "\t[E] active event counter. \t(%s)\n", event_name(sym_evsel)); fprintf(stdout, "\t[f] profile display filter (count). \t(%d)\n", count_filter); @@ -753,7 +756,7 @@ static void print_mapped_keys(void) fprintf(stdout, "\t[s] annotate symbol. \t(%s)\n", name?: "NULL"); fprintf(stdout, "\t[S] stop annotation.\n"); - if (nr_counters > 1) + if (evsel_list->nr_entries > 1) fprintf(stdout, "\t[w] toggle display weighted/count[E]r. \t(%d)\n", display_weighted ? 1 : 0); fprintf(stdout, @@ -783,7 +786,7 @@ static int key_mapped(int c) return 1; case 'E': case 'w': - return nr_counters > 1 ? 1 : 0; + return evsel_list->nr_entries > 1 ? 
1 : 0; default: break; } @@ -831,22 +834,22 @@ static void handle_keypress(struct perf_session *session, int c) signal(SIGWINCH, SIG_DFL); break; case 'E': - if (nr_counters > 1) { + if (evsel_list->nr_entries > 1) { fprintf(stderr, "\nAvailable events:"); - list_for_each_entry(sym_evsel, &evsel_list, node) + list_for_each_entry(sym_evsel, &evsel_list->entries, node) fprintf(stderr, "\n\t%d %s", sym_evsel->idx, event_name(sym_evsel)); prompt_integer(&sym_counter, "Enter details event counter"); - if (sym_counter >= nr_counters) { - sym_evsel = list_entry(evsel_list.next, struct perf_evsel, node); + if (sym_counter >= evsel_list->nr_entries) { + sym_evsel = list_entry(evsel_list->entries.next, struct perf_evsel, node); sym_counter = 0; fprintf(stderr, "Sorry, no such event, using %s.\n", event_name(sym_evsel)); sleep(1); break; } - list_for_each_entry(sym_evsel, &evsel_list, node) + list_for_each_entry(sym_evsel, &evsel_list->entries, node) if (sym_evsel->idx == sym_counter) break; } else sym_counter = 0; @@ -1198,7 +1201,7 @@ static void perf_session__mmap_read(struct perf_session *self) int i, thread_index; for (i = 0; i < cpus->nr; i++) { - list_for_each_entry(counter, &evsel_list, node) { + list_for_each_entry(counter, &evsel_list->entries, node) { for (thread_index = 0; thread_index < threads->nr; thread_index++) { @@ -1312,7 +1315,7 @@ static int __cmd_top(void) for (i = 0; i < cpus->nr; i++) { group_fd = -1; - list_for_each_entry(counter, &evsel_list, node) + list_for_each_entry(counter, &evsel_list->entries, node) start_counter(i, counter); } @@ -1354,7 +1357,7 @@ static const char * const top_usage[] = { }; static const struct option options[] = { - OPT_CALLBACK('e', "event", NULL, "event", + OPT_CALLBACK('e', "event", &evsel_list, "event", "event selector. 
use 'perf list' to list available events", parse_events), OPT_INTEGER('c', "count", &default_interval, @@ -1404,6 +1407,10 @@ int cmd_top(int argc, const char **argv, const char *prefix __used) struct perf_evsel *pos; int status = -ENOMEM; + evsel_list = perf_evlist__new(); + if (evsel_list == NULL) + return -ENOMEM; + page_size = sysconf(_SC_PAGE_SIZE); argc = parse_options(argc, argv, options, top_usage, 0); @@ -1431,7 +1438,8 @@ int cmd_top(int argc, const char **argv, const char *prefix __used) cpu_list = NULL; } - if (!nr_counters && perf_evsel_list__create_default() < 0) { + if (!evsel_list->nr_entries && + perf_evlist__add_default(evsel_list) < 0) { pr_err("Not enough memory for event selector list\n"); return -ENOMEM; } @@ -1459,7 +1467,7 @@ int cmd_top(int argc, const char **argv, const char *prefix __used) if (cpus == NULL) usage_with_options(top_usage, options); - list_for_each_entry(pos, &evsel_list, node) { + list_for_each_entry(pos, &evsel_list->entries, node) { if (perf_evsel__alloc_mmap_per_thread(pos, cpus->nr, threads->nr) < 0 || perf_evsel__alloc_fd(pos, cpus->nr, threads->nr) < 0) goto out_free_fd; @@ -1472,10 +1480,10 @@ int cmd_top(int argc, const char **argv, const char *prefix __used) pos->attr.sample_period = default_interval; } - sym_evsel = list_entry(evsel_list.next, struct perf_evsel, node); + sym_evsel = list_entry(evsel_list->entries.next, struct perf_evsel, node); symbol_conf.priv_size = (sizeof(struct sym_entry) + - (nr_counters + 1) * sizeof(unsigned long)); + (evsel_list->nr_entries + 1) * sizeof(unsigned long)); symbol_conf.try_vmlinux_path = (symbol_conf.vmlinux_name == NULL); if (symbol__init() < 0) @@ -1489,9 +1497,9 @@ int cmd_top(int argc, const char **argv, const char *prefix __used) status = __cmd_top(); out_free_fd: - list_for_each_entry(pos, &evsel_list, node) + list_for_each_entry(pos, &evsel_list->entries, node) perf_evsel__free_mmap(pos); - perf_evsel_list__delete(); + perf_evlist__delete(evsel_list); return status; } diff --git a/tools/perf/util/evlist.c b/tools/perf/util/evlist.c new file mode 100644 index 000000000000..7b4faec23737 --- /dev/null +++ b/tools/perf/util/evlist.c @@ -0,0 +1,53 @@ +#include "evlist.h" +#include "evsel.h" +#include "util.h" + +struct perf_evlist *perf_evlist__new(void) +{ + struct perf_evlist *evlist = zalloc(sizeof(*evlist)); + + if (evlist != NULL) { + INIT_LIST_HEAD(&evlist->entries); + } + + return evlist; +} + +static void perf_evlist__purge(struct perf_evlist *evlist) +{ + struct perf_evsel *pos, *n; + + list_for_each_entry_safe(pos, n, &evlist->entries, node) { + list_del_init(&pos->node); + perf_evsel__delete(pos); + } + + evlist->nr_entries = 0; +} + +void perf_evlist__delete(struct perf_evlist *evlist) +{ + perf_evlist__purge(evlist); + free(evlist); +} + +void perf_evlist__add(struct perf_evlist *evlist, struct perf_evsel *entry) +{ + list_add_tail(&entry->node, &evlist->entries); + ++evlist->nr_entries; +} + +int perf_evlist__add_default(struct perf_evlist *evlist) +{ + struct perf_event_attr attr = { + .type = PERF_TYPE_HARDWARE, + .config = PERF_COUNT_HW_CPU_CYCLES, + }; + struct perf_evsel *evsel = perf_evsel__new(&attr, 0); + + if (evsel == NULL) + return -ENOMEM; + + perf_evlist__add(evlist, evsel); + return 0; +} diff --git a/tools/perf/util/evlist.h b/tools/perf/util/evlist.h new file mode 100644 index 000000000000..48db91a8abf3 --- /dev/null +++ b/tools/perf/util/evlist.h @@ -0,0 +1,19 @@ +#ifndef __PERF_EVLIST_H +#define __PERF_EVLIST_H 1 + +#include + +struct perf_evlist { + struct 
list_head entries; + int nr_entries; +}; + +struct perf_evsel; + +struct perf_evlist *perf_evlist__new(void); +void perf_evlist__delete(struct perf_evlist *evlist); + +void perf_evlist__add(struct perf_evlist *evlist, struct perf_evsel *entry); +int perf_evlist__add_default(struct perf_evlist *evlist); + +#endif /* __PERF_EVLIST_H */ diff --git a/tools/perf/util/header.c b/tools/perf/util/header.c index f6a929e74981..f0138d472339 100644 --- a/tools/perf/util/header.c +++ b/tools/perf/util/header.c @@ -8,6 +8,7 @@ #include #include +#include "evlist.h" #include "util.h" #include "header.h" #include "../perf.h" @@ -428,7 +429,8 @@ static bool perf_session__read_build_ids(struct perf_session *self, bool with_hi return ret; } -static int perf_header__adds_write(struct perf_header *self, int fd) +static int perf_header__adds_write(struct perf_header *self, + struct perf_evlist *evlist, int fd) { int nr_sections; struct perf_session *session; @@ -463,7 +465,7 @@ static int perf_header__adds_write(struct perf_header *self, int fd) /* Write trace info */ trace_sec->offset = lseek(fd, 0, SEEK_CUR); - read_tracing_data(fd, &evsel_list); + read_tracing_data(fd, &evlist->entries); trace_sec->size = lseek(fd, 0, SEEK_CUR) - trace_sec->offset; } @@ -513,7 +515,8 @@ int perf_header__write_pipe(int fd) return 0; } -int perf_header__write(struct perf_header *self, int fd, bool at_exit) +int perf_header__write(struct perf_header *self, struct perf_evlist *evlist, + int fd, bool at_exit) { struct perf_file_header f_header; struct perf_file_attr f_attr; @@ -566,7 +569,7 @@ int perf_header__write(struct perf_header *self, int fd, bool at_exit) self->data_offset = lseek(fd, 0, SEEK_CUR); if (at_exit) { - err = perf_header__adds_write(self, fd); + err = perf_header__adds_write(self, evlist, fd); if (err < 0) return err; } @@ -1133,7 +1136,7 @@ int event__process_event_type(event_t *self, return 0; } -int event__synthesize_tracing_data(int fd, struct list_head *pattrs, +int event__synthesize_tracing_data(int fd, struct perf_evlist *evlist, event__handler_t process, struct perf_session *session __unused) { @@ -1144,7 +1147,7 @@ int event__synthesize_tracing_data(int fd, struct list_head *pattrs, memset(&ev, 0, sizeof(ev)); ev.tracing_data.header.type = PERF_RECORD_HEADER_TRACING_DATA; - size = read_tracing_data_size(fd, pattrs); + size = read_tracing_data_size(fd, &evlist->entries); if (size <= 0) return size; aligned_size = ALIGN(size, sizeof(u64)); @@ -1154,7 +1157,7 @@ int event__synthesize_tracing_data(int fd, struct list_head *pattrs, process(&ev, NULL, session); - err = read_tracing_data(fd, pattrs); + err = read_tracing_data(fd, &evlist->entries); write_padded(fd, NULL, 0, padding); return aligned_size; diff --git a/tools/perf/util/header.h b/tools/perf/util/header.h index 33f16be7b72f..65afd7f74e0d 100644 --- a/tools/perf/util/header.h +++ b/tools/perf/util/header.h @@ -65,8 +65,11 @@ struct perf_header { int perf_header__init(struct perf_header *self); void perf_header__exit(struct perf_header *self); +struct perf_evlist; + int perf_header__read(struct perf_session *session, int fd); -int perf_header__write(struct perf_header *self, int fd, bool at_exit); +int perf_header__write(struct perf_header *self, struct perf_evlist *evlist, + int fd, bool at_exit); int perf_header__write_pipe(int fd); int perf_header__add_attr(struct perf_header *self, @@ -113,7 +116,7 @@ int event__synthesize_event_types(event__handler_t process, int event__process_event_type(event_t *self, struct perf_session *session); -int 
event__synthesize_tracing_data(int fd, struct list_head *pattrs, +int event__synthesize_tracing_data(int fd, struct perf_evlist *evlist, event__handler_t process, struct perf_session *session); int event__process_tracing_data(event_t *self, diff --git a/tools/perf/util/include/linux/list.h b/tools/perf/util/include/linux/list.h index f5ca26e53fbb..356c7e467b83 100644 --- a/tools/perf/util/include/linux/list.h +++ b/tools/perf/util/include/linux/list.h @@ -1,3 +1,4 @@ +#include #include "../../../../include/linux/list.h" #ifndef PERF_LIST_H diff --git a/tools/perf/util/parse-events.c b/tools/perf/util/parse-events.c index 135f69baf966..d3086cecd2dd 100644 --- a/tools/perf/util/parse-events.c +++ b/tools/perf/util/parse-events.c @@ -1,6 +1,7 @@ #include "../../../include/linux/hw_breakpoint.h" #include "util.h" #include "../perf.h" +#include "evlist.h" #include "evsel.h" #include "parse-options.h" #include "parse-events.h" @@ -11,10 +12,6 @@ #include "header.h" #include "debugfs.h" -int nr_counters; - -LIST_HEAD(evsel_list); - struct event_symbol { u8 type; u64 config; @@ -778,8 +775,9 @@ modifier: return ret; } -int parse_events(const struct option *opt __used, const char *str, int unset __used) +int parse_events(const struct option *opt, const char *str, int unset __used) { + struct perf_evlist *evlist = *(struct perf_evlist **)opt->value; struct perf_event_attr attr; enum event_result ret; @@ -794,12 +792,10 @@ int parse_events(const struct option *opt __used, const char *str, int unset __u if (ret != EVT_HANDLED_ALL) { struct perf_evsel *evsel; - evsel = perf_evsel__new(&attr, - nr_counters); + evsel = perf_evsel__new(&attr, evlist->nr_entries); if (evsel == NULL) return -1; - list_add_tail(&evsel->node, &evsel_list); - ++nr_counters; + perf_evlist__add(evlist, evsel); } if (*str == 0) @@ -813,13 +809,14 @@ int parse_events(const struct option *opt __used, const char *str, int unset __u return 0; } -int parse_filter(const struct option *opt __used, const char *str, +int parse_filter(const struct option *opt, const char *str, int unset __used) { + struct perf_evlist *evlist = *(struct perf_evlist **)opt->value; struct perf_evsel *last = NULL; - if (!list_empty(&evsel_list)) - last = list_entry(evsel_list.prev, struct perf_evsel, node); + if (evlist->nr_entries > 0) + last = list_entry(evlist->entries.prev, struct perf_evsel, node); if (last == NULL || last->attr.type != PERF_TYPE_TRACEPOINT) { fprintf(stderr, @@ -981,33 +978,3 @@ void print_events(void) exit(129); } - -int perf_evsel_list__create_default(void) -{ - struct perf_evsel *evsel; - struct perf_event_attr attr; - - memset(&attr, 0, sizeof(attr)); - attr.type = PERF_TYPE_HARDWARE; - attr.config = PERF_COUNT_HW_CPU_CYCLES; - - evsel = perf_evsel__new(&attr, 0); - - if (evsel == NULL) - return -ENOMEM; - - list_add(&evsel->node, &evsel_list); - ++nr_counters; - return 0; -} - -void perf_evsel_list__delete(void) -{ - struct perf_evsel *pos, *n; - - list_for_each_entry_safe(pos, n, &evsel_list, node) { - list_del_init(&pos->node); - perf_evsel__delete(pos); - } - nr_counters = 0; -} diff --git a/tools/perf/util/parse-events.h b/tools/perf/util/parse-events.h index 458e3ecf17af..cf7e94abb676 100644 --- a/tools/perf/util/parse-events.h +++ b/tools/perf/util/parse-events.h @@ -9,11 +9,6 @@ struct list_head; struct perf_evsel; -extern struct list_head evsel_list; - -int perf_evsel_list__create_default(void); -void perf_evsel_list__delete(void); - struct option; struct tracepoint_path { @@ -25,8 +20,6 @@ struct tracepoint_path { extern 
struct tracepoint_path *tracepoint_id_to_path(u64 config); extern bool have_tracepoints(struct list_head *evlist); -extern int nr_counters; - const char *event_name(struct perf_evsel *event); extern const char *__event_name(int type, u64 config); From 5c581041cf97aa7980b442de81ddea8273d6dcde Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Tue, 11 Jan 2011 22:30:02 -0200 Subject: [PATCH 011/706] perf evlist: Adopt the pollfd array Allocate just the space needed for nr_cpus * nr_threads * nr_evsels pollfd entries, instead of sizing the array for MAX_NR_CPUS and MAX_COUNTERS. LKML-Reference: Cc: Frederic Weisbecker Cc: Ingo Molnar Cc: Mike Galbraith Cc: Paul Mackerras Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Tom Zanussi Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-record.c | 20 +++++++------------- tools/perf/builtin-top.c | 26 +++++++++++--------------- tools/perf/util/evlist.c | 9 +++++++++ tools/perf/util/evlist.h | 6 ++++++ 4 files changed, 33 insertions(+), 28 deletions(-) diff --git a/tools/perf/builtin-record.c b/tools/perf/builtin-record.c index 252ace873d32..1614d89b4765 100644 --- a/tools/perf/builtin-record.c +++ b/tools/perf/builtin-record.c @@ -72,9 +72,6 @@ static struct perf_evlist *evsel_list; static long samples = 0; static u64 bytes_written = 0; -static struct pollfd *event_array; - -static int nr_poll = 0; static int nr_cpu = 0; static int file_new = 1; @@ -432,9 +429,9 @@ try_again: exit(-1); } - event_array[nr_poll].fd = FD(evsel, nr_cpu, thread_index); - event_array[nr_poll].events = POLLIN; - nr_poll++; + evlist->pollfd[evlist->nr_fds].fd = FD(evsel, nr_cpu, thread_index); + evlist->pollfd[evlist->nr_fds].events = POLLIN; + evlist->nr_fds++; } if (filter != NULL) { @@ -793,7 +790,7 @@ static int __cmd_record(int argc, const char **argv) if (hits == samples) { if (done) break; - err = poll(event_array, nr_poll, -1); + err = poll(evsel_list->pollfd, evsel_list->nr_fds, -1); waking++; } @@ -948,9 +945,8 @@ int cmd_record(int argc, const char **argv, const char *prefix __used) if (perf_header__push_event(pos->attr.config, event_name(pos))) goto out_free_fd; } - event_array = malloc((sizeof(struct pollfd) * MAX_NR_CPUS * - MAX_COUNTERS * threads->nr)); - if (!event_array) + + if (perf_evlist__alloc_pollfd(evsel_list, cpus->nr, threads->nr) < 0) goto out_free_fd; if (user_interval != ULLONG_MAX) @@ -968,13 +964,11 @@ int cmd_record(int argc, const char **argv, const char *prefix __used) } else { fprintf(stderr, "frequency and count are zero, aborting\n"); err = -EINVAL; - goto out_free_event_array; + goto out_free_fd; } err = __cmd_record(argc, argv); -out_free_event_array: - free(event_array); out_free_fd: thread_map__delete(threads); threads = NULL; diff --git a/tools/perf/builtin-top.c b/tools/perf/builtin-top.c index 216b62ed4b89..1bc465215fc6 100644 --- a/tools/perf/builtin-top.c +++ b/tools/perf/builtin-top.c @@ -1193,8 +1193,6 @@ static void perf_session__mmap_read_counter(struct perf_session *self, md->prev = old; } -static struct pollfd *event_array; - static void perf_session__mmap_read(struct perf_session *self) { struct perf_evsel *counter; @@ -1212,10 +1210,10 @@ static void perf_session__mmap_read(struct perf_session *self) } } -int nr_poll; int group_fd; -static void start_counter(int i, struct perf_evsel *evsel) +static void start_counter(int i, struct perf_evlist *evlist, + struct perf_evsel *evsel) { struct xyarray *mmap_array = evsel->priv; struct mmap_data *mm; @@ -1281,9 +1279,9 @@ try_again: if (group && group_fd == -1) group_fd = FD(evsel, i, thread_index); -
event_array[nr_poll].fd = FD(evsel, i, thread_index); - event_array[nr_poll].events = POLLIN; - nr_poll++; + evlist->pollfd[evlist->nr_fds].fd = FD(evsel, i, thread_index); + evlist->pollfd[evlist->nr_fds].events = POLLIN; + evlist->nr_fds++; mm = xyarray__entry(mmap_array, i, thread_index); mm->prev = 0; @@ -1316,11 +1314,11 @@ static int __cmd_top(void) for (i = 0; i < cpus->nr; i++) { group_fd = -1; list_for_each_entry(counter, &evsel_list->entries, node) - start_counter(i, counter); + start_counter(i, evsel_list, counter); } /* Wait for a minimal set of events before starting the snapshot */ - poll(&event_array[0], nr_poll, 100); + poll(evsel_list->pollfd, evsel_list->nr_fds, 100); perf_session__mmap_read(session); @@ -1345,7 +1343,7 @@ static int __cmd_top(void) perf_session__mmap_read(session); if (hits == samples) - ret = poll(event_array, nr_poll, 100); + ret = poll(evsel_list->pollfd, evsel_list->nr_fds, 100); } return 0; @@ -1426,11 +1424,6 @@ int cmd_top(int argc, const char **argv, const char *prefix __used) usage_with_options(top_usage, options); } - event_array = malloc((sizeof(struct pollfd) * - MAX_NR_CPUS * MAX_COUNTERS * threads->nr)); - if (!event_array) - return -ENOMEM; - /* CPU and PID are mutually exclusive */ if (target_tid > 0 && cpu_list) { printf("WARNING: PID switch overriding CPU\n"); @@ -1480,6 +1473,9 @@ int cmd_top(int argc, const char **argv, const char *prefix __used) pos->attr.sample_period = default_interval; } + if (perf_evlist__alloc_pollfd(evsel_list, cpus->nr, threads->nr) < 0) + goto out_free_fd; + sym_evsel = list_entry(evsel_list->entries.next, struct perf_evsel, node); symbol_conf.priv_size = (sizeof(struct sym_entry) + diff --git a/tools/perf/util/evlist.c b/tools/perf/util/evlist.c index 7b4faec23737..2abf949259d0 100644 --- a/tools/perf/util/evlist.c +++ b/tools/perf/util/evlist.c @@ -1,3 +1,4 @@ +#include #include "evlist.h" #include "evsel.h" #include "util.h" @@ -28,6 +29,7 @@ static void perf_evlist__purge(struct perf_evlist *evlist) void perf_evlist__delete(struct perf_evlist *evlist) { perf_evlist__purge(evlist); + free(evlist->pollfd); free(evlist); } @@ -51,3 +53,10 @@ int perf_evlist__add_default(struct perf_evlist *evlist) perf_evlist__add(evlist, evsel); return 0; } + +int perf_evlist__alloc_pollfd(struct perf_evlist *evlist, int ncpus, int nthreads) +{ + int nfds = ncpus * nthreads * evlist->nr_entries; + evlist->pollfd = malloc(sizeof(struct pollfd) * nfds); + return evlist->pollfd != NULL ? 0 : -ENOMEM; +} diff --git a/tools/perf/util/evlist.h b/tools/perf/util/evlist.h index 48db91a8abf3..a7d7e122e3c6 100644 --- a/tools/perf/util/evlist.h +++ b/tools/perf/util/evlist.h @@ -3,9 +3,13 @@ #include +struct pollfd; + struct perf_evlist { struct list_head entries; int nr_entries; + int nr_fds; + struct pollfd *pollfd; }; struct perf_evsel; @@ -16,4 +20,6 @@ void perf_evlist__delete(struct perf_evlist *evlist); void perf_evlist__add(struct perf_evlist *evlist, struct perf_evsel *entry); int perf_evlist__add_default(struct perf_evlist *evlist); +int perf_evlist__alloc_pollfd(struct perf_evlist *evlist, int ncpus, int nthreads); + #endif /* __PERF_EVLIST_H */ From f08199d314458610d4ca52f8e86e0a4ec7a7bc54 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Tue, 11 Jan 2011 23:42:19 -0200 Subject: [PATCH 012/706] perf evsel: Support event groups perf_evsel__open() now has an extra boolean argument specifying whether event grouping is desired. The first file descriptor created on a CPU becomes the group leader.
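To illustrate the intended use, here is a minimal caller-side sketch, not part of the patch; it assumes an evsel and thread map already set up as in builtin-test.c:

	/*
	 * With the group argument set, the first fd opened on each CPU
	 * becomes the group leader and is passed as the group_fd argument
	 * to the sys_perf_event_open() calls for the remaining threads,
	 * so the kernel schedules those counters as a unit.
	 */
	if (perf_evsel__open_per_thread(evsel, threads, true) < 0)
		pr_debug("failed to open counter: %s\n", strerror(errno));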
Cc: Frederic Weisbecker Cc: Ingo Molnar Cc: Mike Galbraith Cc: Paul Mackerras Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Tom Zanussi LKML-Reference: Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-stat.c | 4 ++-- tools/perf/builtin-test.c | 4 ++-- tools/perf/util/evsel.c | 27 +++++++++++++++++---------- tools/perf/util/evsel.h | 10 ++++++---- 4 files changed, 27 insertions(+), 18 deletions(-) diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c index da9090245934..b5fe522f11dc 100644 --- a/tools/perf/builtin-stat.c +++ b/tools/perf/builtin-stat.c @@ -169,7 +169,7 @@ static int create_perf_stat_counter(struct perf_evsel *evsel) PERF_FORMAT_TOTAL_TIME_RUNNING; if (system_wide) - return perf_evsel__open_per_cpu(evsel, cpus); + return perf_evsel__open_per_cpu(evsel, cpus, false); attr->inherit = !no_inherit; if (target_pid == -1 && target_tid == -1) { @@ -177,7 +177,7 @@ static int create_perf_stat_counter(struct perf_evsel *evsel) attr->enable_on_exec = 1; } - return perf_evsel__open_per_thread(evsel, threads); + return perf_evsel__open_per_thread(evsel, threads, false); } /* diff --git a/tools/perf/builtin-test.c b/tools/perf/builtin-test.c index 5dcdba653d70..4282d671b161 100644 --- a/tools/perf/builtin-test.c +++ b/tools/perf/builtin-test.c @@ -289,7 +289,7 @@ static int test__open_syscall_event(void) goto out_thread_map_delete; } - if (perf_evsel__open_per_thread(evsel, threads) < 0) { + if (perf_evsel__open_per_thread(evsel, threads, false) < 0) { pr_debug("failed to open counter: %s, " "tweak /proc/sys/kernel/perf_event_paranoid?\n", strerror(errno)); @@ -364,7 +364,7 @@ static int test__open_syscall_event_on_all_cpus(void) goto out_thread_map_delete; } - if (perf_evsel__open(evsel, cpus, threads) < 0) { + if (perf_evsel__open(evsel, cpus, threads, false) < 0) { pr_debug("failed to open counter: %s, " "tweak /proc/sys/kernel/perf_event_paranoid?\n", strerror(errno)); diff --git a/tools/perf/util/evsel.c b/tools/perf/util/evsel.c index f5cfed60af98..da473ec93c75 100644 --- a/tools/perf/util/evsel.c +++ b/tools/perf/util/evsel.c @@ -128,7 +128,7 @@ int __perf_evsel__read(struct perf_evsel *evsel, } static int __perf_evsel__open(struct perf_evsel *evsel, struct cpu_map *cpus, - struct thread_map *threads) + struct thread_map *threads, bool group) { int cpu, thread; @@ -137,12 +137,18 @@ static int __perf_evsel__open(struct perf_evsel *evsel, struct cpu_map *cpus, return -1; for (cpu = 0; cpu < cpus->nr; cpu++) { + int group_fd = -1; + for (thread = 0; thread < threads->nr; thread++) { FD(evsel, cpu, thread) = sys_perf_event_open(&evsel->attr, threads->map[thread], - cpus->map[cpu], -1, 0); + cpus->map[cpu], + group_fd, 0); if (FD(evsel, cpu, thread) < 0) goto out_close; + + if (group && group_fd == -1) + group_fd = FD(evsel, cpu, thread); } } @@ -175,10 +181,9 @@ static struct { .threads = { -1, }, }; -int perf_evsel__open(struct perf_evsel *evsel, - struct cpu_map *cpus, struct thread_map *threads) +int perf_evsel__open(struct perf_evsel *evsel, struct cpu_map *cpus, + struct thread_map *threads, bool group) { - if (cpus == NULL) { /* Work around old compiler warnings about strict aliasing */ cpus = &empty_cpu_map.map; @@ -187,15 +192,17 @@ int perf_evsel__open(struct perf_evsel *evsel, if (threads == NULL) threads = &empty_thread_map.map; - return __perf_evsel__open(evsel, cpus, threads); + return __perf_evsel__open(evsel, cpus, threads, group); } -int perf_evsel__open_per_cpu(struct perf_evsel *evsel, struct cpu_map *cpus) +int 
perf_evsel__open_per_cpu(struct perf_evsel *evsel, + struct cpu_map *cpus, bool group) { - return __perf_evsel__open(evsel, cpus, &empty_thread_map.map); + return __perf_evsel__open(evsel, cpus, &empty_thread_map.map, group); } -int perf_evsel__open_per_thread(struct perf_evsel *evsel, struct thread_map *threads) +int perf_evsel__open_per_thread(struct perf_evsel *evsel, + struct thread_map *threads, bool group) { - return __perf_evsel__open(evsel, &empty_cpu_map.map, threads); + return __perf_evsel__open(evsel, &empty_cpu_map.map, threads, group); } diff --git a/tools/perf/util/evsel.h b/tools/perf/util/evsel.h index b2d755fe88a5..0962b500cb6d 100644 --- a/tools/perf/util/evsel.h +++ b/tools/perf/util/evsel.h @@ -45,10 +45,12 @@ int perf_evsel__alloc_counts(struct perf_evsel *evsel, int ncpus); void perf_evsel__free_fd(struct perf_evsel *evsel); void perf_evsel__close_fd(struct perf_evsel *evsel, int ncpus, int nthreads); -int perf_evsel__open_per_cpu(struct perf_evsel *evsel, struct cpu_map *cpus); -int perf_evsel__open_per_thread(struct perf_evsel *evsel, struct thread_map *threads); -int perf_evsel__open(struct perf_evsel *evsel, - struct cpu_map *cpus, struct thread_map *threads); +int perf_evsel__open_per_cpu(struct perf_evsel *evsel, + struct cpu_map *cpus, bool group); +int perf_evsel__open_per_thread(struct perf_evsel *evsel, + struct thread_map *threads, bool group); +int perf_evsel__open(struct perf_evsel *evsel, struct cpu_map *cpus, + struct thread_map *threads, bool group); #define perf_evsel__match(evsel, t, c) \ (evsel->attr.type == PERF_TYPE_##t && \ From 9d04f1781772e11bd58806391555fc23ebb54377 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Wed, 12 Jan 2011 00:08:18 -0200 Subject: [PATCH 013/706] perf evsel: Allow specifying if the inherit bit should be set As this is a per-cpu attribute, we can't set it up in advance and use it for all the calls. 
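Concretely, the decision now happens per cpu inside __perf_evsel__open(), as in this sketch of the line the patch adds (see the diff below):

	/*
	 * cpus->map[cpu] is -1 when the counter is not bound to a CPU
	 * (per-thread counters); only then is the caller's inherit
	 * request honoured.
	 */
	evsel->attr.inherit = (cpus->map[cpu] < 0) && inherit;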
Cc: Frederic Weisbecker Cc: Ingo Molnar Cc: Mike Galbraith Cc: Paul Mackerras Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Tom Zanussi LKML-Reference: Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-stat.c | 4 ++-- tools/perf/builtin-test.c | 4 ++-- tools/perf/util/evsel.c | 16 +++++++++------- tools/perf/util/evsel.h | 6 +++--- 4 files changed, 16 insertions(+), 14 deletions(-) diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c index b5fe522f11dc..e2a2d02c5c43 100644 --- a/tools/perf/builtin-stat.c +++ b/tools/perf/builtin-stat.c @@ -169,7 +169,7 @@ static int create_perf_stat_counter(struct perf_evsel *evsel) PERF_FORMAT_TOTAL_TIME_RUNNING; if (system_wide) - return perf_evsel__open_per_cpu(evsel, cpus, false); + return perf_evsel__open_per_cpu(evsel, cpus, false, false); attr->inherit = !no_inherit; if (target_pid == -1 && target_tid == -1) { @@ -177,7 +177,7 @@ static int create_perf_stat_counter(struct perf_evsel *evsel) attr->enable_on_exec = 1; } - return perf_evsel__open_per_thread(evsel, threads, false); + return perf_evsel__open_per_thread(evsel, threads, false, false); } /* diff --git a/tools/perf/builtin-test.c b/tools/perf/builtin-test.c index 4282d671b161..7287158c4830 100644 --- a/tools/perf/builtin-test.c +++ b/tools/perf/builtin-test.c @@ -289,7 +289,7 @@ static int test__open_syscall_event(void) goto out_thread_map_delete; } - if (perf_evsel__open_per_thread(evsel, threads, false) < 0) { + if (perf_evsel__open_per_thread(evsel, threads, false, false) < 0) { pr_debug("failed to open counter: %s, " "tweak /proc/sys/kernel/perf_event_paranoid?\n", strerror(errno)); @@ -364,7 +364,7 @@ static int test__open_syscall_event_on_all_cpus(void) goto out_thread_map_delete; } - if (perf_evsel__open(evsel, cpus, threads, false) < 0) { + if (perf_evsel__open(evsel, cpus, threads, false, false) < 0) { pr_debug("failed to open counter: %s, " "tweak /proc/sys/kernel/perf_event_paranoid?\n", strerror(errno)); diff --git a/tools/perf/util/evsel.c b/tools/perf/util/evsel.c index da473ec93c75..82a00536892a 100644 --- a/tools/perf/util/evsel.c +++ b/tools/perf/util/evsel.c @@ -128,7 +128,7 @@ int __perf_evsel__read(struct perf_evsel *evsel, } static int __perf_evsel__open(struct perf_evsel *evsel, struct cpu_map *cpus, - struct thread_map *threads, bool group) + struct thread_map *threads, bool group, bool inherit) { int cpu, thread; @@ -139,6 +139,8 @@ static int __perf_evsel__open(struct perf_evsel *evsel, struct cpu_map *cpus, for (cpu = 0; cpu < cpus->nr; cpu++) { int group_fd = -1; + evsel->attr.inherit = (cpus->map[cpu] < 0) && inherit; + for (thread = 0; thread < threads->nr; thread++) { FD(evsel, cpu, thread) = sys_perf_event_open(&evsel->attr, threads->map[thread], @@ -182,7 +184,7 @@ static struct { }; int perf_evsel__open(struct perf_evsel *evsel, struct cpu_map *cpus, - struct thread_map *threads, bool group) + struct thread_map *threads, bool group, bool inherit) { if (cpus == NULL) { /* Work around old compiler warnings about strict aliasing */ @@ -192,17 +194,17 @@ int perf_evsel__open(struct perf_evsel *evsel, struct cpu_map *cpus, if (threads == NULL) threads = &empty_thread_map.map; - return __perf_evsel__open(evsel, cpus, threads, group); + return __perf_evsel__open(evsel, cpus, threads, group, inherit); } int perf_evsel__open_per_cpu(struct perf_evsel *evsel, - struct cpu_map *cpus, bool group) + struct cpu_map *cpus, bool group, bool inherit) { - return __perf_evsel__open(evsel, cpus, &empty_thread_map.map, group); + return 
__perf_evsel__open(evsel, cpus, &empty_thread_map.map, group, inherit); } int perf_evsel__open_per_thread(struct perf_evsel *evsel, - struct thread_map *threads, bool group) + struct thread_map *threads, bool group, bool inherit) { - return __perf_evsel__open(evsel, &empty_cpu_map.map, threads, group); + return __perf_evsel__open(evsel, &empty_cpu_map.map, threads, group, inherit); } diff --git a/tools/perf/util/evsel.h b/tools/perf/util/evsel.h index 0962b500cb6d..1594696bd127 100644 --- a/tools/perf/util/evsel.h +++ b/tools/perf/util/evsel.h @@ -46,11 +46,11 @@ void perf_evsel__free_fd(struct perf_evsel *evsel); void perf_evsel__close_fd(struct perf_evsel *evsel, int ncpus, int nthreads); int perf_evsel__open_per_cpu(struct perf_evsel *evsel, - struct cpu_map *cpus, bool group); + struct cpu_map *cpus, bool group, bool inherit); int perf_evsel__open_per_thread(struct perf_evsel *evsel, - struct thread_map *threads, bool group); + struct thread_map *threads, bool group, bool inherit); int perf_evsel__open(struct perf_evsel *evsel, struct cpu_map *cpus, - struct thread_map *threads, bool group); + struct thread_map *threads, bool group, bool inherit); #define perf_evsel__match(evsel, t, c) \ (evsel->attr.type == PERF_TYPE_##t && \ From 72cb7013e08dec29631e0447f9496b7bacd3e14b Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Wed, 12 Jan 2011 10:52:47 -0200 Subject: [PATCH 014/706] perf top: Use perf_evsel__open Now that it handles group_fd and inherit we can use it, sharing it with stat. Next steps: make 'perf record' use it too, then move the mmap_array out of ->priv and into perf_evsel, with top and record sharing this, and at the same time write a 'perf test' stress test. Cc: Frederic Weisbecker Cc: Ingo Molnar Cc: Mike Galbraith Cc: Paul Mackerras Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Tom Zanussi LKML-Reference: Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-top.c | 122 ++++++++++++++++++--------------------- 1 file changed, 57 insertions(+), 65 deletions(-) diff --git a/tools/perf/builtin-top.c b/tools/perf/builtin-top.c index 1bc465215fc6..15d89bede2fb 100644 --- a/tools/perf/builtin-top.c +++ b/tools/perf/builtin-top.c @@ -1210,75 +1210,17 @@ static void perf_session__mmap_read(struct perf_session *self) } } -int group_fd; - static void start_counter(int i, struct perf_evlist *evlist, struct perf_evsel *evsel) { struct xyarray *mmap_array = evsel->priv; struct mmap_data *mm; - struct perf_event_attr *attr; - int cpu = -1; int thread_index; - if (target_tid == -1) - cpu = cpus->map[i]; - - attr = &evsel->attr; - - attr->sample_type = PERF_SAMPLE_IP | PERF_SAMPLE_TID; - - if (freq) { - attr->sample_type |= PERF_SAMPLE_PERIOD; - attr->freq = 1; - attr->sample_freq = freq; - } - - attr->inherit = (cpu < 0) && inherit; - attr->mmap = 1; - for (thread_index = 0; thread_index < threads->nr; thread_index++) { -try_again: - FD(evsel, i, thread_index) = sys_perf_event_open(attr, - threads->map[thread_index], cpu, group_fd, 0); - - if (FD(evsel, i, thread_index) < 0) { - int err = errno; - - if (err == EPERM || err == EACCES) - die("Permission error - are you root?\n" - "\t Consider tweaking" - " /proc/sys/kernel/perf_event_paranoid.\n"); - /* - * If it's cycles then fall back to hrtimer - * based cpu-clock-tick sw counter, which - * is always available even if no PMU support: - */ - if (attr->type == PERF_TYPE_HARDWARE - && attr->config == PERF_COUNT_HW_CPU_CYCLES) { - - if (verbose) - warning(" ... 
trying to fall back to cpu-clock-ticks\n"); - - attr->type = PERF_TYPE_SOFTWARE; - attr->config = PERF_COUNT_SW_CPU_CLOCK; - goto try_again; - } - printf("\n"); - error("sys_perf_event_open() syscall returned with %d (%s). /bin/dmesg may provide additional information.\n", - FD(evsel, i, thread_index), strerror(err)); - die("No CONFIG_PERF_EVENTS=y kernel support configured?\n"); - exit(-1); - } assert(FD(evsel, i, thread_index) >= 0); fcntl(FD(evsel, i, thread_index), F_SETFL, O_NONBLOCK); - /* - * First counter acts as the group leader: - */ - if (group && group_fd == -1) - group_fd = FD(evsel, i, thread_index); - evlist->pollfd[evlist->nr_fds].fd = FD(evsel, i, thread_index); evlist->pollfd[evlist->nr_fds].events = POLLIN; evlist->nr_fds++; @@ -1293,11 +1235,65 @@ try_again: } } +static void start_counters(struct perf_evlist *evlist) +{ + struct perf_evsel *counter; + int i; + + list_for_each_entry(counter, &evlist->entries, node) { + struct perf_event_attr *attr = &counter->attr; + + attr->sample_type = PERF_SAMPLE_IP | PERF_SAMPLE_TID; + + if (freq) { + attr->sample_type |= PERF_SAMPLE_PERIOD; + attr->freq = 1; + attr->sample_freq = freq; + } + + attr->mmap = 1; +try_again: + if (perf_evsel__open(counter, cpus, threads, group, inherit) < 0) { + int err = errno; + + if (err == EPERM || err == EACCES) + die("Permission error - are you root?\n" + "\t Consider tweaking" + " /proc/sys/kernel/perf_event_paranoid.\n"); + /* + * If it's cycles then fall back to hrtimer + * based cpu-clock-tick sw counter, which + * is always available even if no PMU support: + */ + if (attr->type == PERF_TYPE_HARDWARE && + attr->config == PERF_COUNT_HW_CPU_CYCLES) { + + if (verbose) + warning(" ... trying to fall back to cpu-clock-ticks\n"); + + attr->type = PERF_TYPE_SOFTWARE; + attr->config = PERF_COUNT_SW_CPU_CLOCK; + goto try_again; + } + printf("\n"); + error("sys_perf_event_open() syscall returned with %d " + "(%s). /bin/dmesg may provide additional information.\n", + err, strerror(err)); + die("No CONFIG_PERF_EVENTS=y kernel support configured?\n"); + exit(-1); + } + } + + for (i = 0; i < cpus->nr; i++) { + list_for_each_entry(counter, &evlist->entries, node) + start_counter(i, evsel_list, counter); + } +} + static int __cmd_top(void) { pthread_t thread; - struct perf_evsel *counter; - int i, ret; + int ret; /* * FIXME: perf_session__new should allow passing a O_MMAP, so that all this * mmap reading, etc is encapsulated in it. Use O_WRONLY for now. @@ -1311,11 +1307,7 @@ static int __cmd_top(void) else event__synthesize_threads(event__process, session); - for (i = 0; i < cpus->nr; i++) { - group_fd = -1; - list_for_each_entry(counter, &evsel_list->entries, node) - start_counter(i, evsel_list, counter); - } + start_counters(evsel_list); /* Wait for a minimal set of events before starting the snapshot */ poll(evsel_list->pollfd, evsel_list->nr_fds, 100); From dd7927f4f8ee75b032ff15aeef4bda49719a443a Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Wed, 12 Jan 2011 14:28:51 -0200 Subject: [PATCH 015/706] perf record: Use perf_evsel__open Now it's time to factor out the mmap handling bits into the perf_evsel class. 
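In outline, record's per-event setup now reduces to the following sketch, not part of the patch itself; the real open_counters() in the diff below wraps the open in fallback and error handling:

	list_for_each_entry(pos, &evlist->entries, node) {
		/* Configure the attr bits, then open across cpus/threads. */
		config_attr(pos, evlist);
		if (perf_evsel__open(pos, cpus, threads, group, !no_inherit) < 0)
			die("failed to open counters\n");
	}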
Cc: Frederic Weisbecker Cc: Ingo Molnar Cc: Mike Galbraith Cc: Paul Mackerras Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Tom Zanussi LKML-Reference: Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-record.c | 233 +++++++++++++++++------------------- 1 file changed, 113 insertions(+), 120 deletions(-) diff --git a/tools/perf/builtin-record.c b/tools/perf/builtin-record.c index 1614d89b4765..ec43f2eb7b72 100644 --- a/tools/perf/builtin-record.c +++ b/tools/perf/builtin-record.c @@ -72,8 +72,6 @@ static struct perf_evlist *evsel_list; static long samples = 0; static u64 bytes_written = 0; -static int nr_cpu = 0; - static int file_new = 1; static off_t post_processing_offset; @@ -208,8 +206,6 @@ static void sig_atexit(void) kill(getpid(), signr); } -static int group_fd; - static struct perf_header_attr *get_header_attr(struct perf_event_attr *a, int nr) { struct perf_header_attr *h_attr; @@ -234,7 +230,6 @@ static void create_counter(struct perf_evlist *evlist, char *filter = evsel->filter; struct perf_event_attr *attr = &evsel->attr; struct perf_header_attr *h_attr; - int track = !evsel->idx; /* only the first counter needs these */ int thread_index; int ret; struct { @@ -243,19 +238,77 @@ static void create_counter(struct perf_evlist *evlist, u64 time_running; u64 id; } read_data; - /* - * Check if parse_single_tracepoint_event has already asked for - * PERF_SAMPLE_TIME. - * - * XXX this is kludgy but short term fix for problems introduced by - * eac23d1c that broke 'perf script' by having different sample_types - * when using multiple tracepoint events when we use a perf binary - * that tries to use sample_id_all on an older kernel. - * - * We need to move counter creation to perf_session, support - * different sample_types, etc. - */ - bool time_needed = attr->sample_type & PERF_SAMPLE_TIME; + + for (thread_index = 0; thread_index < threads->nr; thread_index++) { + h_attr = get_header_attr(attr, evsel->idx); + if (h_attr == NULL) + die("nomem\n"); + + if (!file_new) { + if (memcmp(&h_attr->attr, attr, sizeof(*attr))) { + fprintf(stderr, "incompatible append\n"); + exit(-1); + } + } + + if (read(FD(evsel, cpu, thread_index), &read_data, sizeof(read_data)) == -1) { + perror("Unable to read perf file descriptor"); + exit(-1); + } + + if (perf_header_attr__add_id(h_attr, read_data.id) < 0) { + pr_warning("Not enough memory to add id\n"); + exit(-1); + } + + assert(FD(evsel, cpu, thread_index) >= 0); + fcntl(FD(evsel, cpu, thread_index), F_SETFL, O_NONBLOCK); + + if (evsel->idx || thread_index) { + struct perf_evsel *first; + first = list_entry(evlist->entries.next, struct perf_evsel, node); + ret = ioctl(FD(evsel, cpu, thread_index), + PERF_EVENT_IOC_SET_OUTPUT, + FD(first, cpu, 0)); + if (ret) { + error("failed to set output: %d (%s)\n", errno, + strerror(errno)); + exit(-1); + } + } else { + mmap_array[cpu].prev = 0; + mmap_array[cpu].mask = mmap_pages*page_size - 1; + mmap_array[cpu].base = mmap(NULL, (mmap_pages+1)*page_size, + PROT_READ | PROT_WRITE, MAP_SHARED, FD(evsel, cpu, thread_index), 0); + if (mmap_array[cpu].base == MAP_FAILED) { + error("failed to mmap with %d (%s)\n", errno, strerror(errno)); + exit(-1); + } + + evlist->pollfd[evlist->nr_fds].fd = FD(evsel, cpu, thread_index); + evlist->pollfd[evlist->nr_fds].events = POLLIN; + evlist->nr_fds++; + } + + if (filter != NULL) { + ret = ioctl(FD(evsel, cpu, thread_index), + PERF_EVENT_IOC_SET_FILTER, filter); + if (ret) { + error("failed to set filter with %d (%s)\n", errno, + strerror(errno)); + exit(-1); + } + } + 
} + + if (!sample_type) + sample_type = attr->sample_type; +} + +static void config_attr(struct perf_evsel *evsel, struct perf_evlist *evlist) +{ + struct perf_event_attr *attr = &evsel->attr; + int track = !evsel->idx; /* only the first counter needs these */ attr->read_format = PERF_FORMAT_TOTAL_TIME_ENABLED | PERF_FORMAT_TOTAL_TIME_RUNNING | @@ -315,19 +368,39 @@ static void create_counter(struct perf_evlist *evlist, attr->mmap = track; attr->comm = track; - attr->inherit = !no_inherit; + if (target_pid == -1 && target_tid == -1 && !system_wide) { attr->disabled = 1; attr->enable_on_exec = 1; } +} + +static void open_counters(struct perf_evlist *evlist) +{ + struct perf_evsel *pos; + int cpu; + + list_for_each_entry(pos, &evlist->entries, node) { + struct perf_event_attr *attr = &pos->attr; + /* + * Check if parse_single_tracepoint_event has already asked for + * PERF_SAMPLE_TIME. + * + * XXX this is kludgy but short term fix for problems introduced by + * eac23d1c that broke 'perf script' by having different sample_types + * when using multiple tracepoint events when we use a perf binary + * that tries to use sample_id_all on an older kernel. + * + * We need to move counter creation to perf_session, support + * different sample_types, etc. + */ + bool time_needed = attr->sample_type & PERF_SAMPLE_TIME; + + config_attr(pos, evlist); retry_sample_id: - attr->sample_id_all = sample_id_all_avail ? 1 : 0; - - for (thread_index = 0; thread_index < threads->nr; thread_index++) { + attr->sample_id_all = sample_id_all_avail ? 1 : 0; try_again: - FD(evsel, nr_cpu, thread_index) = sys_perf_event_open(attr, threads->map[thread_index], cpu, group_fd, 0); - - if (FD(evsel, nr_cpu, thread_index) < 0) { + if (perf_evsel__open(pos, cpus, threads, group, !no_inherit) < 0) { int err = errno; if (err == EPERM || err == EACCES) @@ -364,7 +437,7 @@ try_again: } printf("\n"); error("sys_perf_event_open() syscall returned with %d (%s). 
/bin/dmesg may provide additional information.\n", - FD(evsel, nr_cpu, thread_index), strerror(err)); + err, strerror(err)); #if defined(__i386__) || defined(__x86_64__) if (attr->type == PERF_TYPE_HARDWARE && err == EOPNOTSUPP) @@ -375,90 +448,13 @@ try_again: #endif die("No CONFIG_PERF_EVENTS=y kernel support configured?\n"); - exit(-1); - } - - h_attr = get_header_attr(attr, evsel->idx); - if (h_attr == NULL) - die("nomem\n"); - - if (!file_new) { - if (memcmp(&h_attr->attr, attr, sizeof(*attr))) { - fprintf(stderr, "incompatible append\n"); - exit(-1); - } - } - - if (read(FD(evsel, nr_cpu, thread_index), &read_data, sizeof(read_data)) == -1) { - perror("Unable to read perf file descriptor"); - exit(-1); - } - - if (perf_header_attr__add_id(h_attr, read_data.id) < 0) { - pr_warning("Not enough memory to add id\n"); - exit(-1); - } - - assert(FD(evsel, nr_cpu, thread_index) >= 0); - fcntl(FD(evsel, nr_cpu, thread_index), F_SETFL, O_NONBLOCK); - - /* - * First counter acts as the group leader: - */ - if (group && group_fd == -1) - group_fd = FD(evsel, nr_cpu, thread_index); - - if (evsel->idx || thread_index) { - struct perf_evsel *first; - first = list_entry(evlist->entries.next, struct perf_evsel, node); - ret = ioctl(FD(evsel, nr_cpu, thread_index), - PERF_EVENT_IOC_SET_OUTPUT, - FD(first, nr_cpu, 0)); - if (ret) { - error("failed to set output: %d (%s)\n", errno, - strerror(errno)); - exit(-1); - } - } else { - mmap_array[nr_cpu].prev = 0; - mmap_array[nr_cpu].mask = mmap_pages*page_size - 1; - mmap_array[nr_cpu].base = mmap(NULL, (mmap_pages+1)*page_size, - PROT_READ | PROT_WRITE, MAP_SHARED, FD(evsel, nr_cpu, thread_index), 0); - if (mmap_array[nr_cpu].base == MAP_FAILED) { - error("failed to mmap with %d (%s)\n", errno, strerror(errno)); - exit(-1); - } - - evlist->pollfd[evlist->nr_fds].fd = FD(evsel, nr_cpu, thread_index); - evlist->pollfd[evlist->nr_fds].events = POLLIN; - evlist->nr_fds++; - } - - if (filter != NULL) { - ret = ioctl(FD(evsel, nr_cpu, thread_index), - PERF_EVENT_IOC_SET_FILTER, filter); - if (ret) { - error("failed to set filter with %d (%s)\n", errno, - strerror(errno)); - exit(-1); - } } } - if (!sample_type) - sample_type = attr->sample_type; -} - -static void open_counters(struct perf_evlist *evlist, int cpu) -{ - struct perf_evsel *pos; - - group_fd = -1; - - list_for_each_entry(pos, &evlist->entries, node) - create_counter(evlist, pos, cpu); - - nr_cpu++; + for (cpu = 0; cpu < cpus->nr; ++cpu) { + list_for_each_entry(pos, &evlist->entries, node) + create_counter(evlist, pos, cpu); + } } static int process_buildids(void) @@ -533,7 +529,7 @@ static void mmap_read_all(void) { int i; - for (i = 0; i < nr_cpu; i++) { + for (i = 0; i < cpus->nr; i++) { if (mmap_array[i].base) mmap_read(&mmap_array[i]); } @@ -673,12 +669,7 @@ static int __cmd_record(int argc, const char **argv) close(child_ready_pipe[0]); } - if (!system_wide && no_inherit && !cpu_list) { - open_counters(evsel_list, -1); - } else { - for (i = 0; i < cpus->nr; i++) - open_counters(evsel_list, cpus->map[i]); - } + open_counters(evsel_list); perf_session__set_sample_type(session, sample_type); @@ -795,7 +786,7 @@ static int __cmd_record(int argc, const char **argv) } if (done) { - for (i = 0; i < nr_cpu; i++) { + for (i = 0; i < cpus->nr; i++) { struct perf_evsel *pos; list_for_each_entry(pos, &evsel_list->entries, node) { @@ -933,11 +924,13 @@ int cmd_record(int argc, const char **argv, const char *prefix __used) usage_with_options(record_usage, record_options); } - cpus = cpu_map__new(cpu_list); 
- if (cpus == NULL) { - perror("failed to parse CPUs map"); - return -1; - } + if (target_tid != -1) + cpus = cpu_map__dummy_new(); + else + cpus = cpu_map__new(cpu_list); + + if (cpus == NULL) + usage_with_options(record_usage, record_options); list_for_each_entry(pos, &evsel_list->entries, node) { if (perf_evsel__alloc_fd(pos, cpus->nr, threads->nr) < 0) From 70082dd92c4b288bd723a77897e2b555f0e63113 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Wed, 12 Jan 2011 17:03:24 -0200 Subject: [PATCH 016/706] perf evsel: Introduce mmap support Out of the code in 'perf top'. Record is next in line. Cc: Frederic Weisbecker Cc: Ingo Molnar Cc: Mike Galbraith Cc: Paul Mackerras Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Tom Zanussi LKML-Reference: Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-top.c | 71 +++------------------------------------- tools/perf/perf.h | 14 ++++++++ tools/perf/util/evlist.c | 8 +++++ tools/perf/util/evlist.h | 1 + tools/perf/util/evsel.c | 71 ++++++++++++++++++++++++++++++++++++++++ tools/perf/util/evsel.h | 8 +++++ 6 files changed, 107 insertions(+), 66 deletions(-) diff --git a/tools/perf/builtin-top.c b/tools/perf/builtin-top.c index 15d89bede2fb..7d723ad0bfa9 100644 --- a/tools/perf/builtin-top.c +++ b/tools/perf/builtin-top.c @@ -1095,43 +1095,12 @@ static void event__process_sample(const event_t *self, } } -struct mmap_data { - void *base; - int mask; - unsigned int prev; -}; - -static int perf_evsel__alloc_mmap_per_thread(struct perf_evsel *evsel, - int ncpus, int nthreads) -{ - evsel->priv = xyarray__new(ncpus, nthreads, sizeof(struct mmap_data)); - return evsel->priv != NULL ? 0 : -ENOMEM; -} - -static void perf_evsel__free_mmap(struct perf_evsel *evsel) -{ - xyarray__delete(evsel->priv); - evsel->priv = NULL; -} - -static unsigned int mmap_read_head(struct mmap_data *md) -{ - struct perf_event_mmap_page *pc = md->base; - int head; - - head = pc->data_head; - rmb(); - - return head; -} - static void perf_session__mmap_read_counter(struct perf_session *self, struct perf_evsel *evsel, int cpu, int thread_idx) { - struct xyarray *mmap_array = evsel->priv; - struct mmap_data *md = xyarray__entry(mmap_array, cpu, thread_idx); - unsigned int head = mmap_read_head(md); + struct perf_mmap *md = xyarray__entry(evsel->mmap, cpu, thread_idx); + unsigned int head = perf_mmap__read_head(md); unsigned int old = md->prev; unsigned char *data = md->base + page_size; struct sample_data sample; @@ -1210,35 +1179,9 @@ static void perf_session__mmap_read(struct perf_session *self) } } -static void start_counter(int i, struct perf_evlist *evlist, - struct perf_evsel *evsel) -{ - struct xyarray *mmap_array = evsel->priv; - struct mmap_data *mm; - int thread_index; - - for (thread_index = 0; thread_index < threads->nr; thread_index++) { - assert(FD(evsel, i, thread_index) >= 0); - fcntl(FD(evsel, i, thread_index), F_SETFL, O_NONBLOCK); - - evlist->pollfd[evlist->nr_fds].fd = FD(evsel, i, thread_index); - evlist->pollfd[evlist->nr_fds].events = POLLIN; - evlist->nr_fds++; - - mm = xyarray__entry(mmap_array, i, thread_index); - mm->prev = 0; - mm->mask = mmap_pages*page_size - 1; - mm->base = mmap(NULL, (mmap_pages+1)*page_size, - PROT_READ, MAP_SHARED, FD(evsel, i, thread_index), 0); - if (mm->base == MAP_FAILED) - die("failed to mmap with %d (%s)\n", errno, strerror(errno)); - } -} - static void start_counters(struct perf_evlist *evlist) { struct perf_evsel *counter; - int i; list_for_each_entry(counter, &evlist->entries, node) { struct perf_event_attr 
*attr = &counter->attr; @@ -1282,11 +1225,9 @@ try_again: die("No CONFIG_PERF_EVENTS=y kernel support configured?\n"); exit(-1); } - } - for (i = 0; i < cpus->nr; i++) { - list_for_each_entry(counter, &evlist->entries, node) - start_counter(i, evsel_list, counter); + if (perf_evsel__mmap(counter, cpus, threads, mmap_pages, evlist) < 0) + die("failed to mmap with %d (%s)\n", errno, strerror(errno)); } } @@ -1453,7 +1394,7 @@ int cmd_top(int argc, const char **argv, const char *prefix __used) usage_with_options(top_usage, options); list_for_each_entry(pos, &evsel_list->entries, node) { - if (perf_evsel__alloc_mmap_per_thread(pos, cpus->nr, threads->nr) < 0 || + if (perf_evsel__alloc_mmap(pos, cpus->nr, threads->nr) < 0 || perf_evsel__alloc_fd(pos, cpus->nr, threads->nr) < 0) goto out_free_fd; /* @@ -1485,8 +1426,6 @@ int cmd_top(int argc, const char **argv, const char *prefix __used) status = __cmd_top(); out_free_fd: - list_for_each_entry(pos, &evsel_list->entries, node) - perf_evsel__free_mmap(pos); perf_evlist__delete(evsel_list); return status; diff --git a/tools/perf/perf.h b/tools/perf/perf.h index 95aaf565c704..5fb5e1f11d1c 100644 --- a/tools/perf/perf.h +++ b/tools/perf/perf.h @@ -94,6 +94,20 @@ void get_term_dimensions(struct winsize *ws); #include "util/types.h" #include +struct perf_mmap { + void *base; + int mask; + unsigned int prev; +}; + +static inline unsigned int perf_mmap__read_head(struct perf_mmap *mm) +{ + struct perf_event_mmap_page *pc = mm->base; + int head = pc->data_head; + rmb(); + return head; +} + /* * prctl(PR_TASK_PERF_EVENTS_DISABLE) will (cheaply) disable all * counters in the current task. diff --git a/tools/perf/util/evlist.c b/tools/perf/util/evlist.c index 2abf949259d0..6d4129214ee8 100644 --- a/tools/perf/util/evlist.c +++ b/tools/perf/util/evlist.c @@ -60,3 +60,11 @@ int perf_evlist__alloc_pollfd(struct perf_evlist *evlist, int ncpus, int nthread evlist->pollfd = malloc(sizeof(struct pollfd) * nfds); return evlist->pollfd != NULL ? 
0 : -ENOMEM; } + +void perf_evlist__add_pollfd(struct perf_evlist *evlist, int fd) +{ + fcntl(fd, F_SETFL, O_NONBLOCK); + evlist->pollfd[evlist->nr_fds].fd = fd; + evlist->pollfd[evlist->nr_fds].events = POLLIN; + evlist->nr_fds++; +} diff --git a/tools/perf/util/evlist.h b/tools/perf/util/evlist.h index a7d7e122e3c6..16bbfcba8ca8 100644 --- a/tools/perf/util/evlist.h +++ b/tools/perf/util/evlist.h @@ -21,5 +21,6 @@ void perf_evlist__add(struct perf_evlist *evlist, struct perf_evsel *entry); int perf_evlist__add_default(struct perf_evlist *evlist); int perf_evlist__alloc_pollfd(struct perf_evlist *evlist, int ncpus, int nthreads); +void perf_evlist__add_pollfd(struct perf_evlist *evlist, int fd); #endif /* __PERF_EVLIST_H */ diff --git a/tools/perf/util/evsel.c b/tools/perf/util/evsel.c index 82a00536892a..f5006958f8da 100644 --- a/tools/perf/util/evsel.c +++ b/tools/perf/util/evsel.c @@ -1,9 +1,13 @@ #include "evsel.h" +#include "evlist.h" #include "../perf.h" #include "util.h" #include "cpumap.h" #include "thread.h" +#include +#include + #define FD(e, x, y) (*(int *)xyarray__entry(e->fd, x, y)) struct perf_evsel *perf_evsel__new(struct perf_event_attr *attr, int idx) @@ -49,10 +53,32 @@ void perf_evsel__close_fd(struct perf_evsel *evsel, int ncpus, int nthreads) } } +void perf_evsel__munmap(struct perf_evsel *evsel, int ncpus, int nthreads) +{ + struct perf_mmap *mm; + int cpu, thread; + + for (cpu = 0; cpu < ncpus; cpu++) + for (thread = 0; thread < nthreads; ++thread) { + mm = xyarray__entry(evsel->mmap, cpu, thread); + if (mm->base != NULL) { + munmap(mm->base, evsel->mmap_len); + mm->base = NULL; + } + } +} + +int perf_evsel__alloc_mmap(struct perf_evsel *evsel, int ncpus, int nthreads) +{ + evsel->mmap = xyarray__new(ncpus, nthreads, sizeof(struct perf_mmap)); + return evsel->mmap != NULL ? 
0 : -ENOMEM; +} + void perf_evsel__delete(struct perf_evsel *evsel) { assert(list_empty(&evsel->node)); xyarray__delete(evsel->fd); + xyarray__delete(evsel->mmap); free(evsel); } @@ -208,3 +234,48 @@ int perf_evsel__open_per_thread(struct perf_evsel *evsel, { return __perf_evsel__open(evsel, &empty_cpu_map.map, threads, group, inherit); } + +int perf_evsel__mmap(struct perf_evsel *evsel, struct cpu_map *cpus, + struct thread_map *threads, int pages, + struct perf_evlist *evlist) +{ + unsigned int page_size = sysconf(_SC_PAGE_SIZE); + int mask = pages * page_size - 1, cpu; + struct perf_mmap *mm; + int thread; + + if (evsel->mmap == NULL && + perf_evsel__alloc_mmap(evsel, cpus->nr, threads->nr) < 0) + return -ENOMEM; + + evsel->mmap_len = (pages + 1) * page_size; + + for (cpu = 0; cpu < cpus->nr; cpu++) { + for (thread = 0; thread < threads->nr; thread++) { + mm = xyarray__entry(evsel->mmap, cpu, thread); + mm->prev = 0; + mm->mask = mask; + mm->base = mmap(NULL, evsel->mmap_len, PROT_READ, + MAP_SHARED, FD(evsel, cpu, thread), 0); + if (mm->base == MAP_FAILED) + goto out_unmap; + + if (evlist != NULL) + perf_evlist__add_pollfd(evlist, FD(evsel, cpu, thread)); + } + } + + return 0; + +out_unmap: + do { + while (--thread >= 0) { + mm = xyarray__entry(evsel->mmap, cpu, thread); + munmap(mm->base, evsel->mmap_len); + mm->base = NULL; + } + thread = threads->nr; + } while (--cpu >= 0); + + return -1; +} diff --git a/tools/perf/util/evsel.h b/tools/perf/util/evsel.h index 1594696bd127..c8fbef299436 100644 --- a/tools/perf/util/evsel.h +++ b/tools/perf/util/evsel.h @@ -29,19 +29,23 @@ struct perf_evsel { struct perf_event_attr attr; char *filter; struct xyarray *fd; + struct xyarray *mmap; struct perf_counts *counts; + size_t mmap_len; int idx; void *priv; }; struct cpu_map; struct thread_map; +struct perf_evlist; struct perf_evsel *perf_evsel__new(struct perf_event_attr *attr, int idx); void perf_evsel__delete(struct perf_evsel *evsel); int perf_evsel__alloc_fd(struct perf_evsel *evsel, int ncpus, int nthreads); int perf_evsel__alloc_counts(struct perf_evsel *evsel, int ncpus); +int perf_evsel__alloc_mmap(struct perf_evsel *evsel, int ncpus, int nthreads); void perf_evsel__free_fd(struct perf_evsel *evsel); void perf_evsel__close_fd(struct perf_evsel *evsel, int ncpus, int nthreads); @@ -51,6 +55,10 @@ int perf_evsel__open_per_thread(struct perf_evsel *evsel, struct thread_map *threads, bool group, bool inherit); int perf_evsel__open(struct perf_evsel *evsel, struct cpu_map *cpus, struct thread_map *threads, bool group, bool inherit); +int perf_evsel__mmap(struct perf_evsel *evsel, struct cpu_map *cpus, + struct thread_map *threads, int pages, + struct perf_evlist *evlist); +void perf_evsel__munmap(struct perf_evsel *evsel, int ncpus, int nthreads); #define perf_evsel__match(evsel, t, c) \ (evsel->attr.type == PERF_TYPE_##t && \ From 744bd8aa3c8b43447f689a27872fa95e700b8a4f Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Wed, 12 Jan 2011 17:07:28 -0200 Subject: [PATCH 017/706] perf record: Use struct perf_mmap and helpers Paving the way to using perf_evsel->mmap, do this to reduce the patch noise in the next ones. 
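For reference, here is a minimal consumer loop built on the shared struct and helper; this is a sketch assuming a perf_mmap set up as in mmap_read(), with process_event() a hypothetical handler, and it omits the wrap-around copy that mmap_read() performs for events straddling the buffer end:

	static void consume(struct perf_mmap *md)
	{
		unsigned int head = perf_mmap__read_head(md);
		unsigned int old = md->prev;
		unsigned char *data = md->base + page_size;

		while (old != head) {
			event_t *event = (event_t *)&data[old & md->mask];

			process_event(event);	/* hypothetical consumer */
			old += event->header.size;
		}

		md->prev = old;
	}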
Cc: Frederic Weisbecker Cc: Ingo Molnar Cc: Mike Galbraith Cc: Paul Mackerras Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Tom Zanussi LKML-Reference: Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-record.c | 25 ++++--------------------- 1 file changed, 4 insertions(+), 21 deletions(-) diff --git a/tools/perf/builtin-record.c b/tools/perf/builtin-record.c index ec43f2eb7b72..d89e2f106a62 100644 --- a/tools/perf/builtin-record.c +++ b/tools/perf/builtin-record.c @@ -78,26 +78,9 @@ static off_t post_processing_offset; static struct perf_session *session; static const char *cpu_list; -struct mmap_data { - void *base; - unsigned int mask; - unsigned int prev; -}; +static struct perf_mmap mmap_array[MAX_NR_CPUS]; -static struct mmap_data mmap_array[MAX_NR_CPUS]; - -static unsigned long mmap_read_head(struct mmap_data *md) -{ - struct perf_event_mmap_page *pc = md->base; - long head; - - head = pc->data_head; - rmb(); - - return head; -} - -static void mmap_write_tail(struct mmap_data *md, unsigned long tail) +static void mmap_write_tail(struct perf_mmap *md, unsigned long tail) { struct perf_event_mmap_page *pc = md->base; @@ -136,9 +119,9 @@ static int process_synthesized_event(event_t *event, return 0; } -static void mmap_read(struct mmap_data *md) +static void mmap_read(struct perf_mmap *md) { - unsigned int head = mmap_read_head(md); + unsigned int head = perf_mmap__read_head(md); unsigned int old = md->prev; unsigned char *data = md->base + page_size; unsigned long size; From 115d2d8963a426670ac3ce983fc4c4e001703943 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Wed, 12 Jan 2011 17:11:53 -0200 Subject: [PATCH 018/706] perf record: Move perf_mmap__write_tail to perf.h Close to perf_mmap__read_head() and the perf_mmap struct definition. This is useful for any recorder, and we will need it in 'perf test'. Cc: Frederic Weisbecker Cc: Ingo Molnar Cc: Mike Galbraith Cc: Paul Mackerras Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Tom Zanussi LKML-Reference: Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-record.c | 13 +------------ tools/perf/perf.h | 12 ++++++++++++ 2 files changed, 13 insertions(+), 12 deletions(-) diff --git a/tools/perf/builtin-record.c b/tools/perf/builtin-record.c index d89e2f106a62..109f3b269ac5 100644 --- a/tools/perf/builtin-record.c +++ b/tools/perf/builtin-record.c @@ -80,17 +80,6 @@ static const char *cpu_list; static struct perf_mmap mmap_array[MAX_NR_CPUS]; -static void mmap_write_tail(struct perf_mmap *md, unsigned long tail) -{ - struct perf_event_mmap_page *pc = md->base; - - /* - * ensure all reads are done before we write the tail out. - */ - /* mb(); */ - pc->data_tail = tail; -} - static void advance_output(size_t size) { bytes_written += size; @@ -165,7 +154,7 @@ static void mmap_read(struct perf_mmap *md) write_output(buf, size); md->prev = old; - mmap_write_tail(md, old); + perf_mmap__write_tail(md, old); } static volatile int done = 0; diff --git a/tools/perf/perf.h b/tools/perf/perf.h index 5fb5e1f11d1c..a5fc660c1f12 100644 --- a/tools/perf/perf.h +++ b/tools/perf/perf.h @@ -108,6 +108,18 @@ static inline unsigned int perf_mmap__read_head(struct perf_mmap *mm) return head; } +static inline void perf_mmap__write_tail(struct perf_mmap *md, + unsigned long tail) +{ + struct perf_event_mmap_page *pc = md->base; + + /* + * ensure all reads are done before we write the tail out. 
+ */ + /* mb(); */ + pc->data_tail = tail; +} + /* * prctl(PR_TASK_PERF_EVENTS_DISABLE) will (cheaply) disable all * counters in the current task. From 70db7533caef02350ec8d6852e589491bca3a951 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Wed, 12 Jan 2011 22:39:13 -0200 Subject: [PATCH 019/706] perf evlist: Move the mmap array from perf_evsel Adopt the new model used in 'perf record': instead of a map per thread per cpu, we have one mmap per cpu, established on the first fd opened for that cpu; the kernel is then asked, via the PERF_EVENT_IOC_SET_OUTPUT ioctl, to redirect events for the other fds on that cpu to the one with the mmap. The methods moved from perf_evsel to perf_evlist, but to ease review they were modified in place in evsel.c; the next patch will move the migrated methods to evlist.c. With this, 'perf top' now uses the same mmap model as 'perf record', and the next patches will make 'perf record' use these new routines, establishing a common codebase for both tools. Cc: Frederic Weisbecker Cc: Ingo Molnar Cc: Mike Galbraith Cc: Paul Mackerras Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Tom Zanussi LKML-Reference: Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-top.c | 54 ++++++------- tools/perf/util/evlist.c | 27 +++++++ tools/perf/util/evlist.h | 9 +++ tools/perf/util/evsel.c | 158 +++++++++++++++++++++++++++++---------- tools/perf/util/evsel.h | 26 +++++-- 5 files changed, 199 insertions(+), 75 deletions(-) diff --git a/tools/perf/builtin-top.c b/tools/perf/builtin-top.c index 7d723ad0bfa9..df85c1f9417b 100644 --- a/tools/perf/builtin-top.c +++ b/tools/perf/builtin-top.c @@ -78,7 +78,7 @@ static struct cpu_map *cpus; static int realtime_prio = 0; static bool group = false; static unsigned int page_size; -static unsigned int mmap_pages = 16; +static unsigned int mmap_pages = 128; static int freq = 1000; /* 1 KHz */ static int delay_secs = 2; @@ -991,8 +991,7 @@ static int symbol_filter(struct map *map, struct symbol *sym) static void event__process_sample(const event_t *self, struct sample_data *sample, - struct perf_session *session, - struct perf_evsel *evsel) + struct perf_session *session) { u64 ip = self->ip.ip; struct sym_entry *syme; @@ -1085,8 +1084,12 @@ static void event__process_sample(const event_t *self, syme = symbol__priv(al.sym); if (!syme->skip) { - syme->count[evsel->idx]++; + struct perf_evsel *evsel; + syme->origin = origin; + evsel = perf_evlist__id2evsel(evsel_list, sample->id); + assert(evsel != NULL); + syme->count[evsel->idx]++; record_precise_ip(syme, evsel->idx, ip); pthread_mutex_lock(&active_symbols_lock); if (list_empty(&syme->node) || !syme->node.next) @@ -1095,11 +1098,9 @@ static void event__process_sample(const event_t *self, } } -static void perf_session__mmap_read_counter(struct perf_session *self, - struct perf_evsel *evsel, - int cpu, int thread_idx) +static void perf_session__mmap_read_cpu(struct perf_session *self, int cpu) { - struct perf_mmap *md = xyarray__entry(evsel->mmap, cpu, thread_idx); + struct perf_mmap *md = &evsel_list->mmap[cpu]; unsigned int head = perf_mmap__read_head(md); unsigned int old = md->prev; unsigned char *data = md->base + page_size; @@ -1153,7 +1154,7 @@ static void perf_session__mmap_read_counter(struct perf_session *self, event__parse_sample(event, self, &sample); if (event->header.type == PERF_RECORD_SAMPLE) - event__process_sample(event, &sample, self, evsel); + event__process_sample(event, &sample, self); else event__process(event, &sample, self); old += size; @@ 
-1164,19 +1165,10 @@ static void perf_session__mmap_read_counter(struct perf_session *self, static void perf_session__mmap_read(struct perf_session *self) { - struct perf_evsel *counter; - int i, thread_index; + int i; - for (i = 0; i < cpus->nr; i++) { - list_for_each_entry(counter, &evsel_list->entries, node) { - for (thread_index = 0; - thread_index < threads->nr; - thread_index++) { - perf_session__mmap_read_counter(self, - counter, i, thread_index); - } - } - } + for (i = 0; i < cpus->nr; i++) + perf_session__mmap_read_cpu(self, i); } static void start_counters(struct perf_evlist *evlist) @@ -1194,6 +1186,11 @@ static void start_counters(struct perf_evlist *evlist) attr->sample_freq = freq; } + if (evlist->nr_entries > 1) { + attr->sample_type |= PERF_SAMPLE_ID; + attr->read_format |= PERF_FORMAT_ID; + } + attr->mmap = 1; try_again: if (perf_evsel__open(counter, cpus, threads, group, inherit) < 0) { @@ -1225,15 +1222,16 @@ try_again: die("No CONFIG_PERF_EVENTS=y kernel support configured?\n"); exit(-1); } - - if (perf_evsel__mmap(counter, cpus, threads, mmap_pages, evlist) < 0) - die("failed to mmap with %d (%s)\n", errno, strerror(errno)); } + + if (perf_evlist__mmap(evlist, cpus, threads, mmap_pages, true) < 0) + die("failed to mmap with %d (%s)\n", errno, strerror(errno)); } static int __cmd_top(void) { pthread_t thread; + struct perf_evsel *first; int ret; /* * FIXME: perf_session__new should allow passing a O_MMAP, so that all this @@ -1249,6 +1247,8 @@ static int __cmd_top(void) event__synthesize_threads(event__process, session); start_counters(evsel_list); + first = list_entry(evsel_list->entries.next, struct perf_evsel, node); + perf_session__set_sample_type(session, first->attr.sample_type); /* Wait for a minimal set of events before starting the snapshot */ poll(evsel_list->pollfd, evsel_list->nr_fds, 100); @@ -1394,8 +1394,7 @@ int cmd_top(int argc, const char **argv, const char *prefix __used) usage_with_options(top_usage, options); list_for_each_entry(pos, &evsel_list->entries, node) { - if (perf_evsel__alloc_mmap(pos, cpus->nr, threads->nr) < 0 || - perf_evsel__alloc_fd(pos, cpus->nr, threads->nr) < 0) + if (perf_evsel__alloc_fd(pos, cpus->nr, threads->nr) < 0) goto out_free_fd; /* * Fill in the ones not specifically initialized via -c: @@ -1406,7 +1405,8 @@ int cmd_top(int argc, const char **argv, const char *prefix __used) pos->attr.sample_period = default_interval; } - if (perf_evlist__alloc_pollfd(evsel_list, cpus->nr, threads->nr) < 0) + if (perf_evlist__alloc_pollfd(evsel_list, cpus->nr, threads->nr) < 0 || + perf_evlist__alloc_mmap(evsel_list, cpus->nr) < 0) goto out_free_fd; sym_evsel = list_entry(evsel_list->entries.next, struct perf_evsel, node); diff --git a/tools/perf/util/evlist.c b/tools/perf/util/evlist.c index 6d4129214ee8..deb82a4fc312 100644 --- a/tools/perf/util/evlist.c +++ b/tools/perf/util/evlist.c @@ -3,11 +3,18 @@ #include "evsel.h" #include "util.h" +#include +#include + struct perf_evlist *perf_evlist__new(void) { struct perf_evlist *evlist = zalloc(sizeof(*evlist)); if (evlist != NULL) { + int i; + + for (i = 0; i < PERF_EVLIST__HLIST_SIZE; ++i) + INIT_HLIST_HEAD(&evlist->heads[i]); INIT_LIST_HEAD(&evlist->entries); } @@ -29,6 +36,7 @@ static void perf_evlist__purge(struct perf_evlist *evlist) void perf_evlist__delete(struct perf_evlist *evlist) { perf_evlist__purge(evlist); + free(evlist->mmap); free(evlist->pollfd); free(evlist); } @@ -68,3 +76,22 @@ void perf_evlist__add_pollfd(struct perf_evlist *evlist, int fd) 
evlist->pollfd[evlist->nr_fds].events = POLLIN; evlist->nr_fds++; } + +struct perf_evsel *perf_evlist__id2evsel(struct perf_evlist *evlist, u64 id) +{ + struct hlist_head *head; + struct hlist_node *pos; + struct perf_sample_id *sid; + int hash; + + if (evlist->nr_entries == 1) + return list_entry(evlist->entries.next, struct perf_evsel, node); + + hash = hash_64(id, PERF_EVLIST__HLIST_BITS); + head = &evlist->heads[hash]; + + hlist_for_each_entry(sid, pos, head, node) + if (sid->id == id) + return sid->evsel; + return NULL; +} diff --git a/tools/perf/util/evlist.h b/tools/perf/util/evlist.h index 16bbfcba8ca8..dbfcc79bb995 100644 --- a/tools/perf/util/evlist.h +++ b/tools/perf/util/evlist.h @@ -2,13 +2,20 @@ #define __PERF_EVLIST_H 1 #include +#include "../perf.h" struct pollfd; +#define PERF_EVLIST__HLIST_BITS 8 +#define PERF_EVLIST__HLIST_SIZE (1 << PERF_EVLIST__HLIST_BITS) + struct perf_evlist { struct list_head entries; + struct hlist_head heads[PERF_EVLIST__HLIST_SIZE]; int nr_entries; int nr_fds; + int mmap_len; + struct perf_mmap *mmap; struct pollfd *pollfd; }; @@ -23,4 +30,6 @@ int perf_evlist__add_default(struct perf_evlist *evlist); int perf_evlist__alloc_pollfd(struct perf_evlist *evlist, int ncpus, int nthreads); void perf_evlist__add_pollfd(struct perf_evlist *evlist, int fd); +struct perf_evsel *perf_evlist__id2evsel(struct perf_evlist *evlist, u64 id); + #endif /* __PERF_EVLIST_H */ diff --git a/tools/perf/util/evsel.c b/tools/perf/util/evsel.c index f5006958f8da..ee490356c817 100644 --- a/tools/perf/util/evsel.c +++ b/tools/perf/util/evsel.c @@ -8,7 +8,11 @@ #include #include +#include +#include + #define FD(e, x, y) (*(int *)xyarray__entry(e->fd, x, y)) +#define SID(e, x, y) xyarray__entry(e->id, x, y) struct perf_evsel *perf_evsel__new(struct perf_event_attr *attr, int idx) { @@ -29,6 +33,12 @@ int perf_evsel__alloc_fd(struct perf_evsel *evsel, int ncpus, int nthreads) return evsel->fd != NULL ? 0 : -ENOMEM; } +int perf_evsel__alloc_id(struct perf_evsel *evsel, int ncpus, int nthreads) +{ + evsel->id = xyarray__new(ncpus, nthreads, sizeof(struct perf_sample_id)); + return evsel->id != NULL ? 0 : -ENOMEM; +} + int perf_evsel__alloc_counts(struct perf_evsel *evsel, int ncpus) { evsel->counts = zalloc((sizeof(*evsel->counts) + @@ -42,6 +52,12 @@ void perf_evsel__free_fd(struct perf_evsel *evsel) evsel->fd = NULL; } +void perf_evsel__free_id(struct perf_evsel *evsel) +{ + xyarray__delete(evsel->id); + evsel->id = NULL; +} + void perf_evsel__close_fd(struct perf_evsel *evsel, int ncpus, int nthreads) { int cpu, thread; @@ -53,32 +69,29 @@ void perf_evsel__close_fd(struct perf_evsel *evsel, int ncpus, int nthreads) } } -void perf_evsel__munmap(struct perf_evsel *evsel, int ncpus, int nthreads) +void perf_evlist__munmap(struct perf_evlist *evlist, int ncpus) { - struct perf_mmap *mm; - int cpu, thread; + int cpu; - for (cpu = 0; cpu < ncpus; cpu++) - for (thread = 0; thread < nthreads; ++thread) { - mm = xyarray__entry(evsel->mmap, cpu, thread); - if (mm->base != NULL) { - munmap(mm->base, evsel->mmap_len); - mm->base = NULL; - } + for (cpu = 0; cpu < ncpus; cpu++) { + if (evlist->mmap[cpu].base != NULL) { + munmap(evlist->mmap[cpu].base, evlist->mmap_len); + evlist->mmap[cpu].base = NULL; } + } } -int perf_evsel__alloc_mmap(struct perf_evsel *evsel, int ncpus, int nthreads) +int perf_evlist__alloc_mmap(struct perf_evlist *evlist, int ncpus) { - evsel->mmap = xyarray__new(ncpus, nthreads, sizeof(struct perf_mmap)); - return evsel->mmap != NULL ? 
0 : -ENOMEM; + evlist->mmap = zalloc(ncpus * sizeof(struct perf_mmap)); + return evlist->mmap != NULL ? 0 : -ENOMEM; } void perf_evsel__delete(struct perf_evsel *evsel) { assert(list_empty(&evsel->node)); xyarray__delete(evsel->fd); - xyarray__delete(evsel->mmap); + xyarray__delete(evsel->id); free(evsel); } @@ -235,47 +248,110 @@ int perf_evsel__open_per_thread(struct perf_evsel *evsel, return __perf_evsel__open(evsel, &empty_cpu_map.map, threads, group, inherit); } -int perf_evsel__mmap(struct perf_evsel *evsel, struct cpu_map *cpus, - struct thread_map *threads, int pages, - struct perf_evlist *evlist) +static int __perf_evlist__mmap(struct perf_evlist *evlist, int cpu, int prot, + int mask, int fd) +{ + evlist->mmap[cpu].prev = 0; + evlist->mmap[cpu].mask = mask; + evlist->mmap[cpu].base = mmap(NULL, evlist->mmap_len, prot, + MAP_SHARED, fd, 0); + if (evlist->mmap[cpu].base == MAP_FAILED) + return -1; + + perf_evlist__add_pollfd(evlist, fd); + return 0; +} + +static int perf_evlist__id_hash(struct perf_evlist *evlist, struct perf_evsel *evsel, + int cpu, int thread, int fd) +{ + struct perf_sample_id *sid; + u64 read_data[4] = { 0, }; + int hash, id_idx = 1; /* The first entry is the counter value */ + + if (!(evsel->attr.read_format & PERF_FORMAT_ID) || + read(fd, &read_data, sizeof(read_data)) == -1) + return -1; + + if (evsel->attr.read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) + ++id_idx; + if (evsel->attr.read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) + ++id_idx; + + sid = SID(evsel, cpu, thread); + sid->id = read_data[id_idx]; + sid->evsel = evsel; + hash = hash_64(sid->id, PERF_EVLIST__HLIST_BITS); + hlist_add_head(&sid->node, &evlist->heads[hash]); + return 0; +} + +/** perf_evlist__mmap - Create per cpu maps to receive events + * + * @evlist - list of events + * @cpus - cpu map being monitored + * @threads - threads map being monitored + * @pages - map length in pages + * @overwrite - overwrite older events? + * + * If overwrite is false the user needs to signal event consumption using: + * + * struct perf_mmap *m = &evlist->mmap[cpu]; + * unsigned int head = perf_mmap__read_head(m); + * + * perf_mmap__write_tail(m, head) + */ +int perf_evlist__mmap(struct perf_evlist *evlist, struct cpu_map *cpus, + struct thread_map *threads, int pages, bool overwrite) { unsigned int page_size = sysconf(_SC_PAGE_SIZE); int mask = pages * page_size - 1, cpu; - struct perf_mmap *mm; - int thread; + struct perf_evsel *first_evsel, *evsel; + int thread, prot = PROT_READ | (overwrite ?
0 : PROT_WRITE); - if (evsel->mmap == NULL && - perf_evsel__alloc_mmap(evsel, cpus->nr, threads->nr) < 0) + if (evlist->mmap == NULL && + perf_evlist__alloc_mmap(evlist, cpus->nr) < 0) return -ENOMEM; - evsel->mmap_len = (pages + 1) * page_size; + if (evlist->pollfd == NULL && + perf_evlist__alloc_pollfd(evlist, cpus->nr, threads->nr) < 0) + return -ENOMEM; - for (cpu = 0; cpu < cpus->nr; cpu++) { - for (thread = 0; thread < threads->nr; thread++) { - mm = xyarray__entry(evsel->mmap, cpu, thread); - mm->prev = 0; - mm->mask = mask; - mm->base = mmap(NULL, evsel->mmap_len, PROT_READ, - MAP_SHARED, FD(evsel, cpu, thread), 0); - if (mm->base == MAP_FAILED) - goto out_unmap; + evlist->mmap_len = (pages + 1) * page_size; + first_evsel = list_entry(evlist->entries.next, struct perf_evsel, node); - if (evlist != NULL) - perf_evlist__add_pollfd(evlist, FD(evsel, cpu, thread)); + list_for_each_entry(evsel, &evlist->entries, node) { + if ((evsel->attr.read_format & PERF_FORMAT_ID) && + evsel->id == NULL && + perf_evsel__alloc_id(evsel, cpus->nr, threads->nr) < 0) + return -ENOMEM; + + for (cpu = 0; cpu < cpus->nr; cpu++) { + for (thread = 0; thread < threads->nr; thread++) { + int fd = FD(evsel, cpu, thread); + + if (evsel->idx || thread) { + if (ioctl(fd, PERF_EVENT_IOC_SET_OUTPUT, + FD(first_evsel, cpu, 0)) != 0) + goto out_unmap; + } else if (__perf_evlist__mmap(evlist, cpu, prot, mask, fd) < 0) + goto out_unmap; + + if ((evsel->attr.read_format & PERF_FORMAT_ID) && + perf_evlist__id_hash(evlist, evsel, cpu, thread, fd) < 0) + goto out_unmap; + } } } return 0; out_unmap: - do { - while (--thread >= 0) { - mm = xyarray__entry(evsel->mmap, cpu, thread); - munmap(mm->base, evsel->mmap_len); - mm->base = NULL; + for (cpu = 0; cpu < cpus->nr; cpu++) { + if (evlist->mmap[cpu].base != NULL) { + munmap(evlist->mmap[cpu].base, evlist->mmap_len); + evlist->mmap[cpu].base = NULL; } - thread = threads->nr; - } while (--cpu >= 0); - + } return -1; } diff --git a/tools/perf/util/evsel.h b/tools/perf/util/evsel.h index c8fbef299436..667ee4e2e35e 100644 --- a/tools/perf/util/evsel.h +++ b/tools/perf/util/evsel.h @@ -24,14 +24,25 @@ struct perf_counts { struct perf_counts_values cpu[]; }; +struct perf_evsel; + +/* + * Per fd, to map back from PERF_SAMPLE_ID to evsel, only used when there are + * more than one entry in the evlist. 
+ */ +struct perf_sample_id { + struct hlist_node node; + u64 id; + struct perf_evsel *evsel; +}; + struct perf_evsel { struct list_head node; struct perf_event_attr attr; char *filter; struct xyarray *fd; - struct xyarray *mmap; + struct xyarray *id; struct perf_counts *counts; - size_t mmap_len; int idx; void *priv; }; @@ -44,9 +55,11 @@ struct perf_evsel *perf_evsel__new(struct perf_event_attr *attr, int idx); void perf_evsel__delete(struct perf_evsel *evsel); int perf_evsel__alloc_fd(struct perf_evsel *evsel, int ncpus, int nthreads); +int perf_evsel__alloc_id(struct perf_evsel *evsel, int ncpus, int nthreads); int perf_evsel__alloc_counts(struct perf_evsel *evsel, int ncpus); -int perf_evsel__alloc_mmap(struct perf_evsel *evsel, int ncpus, int nthreads); +int perf_evlist__alloc_mmap(struct perf_evlist *evlist, int ncpus); void perf_evsel__free_fd(struct perf_evsel *evsel); +void perf_evsel__free_id(struct perf_evsel *evsel); void perf_evsel__close_fd(struct perf_evsel *evsel, int ncpus, int nthreads); int perf_evsel__open_per_cpu(struct perf_evsel *evsel, @@ -55,10 +68,9 @@ int perf_evsel__open_per_thread(struct perf_evsel *evsel, struct thread_map *threads, bool group, bool inherit); int perf_evsel__open(struct perf_evsel *evsel, struct cpu_map *cpus, struct thread_map *threads, bool group, bool inherit); -int perf_evsel__mmap(struct perf_evsel *evsel, struct cpu_map *cpus, - struct thread_map *threads, int pages, - struct perf_evlist *evlist); -void perf_evsel__munmap(struct perf_evsel *evsel, int ncpus, int nthreads); +int perf_evlist__mmap(struct perf_evlist *evlist, struct cpu_map *cpus, + struct thread_map *threads, int pages, bool overwrite); +void perf_evlist__munmap(struct perf_evlist *evlist, int ncpus); #define perf_evsel__match(evsel, t, c) \ (evsel->attr.type == PERF_TYPE_##t && \ From 0a27d7f9f417c0305f7efa70631764a53c7af219 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Fri, 14 Jan 2011 15:50:51 -0200 Subject: [PATCH 020/706] perf record: Use perf_evlist__mmap There is more stuff that can go to the perf_ev{sel,list} layer, like detecting if sample_id_all is available, etc, but let's try using this in 'perf test' first.
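For readers new to the output-redirection model these patches adopt, here is a minimal self-contained sketch of the idea, not the patch code itself: two counters are opened for the same task, only the first is mmap'ed, and the second is redirected into the first's ring buffer with the PERF_EVENT_IOC_SET_OUTPUT ioctl (the kernel only accepts the redirection when both events share the same cpu or task context, which is why the patches key the mmap per cpu). The event choices and buffer size below are illustrative assumptions.

#include <linux/perf_event.h>
#include <sys/ioctl.h>
#include <sys/mman.h>
#include <sys/syscall.h>
#include <unistd.h>

/* Raw wrapper: perf_event_open has no glibc stub. */
static int perf_event_open(struct perf_event_attr *attr, pid_t pid,
			   int cpu, int group_fd, unsigned long flags)
{
	return syscall(__NR_perf_event_open, attr, pid, cpu, group_fd, flags);
}

int main(void)
{
	struct perf_event_attr attr = {
		.type		= PERF_TYPE_SOFTWARE,
		.config		= PERF_COUNT_SW_TASK_CLOCK,
		.sample_period	= 100000,
		.sample_type	= PERF_SAMPLE_ID, /* lets a reader demux events */
		.read_format	= PERF_FORMAT_ID,
	};
	long page_size = sysconf(_SC_PAGE_SIZE);
	int pages = 128, fd1, fd2;
	void *ring;

	fd1 = perf_event_open(&attr, 0, -1, -1, 0); /* owns the ring buffer */
	attr.config = PERF_COUNT_SW_CPU_CLOCK;
	fd2 = perf_event_open(&attr, 0, -1, -1, 0); /* will share it */
	if (fd1 < 0 || fd2 < 0)
		return 1;

	/* One buffer for both events instead of one per event. */
	ring = mmap(NULL, (pages + 1) * page_size, PROT_READ | PROT_WRITE,
		    MAP_SHARED, fd1, 0);
	if (ring == MAP_FAILED ||
	    ioctl(fd2, PERF_EVENT_IOC_SET_OUTPUT, fd1) != 0)
		return 1;

	/* ... poll() on fd1 and consume samples from 'ring' ... */
	return 0;
}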
Cc: Frederic Weisbecker Cc: Ingo Molnar Cc: Mike Galbraith Cc: Paul Mackerras Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Tom Zanussi LKML-Reference: Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-record.c | 59 +++++++------------------------------ 1 file changed, 11 insertions(+), 48 deletions(-) diff --git a/tools/perf/builtin-record.c b/tools/perf/builtin-record.c index 109f3b269ac5..45a3689f9ed6 100644 --- a/tools/perf/builtin-record.c +++ b/tools/perf/builtin-record.c @@ -30,6 +30,7 @@ #include #define FD(e, x, y) (*(int *)xyarray__entry(e->fd, x, y)) +#define SID(e, x, y) xyarray__entry(e->id, x, y) enum write_mode_t { WRITE_FORCE, @@ -78,8 +79,6 @@ static off_t post_processing_offset; static struct perf_session *session; static const char *cpu_list; -static struct perf_mmap mmap_array[MAX_NR_CPUS]; - static void advance_output(size_t size) { bytes_written += size; @@ -196,20 +195,14 @@ static struct perf_header_attr *get_header_attr(struct perf_event_attr *a, int n return h_attr; } -static void create_counter(struct perf_evlist *evlist, - struct perf_evsel *evsel, int cpu) +static void create_counter(struct perf_evsel *evsel, int cpu) { char *filter = evsel->filter; struct perf_event_attr *attr = &evsel->attr; struct perf_header_attr *h_attr; + struct perf_sample_id *sid; int thread_index; int ret; - struct { - u64 count; - u64 time_enabled; - u64 time_running; - u64 id; - } read_data; for (thread_index = 0; thread_index < threads->nr; thread_index++) { h_attr = get_header_attr(attr, evsel->idx); @@ -223,45 +216,12 @@ static void create_counter(struct perf_evlist *evlist, } } - if (read(FD(evsel, cpu, thread_index), &read_data, sizeof(read_data)) == -1) { - perror("Unable to read perf file descriptor"); - exit(-1); - } - - if (perf_header_attr__add_id(h_attr, read_data.id) < 0) { + sid = SID(evsel, cpu, thread_index); + if (perf_header_attr__add_id(h_attr, sid->id) < 0) { pr_warning("Not enough memory to add id\n"); exit(-1); } - assert(FD(evsel, cpu, thread_index) >= 0); - fcntl(FD(evsel, cpu, thread_index), F_SETFL, O_NONBLOCK); - - if (evsel->idx || thread_index) { - struct perf_evsel *first; - first = list_entry(evlist->entries.next, struct perf_evsel, node); - ret = ioctl(FD(evsel, cpu, thread_index), - PERF_EVENT_IOC_SET_OUTPUT, - FD(first, cpu, 0)); - if (ret) { - error("failed to set output: %d (%s)\n", errno, - strerror(errno)); - exit(-1); - } - } else { - mmap_array[cpu].prev = 0; - mmap_array[cpu].mask = mmap_pages*page_size - 1; - mmap_array[cpu].base = mmap(NULL, (mmap_pages+1)*page_size, - PROT_READ | PROT_WRITE, MAP_SHARED, FD(evsel, cpu, thread_index), 0); - if (mmap_array[cpu].base == MAP_FAILED) { - error("failed to mmap with %d (%s)\n", errno, strerror(errno)); - exit(-1); - } - - evlist->pollfd[evlist->nr_fds].fd = FD(evsel, cpu, thread_index); - evlist->pollfd[evlist->nr_fds].events = POLLIN; - evlist->nr_fds++; - } - if (filter != NULL) { ret = ioctl(FD(evsel, cpu, thread_index), PERF_EVENT_IOC_SET_FILTER, filter); @@ -423,9 +383,12 @@ try_again: } } + if (perf_evlist__mmap(evlist, cpus, threads, mmap_pages, false) < 0) + die("failed to mmap with %d (%s)\n", errno, strerror(errno)); + for (cpu = 0; cpu < cpus->nr; ++cpu) { list_for_each_entry(pos, &evlist->entries, node) - create_counter(evlist, pos, cpu); + create_counter(pos, cpu); } } @@ -502,8 +465,8 @@ static void mmap_read_all(void) int i; for (i = 0; i < cpus->nr; i++) { - if (mmap_array[i].base) - mmap_read(&mmap_array[i]); + if (evsel_list->mmap[i].base) + 
mmap_read(&evsel_list->mmap[i]); } if (perf_header__has_feat(&session->header, HEADER_TRACE_INFO)) From 915fce20ecf8f7ff4189d0fff42b62aebf6a57cc Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Fri, 14 Jan 2011 16:19:12 -0200 Subject: [PATCH 021/706] perf tools: Add missing cpu_map__delete() Cc: Frederic Weisbecker Cc: Ingo Molnar Cc: Mike Galbraith Cc: Paul Mackerras Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Tom Zanussi LKML-Reference: Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/cpumap.c | 5 +++++ tools/perf/util/cpumap.h | 2 +- 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/tools/perf/util/cpumap.c b/tools/perf/util/cpumap.c index 3ccaa1043383..6893eec693ab 100644 --- a/tools/perf/util/cpumap.c +++ b/tools/perf/util/cpumap.c @@ -177,3 +177,8 @@ struct cpu_map *cpu_map__dummy_new(void) return cpus; } + +void cpu_map__delete(struct cpu_map *map) +{ + free(map); +} diff --git a/tools/perf/util/cpumap.h b/tools/perf/util/cpumap.h index f7a4f42f6307..072c0a374794 100644 --- a/tools/perf/util/cpumap.h +++ b/tools/perf/util/cpumap.h @@ -8,6 +8,6 @@ struct cpu_map { struct cpu_map *cpu_map__new(const char *cpu_list); struct cpu_map *cpu_map__dummy_new(void); -void *cpu_map__delete(struct cpu_map *map); +void cpu_map__delete(struct cpu_map *map); #endif /* __PERF_CPUMAP_H */ From d2af9687c96f3864178de1860e6d83873aeef224 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Fri, 14 Jan 2011 16:24:49 -0200 Subject: [PATCH 022/706] perf test: Check counts on all cpus in test__open_syscall_event_on_all_cpus We were bailing out after the first count mismatch; now we check them all to see if only some CPUs are not getting the expected number of events. Cc: Frederic Weisbecker Cc: Ingo Molnar Cc: Mike Galbraith Cc: Paul Mackerras Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Tom Zanussi LKML-Reference: Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-test.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/tools/perf/builtin-test.c b/tools/perf/builtin-test.c index 7287158c4830..7cc6b2086947 100644 --- a/tools/perf/builtin-test.c +++ b/tools/perf/builtin-test.c @@ -408,6 +408,8 @@ static int test__open_syscall_event_on_all_cpus(void) goto out_close_fd; } + err = 0; + for (cpu = 0; cpu < cpus->nr; ++cpu) { unsigned int expected; @@ -416,18 +418,18 @@ static int test__open_syscall_event_on_all_cpus(void) if (perf_evsel__read_on_cpu(evsel, cpu, 0) < 0) { pr_debug("perf_evsel__open_read_on_cpu\n"); - goto out_close_fd; + err = -1; + break; } expected = nr_open_calls + cpu; if (evsel->counts->cpu[cpu].val != expected) { pr_debug("perf_evsel__read_on_cpu: expected to intercept %d calls on cpu %d, got %" PRIu64 "\n", expected, cpus->map[cpu], evsel->counts->cpu[cpu].val); - goto out_close_fd; + err = -1; } } - err = 0; out_close_fd: perf_evsel__close_fd(evsel, 1, threads->nr); out_evsel_delete: From 98d77b78504a423fca911a26a17bee00ef2fdda2 Mon Sep 17 00:00:00 2001 From: Han Pingtian Date: Sat, 15 Jan 2011 07:00:50 +0800 Subject: [PATCH 023/706] perf test: check if cpu_map__new() returns NULL We should check whether cpus is NULL after cpus = cpu_map__new(NULL); in test__open_syscall_event_on_all_cpus().
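The class of bug fixed here, testing the wrong pointer after a constructor, is easy to introduce when copy-pasting error paths. Below is a condensed, self-contained illustration of the intended unwind idiom; the two constructors are simplified stand-ins for thread_map__new()/cpu_map__new(), not the real tools/perf code.

#include <stdio.h>
#include <stdlib.h>

struct thread_map { int nr; };
struct cpu_map { int nr; };

static struct thread_map *thread_map__new(void)
{
	return calloc(1, sizeof(struct thread_map));
}

static struct cpu_map *cpu_map__new(void)
{
	return calloc(1, sizeof(struct cpu_map));
}

static int run_test(void)
{
	struct thread_map *threads;
	struct cpu_map *cpus;
	int err = -1;

	threads = thread_map__new();
	if (threads == NULL)	/* nothing allocated yet: plain return */
		return -1;

	cpus = cpu_map__new();
	if (cpus == NULL) {	/* the buggy version re-tested 'threads' here */
		fprintf(stderr, "cpu_map__new failed\n");
		goto out_thread_map_delete;
	}

	err = 0;		/* the actual test body would run here */

	free(cpus);
out_thread_map_delete:
	free(threads);		/* each label releases what was set up before it */
	return err;
}

int main(void)
{
	return run_test() ? 1 : 0;
}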
LKML-Reference: <20110114230050.GA7011@localhost> Signed-off-by: Han Pingtian Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-test.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tools/perf/builtin-test.c b/tools/perf/builtin-test.c index 7cc6b2086947..5a50e4755e6c 100644 --- a/tools/perf/builtin-test.c +++ b/tools/perf/builtin-test.c @@ -347,9 +347,9 @@ static int test__open_syscall_event_on_all_cpus(void) } cpus = cpu_map__new(NULL); - if (threads == NULL) { - pr_debug("thread_map__new\n"); - return -1; + if (cpus == NULL) { + pr_debug("cpu_map__new\n"); + goto out_thread_map_delete; } From 04391debc3e1195222a4dbb162ace6542dd89c1c Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Sat, 15 Jan 2011 10:40:59 -0200 Subject: [PATCH 024/706] perf evlist: Steal mmap reading routine from 'perf top' Will be used in the upcoming 'perf test' entry for the evlist mmap routines. Cc: Frederic Weisbecker Cc: Ingo Molnar Cc: Mike Galbraith Cc: Paul Mackerras Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Tom Zanussi LKML-Reference: Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-top.c | 56 ++---------------------------------- tools/perf/util/evlist.c | 62 ++++++++++++++++++++++++++++++++++++++++ tools/perf/util/evlist.h | 4 +++ 3 files changed, 69 insertions(+), 53 deletions(-) diff --git a/tools/perf/builtin-top.c b/tools/perf/builtin-top.c index df85c1f9417b..58352ad807c7 100644 --- a/tools/perf/builtin-top.c +++ b/tools/perf/builtin-top.c @@ -1100,67 +1100,17 @@ static void event__process_sample(const event_t *self, static void perf_session__mmap_read_cpu(struct perf_session *self, int cpu) { - struct perf_mmap *md = &evsel_list->mmap[cpu]; - unsigned int head = perf_mmap__read_head(md); - unsigned int old = md->prev; - unsigned char *data = md->base + page_size; struct sample_data sample; - int diff; - - /* - * If we're further behind than half the buffer, there's a chance - * the writer will bite our tail and mess up the samples under us. - * - * If we somehow ended up ahead of the head, we got messed up. - * - * In either case, truncate and restart at head. - */ - diff = head - old; - if (diff > md->mask / 2 || diff < 0) { - fprintf(stderr, "WARNING: failed to keep up with mmap data.\n"); - - /* - * head points to a known good entry, start there. - */ - old = head; - } - - for (; old != head;) { - event_t *event = (event_t *)&data[old & md->mask]; - - event_t event_copy; - - size_t size = event->header.size; - - /* - * Event straddles the mmap boundary -- header should always - * be inside due to u64 alignment of output. 
- */ - if ((old & md->mask) + size != ((old + size) & md->mask)) { - unsigned int offset = old; - unsigned int len = min(sizeof(*event), size), cpy; - void *dst = &event_copy; - - do { - cpy = min(md->mask + 1 - (offset & md->mask), len); - memcpy(dst, &data[offset & md->mask], cpy); - offset += cpy; - dst += cpy; - len -= cpy; - } while (len); - - event = &event_copy; - } + event_t *event; + while ((event = perf_evlist__read_on_cpu(evsel_list, cpu)) != NULL) { event__parse_sample(event, self, &sample); + if (event->header.type == PERF_RECORD_SAMPLE) event__process_sample(event, &sample, self); else event__process(event, &sample, self); - old += size; } - - md->prev = old; } static void perf_session__mmap_read(struct perf_session *self) diff --git a/tools/perf/util/evlist.c b/tools/perf/util/evlist.c index deb82a4fc312..4b3b84cd71a1 100644 --- a/tools/perf/util/evlist.c +++ b/tools/perf/util/evlist.c @@ -95,3 +95,65 @@ struct perf_evsel *perf_evlist__id2evsel(struct perf_evlist *evlist, u64 id) return sid->evsel; return NULL; } + +event_t *perf_evlist__read_on_cpu(struct perf_evlist *evlist, int cpu) +{ + /* XXX Move this to perf.c, making it generally available */ + unsigned int page_size = sysconf(_SC_PAGE_SIZE); + struct perf_mmap *md = &evlist->mmap[cpu]; + unsigned int head = perf_mmap__read_head(md); + unsigned int old = md->prev; + unsigned char *data = md->base + page_size; + event_t *event = NULL; + int diff; + + /* + * If we're further behind than half the buffer, there's a chance + * the writer will bite our tail and mess up the samples under us. + * + * If we somehow ended up ahead of the head, we got messed up. + * + * In either case, truncate and restart at head. + */ + diff = head - old; + if (diff > md->mask / 2 || diff < 0) { + fprintf(stderr, "WARNING: failed to keep up with mmap data.\n"); + + /* + * head points to a known good entry, start there. + */ + old = head; + } + + if (old != head) { + size_t size; + + event = (event_t *)&data[old & md->mask]; + size = event->header.size; + + /* + * Event straddles the mmap boundary -- header should always + * be inside due to u64 alignment of output. 
+ */ + if ((old & md->mask) + size != ((old + size) & md->mask)) { + unsigned int offset = old; + unsigned int len = min(sizeof(*event), size), cpy; + void *dst = &evlist->event_copy; + + do { + cpy = min(md->mask + 1 - (offset & md->mask), len); + memcpy(dst, &data[offset & md->mask], cpy); + offset += cpy; + dst += cpy; + len -= cpy; + } while (len); + + event = &evlist->event_copy; + } + + old += size; + } + + md->prev = old; + return event; +} diff --git a/tools/perf/util/evlist.h b/tools/perf/util/evlist.h index dbfcc79bb995..28712063db97 100644 --- a/tools/perf/util/evlist.h +++ b/tools/perf/util/evlist.h @@ -3,6 +3,7 @@ #include #include "../perf.h" +#include "event.h" struct pollfd; @@ -15,6 +16,7 @@ struct perf_evlist { int nr_entries; int nr_fds; int mmap_len; + event_t event_copy; struct perf_mmap *mmap; struct pollfd *pollfd; }; @@ -32,4 +34,6 @@ void perf_evlist__add_pollfd(struct perf_evlist *evlist, int fd); struct perf_evsel *perf_evlist__id2evsel(struct perf_evlist *evlist, u64 id); +event_t *perf_evlist__read_on_cpu(struct perf_evlist *self, int cpu); + #endif /* __PERF_EVLIST_H */ From de5fa3a8a05cd60f59622e88cfeb90416760d78e Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Sat, 15 Jan 2011 10:42:46 -0200 Subject: [PATCH 025/706] perf test: Add test for the evlist mmap routines This test will generate random numbers of calls to some getpid syscalls, then establish an mmap for a group of events that are created to monitor these syscalls. It will receive the events, using mmap, use its PERF_SAMPLE_ID generated sample.id field to map back to its respective perf_evsel instance. Then it checks if the number of syscalls reported as perf events by the kernel corresponds to the number of syscalls made. Cc: Frederic Weisbecker Cc: Ingo Molnar Cc: Mike Galbraith Cc: Paul Mackerras Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Tom Zanussi LKML-Reference: Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-test.c | 171 +++++++++++++++++++++++++++++++++++++- 1 file changed, 169 insertions(+), 2 deletions(-) diff --git a/tools/perf/builtin-test.c b/tools/perf/builtin-test.c index 5a50e4755e6c..4fd34537c01d 100644 --- a/tools/perf/builtin-test.c +++ b/tools/perf/builtin-test.c @@ -7,7 +7,9 @@ #include "util/cache.h" #include "util/debug.h" +#include "util/evlist.h" #include "util/parse-options.h" +#include "util/parse-events.h" #include "util/session.h" #include "util/symbol.h" #include "util/thread.h" @@ -238,14 +240,14 @@ out: #include "util/evsel.h" #include -static int trace_event__id(const char *event_name) +static int trace_event__id(const char *evname) { char *filename; int err = -1, fd; if (asprintf(&filename, "/sys/kernel/debug/tracing/events/syscalls/%s/id", - event_name) < 0) + evname) < 0) return -1; fd = open(filename, O_RDONLY); @@ -439,6 +441,167 @@ out_thread_map_delete: return err; } +/* + * This test will generate random numbers of calls to some getpid syscalls, + * then establish an mmap for a group of events that are created to monitor + * the syscalls. + * + * It will receive the events, using mmap, use its PERF_SAMPLE_ID generated + * sample.id field to map back to its respective perf_evsel instance. + * + * Then it checks if the number of syscalls reported as perf events by + * the kernel corresponds to the number of syscalls made. 
+ */ +static int test__basic_mmap(void) +{ + int err = -1; + event_t *event; + struct thread_map *threads; + struct perf_session session; + struct cpu_map *cpus; + struct perf_evlist *evlist; + struct perf_event_attr attr = { + .type = PERF_TYPE_TRACEPOINT, + .read_format = PERF_FORMAT_ID, + .sample_type = PERF_SAMPLE_ID, + .watermark = 0, + }; + cpu_set_t cpu_set; + const char *syscall_names[] = { "getsid", "getppid", "getpgrp", + "getpgid", }; + pid_t (*syscalls[])(void) = { (void *)getsid, getppid, getpgrp, + (void*)getpgid }; +#define nsyscalls ARRAY_SIZE(syscall_names) + int ids[nsyscalls]; + unsigned int nr_events[nsyscalls], + expected_nr_events[nsyscalls], i, j; + struct perf_evsel *evsels[nsyscalls], *evsel; + + for (i = 0; i < nsyscalls; ++i) { + char name[64]; + + snprintf(name, sizeof(name), "sys_enter_%s", syscall_names[i]); + ids[i] = trace_event__id(name); + if (ids[i] < 0) { + pr_debug("Is debugfs mounted on /sys/kernel/debug?\n"); + return -1; + } + nr_events[i] = 0; + expected_nr_events[i] = random() % 257; + } + + threads = thread_map__new(-1, getpid()); + if (threads == NULL) { + pr_debug("thread_map__new\n"); + return -1; + } + + cpus = cpu_map__new(NULL); + if (cpus == NULL) { + pr_debug("cpu_map__new\n"); + goto out_free_threads; + } + + CPU_ZERO(&cpu_set); + CPU_SET(cpus->map[0], &cpu_set); + if (sched_setaffinity(0, sizeof(cpu_set), &cpu_set) < 0) { + pr_debug("sched_setaffinity() failed on CPU %d: %s ", + cpus->map[0], strerror(errno)); + goto out_free_cpus; + } + + evlist = perf_evlist__new(); + if (evlist == NULL) { + pr_debug("perf_evlist__new\n"); + goto out_free_cpus; + } + + /* anonymous union fields, can't be initialized above */ + attr.wakeup_events = 1; + attr.sample_period = 1; + + /* + * FIXME: use evsel->attr.sample_type in event__parse_sample. + * This will nicely remove the requirement that we have + * all the events with the same sample_type.
+ */ + session.sample_type = attr.sample_type; + + for (i = 0; i < nsyscalls; ++i) { + attr.config = ids[i]; + evsels[i] = perf_evsel__new(&attr, i); + if (evsels[i] == NULL) { + pr_debug("perf_evsel__new\n"); + goto out_free_evlist; + } + + perf_evlist__add(evlist, evsels[i]); + + if (perf_evsel__open(evsels[i], cpus, threads, false, false) < 0) { + pr_debug("failed to open counter: %s, " + "tweak /proc/sys/kernel/perf_event_paranoid?\n", + strerror(errno)); + goto out_close_fd; + } + } + + if (perf_evlist__mmap(evlist, cpus, threads, 128, true) < 0) { + pr_debug("failed to mmap events: %d (%s)\n", errno, + strerror(errno)); + goto out_close_fd; + } + + for (i = 0; i < nsyscalls; ++i) + for (j = 0; j < expected_nr_events[i]; ++j) { + int foo = syscalls[i](); + ++foo; + } + + while ((event = perf_evlist__read_on_cpu(evlist, 0)) != NULL) { + struct sample_data sample; + + if (event->header.type != PERF_RECORD_SAMPLE) { + pr_debug("unexpected %s event\n", + event__get_event_name(event->header.type)); + goto out_munmap; + } + + event__parse_sample(event, &session, &sample); + evsel = perf_evlist__id2evsel(evlist, sample.id); + if (evsel == NULL) { + pr_debug("event with id %" PRIu64 + " doesn't map to an evsel\n", sample.id); + goto out_munmap; + } + nr_events[evsel->idx]++; + } + + list_for_each_entry(evsel, &evlist->entries, node) { + if (nr_events[evsel->idx] != expected_nr_events[evsel->idx]) { + pr_debug("expected %d %s events, got %d\n", + expected_nr_events[evsel->idx], + event_name(evsel), nr_events[evsel->idx]); + goto out_munmap; + } + } + + err = 0; +out_munmap: + perf_evlist__munmap(evlist, 1); +out_close_fd: + for (i = 0; i < nsyscalls; ++i) + perf_evsel__close_fd(evsels[i], 1, threads->nr); +out_free_evlist: + perf_evlist__delete(evlist); +out_free_cpus: + cpu_map__delete(cpus); +out_free_threads: + thread_map__delete(threads); + return err; +#undef nsyscalls +} + static struct test { const char *desc; int (*func)(void); @@ -455,6 +618,10 @@ static struct test { .desc = "detect open syscall event on all cpus", .func = test__open_syscall_event_on_all_cpus, }, + { + .desc = "read samples using the mmap interface", + .func = test__basic_mmap, + }, { .func = NULL, }, From 1b3a0e9592ebf174af934b3908a2bf6a6fa86169 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Fri, 14 Jan 2011 04:51:58 +0100 Subject: [PATCH 026/706] perf callchain: Feed callchains into a cursor The callchains are fed with an array of a fixed size. As a result we iterate over each callchain three times: - 1st to resolve symbols - 2nd to filter out context boundaries - 3rd for the insertion into the tree This also involves a pair of memory allocations/deallocations every time we insert a callchain, for the filtered-out array of addresses and for the array of symbols that comes along. Instead, feed the callchains through a linked list with persistent allocations. It brings several benefits: - Merges the 1st and 2nd iterations into one. That was possible before but in a way that would involve allocating an array slightly taller than necessary because we don't know in advance the number of context boundaries to filter out. - Far fewer allocations/deallocations. The linked list keeps persistent empty entries for later use and is extendable at will. - Makes it easier for multiple sources of callchains to feed a stacktrace together.
This is deemed to pave the way for cfi based callchains wherein traditional frame pointer based kernel stacktraces will precede cfi based user ones, producing an overall callchain whose size is hardly predictable. This requirement makes the static array obsolete and makes a linked list based iterator a much more flexible fit. Basic testing on a big perf file containing callchains (~ 176 MB) has shown a throughput gain of about 11% with perf report. Cc: Ingo Molnar Cc: Paul Mackerras Cc: Peter Zijlstra LKML-Reference: <1294977121-5700-2-git-send-email-fweisbec@gmail.com> Signed-off-by: Frederic Weisbecker Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-report.c | 25 ++--- tools/perf/util/callchain.c | 214 ++++++++++++++++++------------------ tools/perf/util/callchain.h | 66 ++++++++++- tools/perf/util/hist.c | 13 ++- tools/perf/util/hist.h | 2 + tools/perf/util/session.c | 22 ++-- tools/perf/util/session.h | 11 +- 7 files changed, 209 insertions(+), 144 deletions(-) diff --git a/tools/perf/builtin-report.c b/tools/perf/builtin-report.c index c27e31f289e6..c95599a82f9e 100644 --- a/tools/perf/builtin-report.c +++ b/tools/perf/builtin-report.c @@ -81,18 +81,17 @@ static int perf_session__add_hist_entry(struct perf_session *self, struct addr_location *al, struct sample_data *data) { - struct map_symbol *syms = NULL; struct symbol *parent = NULL; - int err = -ENOMEM; + int err = 0; struct hist_entry *he; struct hists *hists; struct perf_event_attr *attr; if ((sort__has_parent || symbol_conf.use_callchain) && data->callchain) { - syms = perf_session__resolve_callchain(self, al->thread, - data->callchain, &parent); - if (syms == NULL) - return -ENOMEM; + err = perf_session__resolve_callchain(self, al->thread, + data->callchain, &parent); + if (err) + return err; } attr = perf_header__find_attr(data->id, &self->header); @@ -101,16 +100,17 @@ static int perf_session__add_hist_entry(struct perf_session *self, else hists = perf_session__hists_findnew(self, data->id, 0, 0); if (hists == NULL) - goto out_free_syms; + return -ENOMEM; + he = __hists__add_entry(hists, al, parent, data->period); if (he == NULL) - goto out_free_syms; - err = 0; + return -ENOMEM; + if (symbol_conf.use_callchain) { - err = callchain_append(he->callchain, data->callchain, syms, + err = callchain_append(he->callchain, &self->callchain_cursor, data->period); if (err) - goto out_free_syms; + return err; } /* * Only in the newt browser we are doing integrated annotation, @@ -119,8 +119,7 @@ static int perf_session__add_hist_entry(struct perf_session *self, */ if (use_browser > 0) err = hist_entry__inc_addr_samples(he, al->addr); -out_free_syms: - free(syms); + return err; } diff --git a/tools/perf/util/callchain.c b/tools/perf/util/callchain.c index e12d539417b2..53a49e0cfc6c 100644 --- a/tools/perf/util/callchain.c +++ b/tools/perf/util/callchain.c @@ -1,5 +1,5 @@ /* - * Copyright (C) 2009-2010, Frederic Weisbecker + * Copyright (C) 2009-2011, Frederic Weisbecker * * Handle the callchains from the stream in an ad-hoc radix tree and then * sort them in an rbtree.
@@ -195,26 +195,21 @@ create_child(struct callchain_node *parent, bool inherit_children) } -struct resolved_ip { - u64 ip; - struct map_symbol ms; -}; - -struct resolved_chain { - u64 nr; - struct resolved_ip ips[0]; -}; - - /* * Fill the node with callchain values */ static void -fill_node(struct callchain_node *node, struct resolved_chain *chain, int start) +fill_node(struct callchain_node *node, struct callchain_cursor *cursor) { - unsigned int i; + struct callchain_cursor_node *cursor_node; - for (i = start; i < chain->nr; i++) { + node->val_nr = cursor->nr - cursor->pos; + if (!node->val_nr) + pr_warning("Warning: empty node in callchain tree\n"); + + cursor_node = callchain_cursor_current(cursor); + + while (cursor_node) { struct callchain_list *call; call = zalloc(sizeof(*call)); @@ -222,23 +217,25 @@ fill_node(struct callchain_node *node, struct resolved_chain *chain, int start) perror("not enough memory for the code path tree"); return; } - call->ip = chain->ips[i].ip; - call->ms = chain->ips[i].ms; + call->ip = cursor_node->ip; + call->ms.sym = cursor_node->sym; + call->ms.map = cursor_node->map; list_add_tail(&call->list, &node->val); + + callchain_cursor_advance(cursor); + cursor_node = callchain_cursor_current(cursor); } - node->val_nr = chain->nr - start; - if (!node->val_nr) - pr_warning("Warning: empty node in callchain tree\n"); } static void -add_child(struct callchain_node *parent, struct resolved_chain *chain, - int start, u64 period) +add_child(struct callchain_node *parent, + struct callchain_cursor *cursor, + u64 period) { struct callchain_node *new; new = create_child(parent, false); - fill_node(new, chain, start); + fill_node(new, cursor); new->children_hit = 0; new->hit = period; @@ -250,9 +247,10 @@ add_child(struct callchain_node *parent, struct resolved_chain *chain, * Then create another child to host the given callchain of new branch */ static void -split_add_child(struct callchain_node *parent, struct resolved_chain *chain, - struct callchain_list *to_split, int idx_parents, int idx_local, - u64 period) +split_add_child(struct callchain_node *parent, + struct callchain_cursor *cursor, + struct callchain_list *to_split, + u64 idx_parents, u64 idx_local, u64 period) { struct callchain_node *new; struct list_head *old_tail; @@ -277,9 +275,9 @@ split_add_child(struct callchain_node *parent, struct resolved_chain *chain, parent->val_nr = idx_local; /* create a new child for the new branch if any */ - if (idx_total < chain->nr) { + if (idx_total < cursor->nr) { parent->hit = 0; - add_child(parent, chain, idx_total, period); + add_child(parent, cursor, period); parent->children_hit += period; } else { parent->hit = period; @@ -287,36 +285,41 @@ split_add_child(struct callchain_node *parent, struct resolved_chain *chain, } static int -append_chain(struct callchain_node *root, struct resolved_chain *chain, - unsigned int start, u64 period); +append_chain(struct callchain_node *root, + struct callchain_cursor *cursor, + u64 period); static void -append_chain_children(struct callchain_node *root, struct resolved_chain *chain, - unsigned int start, u64 period) +append_chain_children(struct callchain_node *root, + struct callchain_cursor *cursor, + u64 period) { struct callchain_node *rnode; /* lookup in childrens */ chain_for_each_child(rnode, root) { - unsigned int ret = append_chain(rnode, chain, start, period); + unsigned int ret = append_chain(rnode, cursor, period); if (!ret) goto inc_children_hit; } /* nothing in children, add to the current node */ - 
add_child(root, chain, start, period); + add_child(root, cursor, period); inc_children_hit: root->children_hit += period; } static int -append_chain(struct callchain_node *root, struct resolved_chain *chain, - unsigned int start, u64 period) +append_chain(struct callchain_node *root, + struct callchain_cursor *cursor, + u64 period) { + struct callchain_cursor_node *curr_snap = cursor->curr; struct callchain_list *cnode; - unsigned int i = start; + u64 start = cursor->pos; bool found = false; + u64 matches; /* * Lookup in the current node @@ -324,114 +327,95 @@ append_chain(struct callchain_node *root, struct resolved_chain *chain, * anywhere inside a function. */ list_for_each_entry(cnode, &root->val, list) { + struct callchain_cursor_node *node; struct symbol *sym; - if (i == chain->nr) + node = callchain_cursor_current(cursor); + if (!node) break; - sym = chain->ips[i].ms.sym; + sym = node->sym; if (cnode->ms.sym && sym) { if (cnode->ms.sym->start != sym->start) break; - } else if (cnode->ip != chain->ips[i].ip) + } else if (cnode->ip != node->ip) break; if (!found) found = true; - i++; + + callchain_cursor_advance(cursor); } /* matches not, relay on the parent */ - if (!found) + if (!found) { + cursor->curr = curr_snap; + cursor->pos = start; return -1; + } + + matches = cursor->pos - start; /* we match only a part of the node. Split it and add the new chain */ - if (i - start < root->val_nr) { - split_add_child(root, chain, cnode, start, i - start, period); + if (matches < root->val_nr) { + split_add_child(root, cursor, cnode, start, matches, period); return 0; } /* we match 100% of the path, increment the hit */ - if (i - start == root->val_nr && i == chain->nr) { + if (matches == root->val_nr && cursor->pos == cursor->nr) { root->hit += period; return 0; } /* We match the node and still have a part remaining */ - append_chain_children(root, chain, i, period); + append_chain_children(root, cursor, period); return 0; } -static void filter_context(struct ip_callchain *old, struct resolved_chain *new, - struct map_symbol *syms) +int callchain_append(struct callchain_root *root, + struct callchain_cursor *cursor, + u64 period) { - int i, j = 0; - - for (i = 0; i < (int)old->nr; i++) { - if (old->ips[i] >= PERF_CONTEXT_MAX) - continue; - - new->ips[j].ip = old->ips[i]; - new->ips[j].ms = syms[i]; - j++; - } - - new->nr = j; -} - - -int callchain_append(struct callchain_root *root, struct ip_callchain *chain, - struct map_symbol *syms, u64 period) -{ - struct resolved_chain *filtered; - - if (!chain->nr) + if (!cursor->nr) return 0; - filtered = zalloc(sizeof(*filtered) + - chain->nr * sizeof(struct resolved_ip)); - if (!filtered) - return -ENOMEM; + callchain_cursor_commit(cursor); - filter_context(chain, filtered, syms); + append_chain_children(&root->node, cursor, period); - if (!filtered->nr) - goto end; - - append_chain_children(&root->node, filtered, 0, period); - - if (filtered->nr > root->max_depth) - root->max_depth = filtered->nr; -end: - free(filtered); + if (cursor->nr > root->max_depth) + root->max_depth = cursor->nr; return 0; } static int -merge_chain_branch(struct callchain_node *dst, struct callchain_node *src, - struct resolved_chain *chain) +merge_chain_branch(struct callchain_cursor *cursor, + struct callchain_node *dst, struct callchain_node *src) { + struct callchain_cursor_node **old_last = cursor->last; struct callchain_node *child, *next_child; struct callchain_list *list, *next_list; - int old_pos = chain->nr; + int old_pos = cursor->nr; int err = 0; 
list_for_each_entry_safe(list, next_list, &src->val, list) { - chain->ips[chain->nr].ip = list->ip; - chain->ips[chain->nr].ms = list->ms; - chain->nr++; + callchain_cursor_append(cursor, list->ip, + list->ms.map, list->ms.sym); list_del(&list->list); free(list); } - if (src->hit) - append_chain_children(dst, chain, 0, src->hit); + if (src->hit) { + callchain_cursor_commit(cursor); + append_chain_children(dst, cursor, src->hit); + } chain_for_each_child_safe(child, next_child, src) { - err = merge_chain_branch(dst, child, chain); + err = merge_chain_branch(cursor, dst, child); if (err) break; @@ -439,26 +423,38 @@ merge_chain_branch(struct callchain_node *dst, struct callchain_node *src, free(child); } - chain->nr = old_pos; + cursor->nr = old_pos; + cursor->last = old_last; return err; } -int callchain_merge(struct callchain_root *dst, struct callchain_root *src) +int callchain_merge(struct callchain_cursor *cursor, + struct callchain_root *dst, struct callchain_root *src) { - struct resolved_chain *chain; - int err; - - chain = malloc(sizeof(*chain) + - src->max_depth * sizeof(struct resolved_ip)); - if (!chain) - return -ENOMEM; - - chain->nr = 0; - - err = merge_chain_branch(&dst->node, &src->node, chain); - - free(chain); - - return err; + return merge_chain_branch(cursor, &dst->node, &src->node); +} + +int callchain_cursor_append(struct callchain_cursor *cursor, + u64 ip, struct map *map, struct symbol *sym) +{ + struct callchain_cursor_node *node = *cursor->last; + + if (!node) { + node = calloc(1, sizeof(*node)); + if (!node) + return -ENOMEM; + + *cursor->last = node; + } + + node->ip = ip; + node->map = map; + node->sym = sym; + + cursor->nr++; + + cursor->last = &node->next; + + return 0; } diff --git a/tools/perf/util/callchain.h b/tools/perf/util/callchain.h index c15fb8c24ad2..d74a19af4a44 100644 --- a/tools/perf/util/callchain.h +++ b/tools/perf/util/callchain.h @@ -49,6 +49,27 @@ struct callchain_list { struct list_head list; }; +/* + * A callchain cursor is a singly linked list that + * lets one feed a callchain progressively. + * It keeps persistent allocated entries to minimize + * allocations. + */ +struct callchain_cursor_node { + u64 ip; + struct map *map; + struct symbol *sym; + struct callchain_cursor_node *next; +}; + +struct callchain_cursor { + u64 nr; + struct callchain_cursor_node *first; + struct callchain_cursor_node **last; + u64 pos; + struct callchain_cursor_node *curr; +}; + static inline void callchain_init(struct callchain_root *root) { INIT_LIST_HEAD(&root->node.brothers); INIT_LIST_HEAD(&root->node.children); INIT_LIST_HEAD(&root->node.val); @@ -67,9 +88,48 @@ static inline u64 cumul_hits(struct callchain_node *node) } int register_callchain_param(struct callchain_param *param); -int callchain_append(struct callchain_root *root, struct ip_callchain *chain, - struct map_symbol *syms, u64 period); -int callchain_merge(struct callchain_root *dst, struct callchain_root *src); +int callchain_append(struct callchain_root *root, + struct callchain_cursor *cursor, + u64 period); + +int callchain_merge(struct callchain_cursor *cursor, + struct callchain_root *dst, struct callchain_root *src); bool ip_callchain__valid(struct ip_callchain *chain, const event_t *event); + +/* + * Initialize a cursor before adding entries inside, but keep + * the previously allocated entries as a cache.
+ */ +static inline void callchain_cursor_reset(struct callchain_cursor *cursor) +{ + cursor->nr = 0; + cursor->last = &cursor->first; +} + +int callchain_cursor_append(struct callchain_cursor *cursor, u64 ip, + struct map *map, struct symbol *sym); + +/* Close a cursor writing session. Initialize for the reader */ +static inline void callchain_cursor_commit(struct callchain_cursor *cursor) +{ + cursor->curr = cursor->first; + cursor->pos = 0; +} + +/* Cursor reading iteration helpers */ +static inline struct callchain_cursor_node * +callchain_cursor_current(struct callchain_cursor *cursor) +{ + if (cursor->pos == cursor->nr) + return NULL; + + return cursor->curr; +} + +static inline void callchain_cursor_advance(struct callchain_cursor *cursor) +{ + cursor->curr = cursor->curr->next; + cursor->pos++; +} #endif /* __PERF_CALLCHAIN_H */ diff --git a/tools/perf/util/hist.c b/tools/perf/util/hist.c index 32f4f1f2f6e4..a438a0652d23 100644 --- a/tools/perf/util/hist.c +++ b/tools/perf/util/hist.c @@ -211,7 +211,9 @@ void hist_entry__free(struct hist_entry *he) * collapse the histogram */ -static bool collapse__insert_entry(struct rb_root *root, struct hist_entry *he) +static bool hists__collapse_insert_entry(struct hists *self, + struct rb_root *root, + struct hist_entry *he) { struct rb_node **p = &root->rb_node; struct rb_node *parent = NULL; @@ -226,8 +228,11 @@ static bool collapse__insert_entry(struct rb_root *root, struct hist_entry *he) if (!cmp) { iter->period += he->period; - if (symbol_conf.use_callchain) - callchain_merge(iter->callchain, he->callchain); + if (symbol_conf.use_callchain) { + callchain_cursor_reset(&self->callchain_cursor); + callchain_merge(&self->callchain_cursor, iter->callchain, + he->callchain); + } hist_entry__free(he); return false; } @@ -262,7 +267,7 @@ void hists__collapse_resort(struct hists *self) next = rb_next(&n->rb_node); rb_erase(&n->rb_node, &self->entries); - if (collapse__insert_entry(&tmp, n)) + if (hists__collapse_insert_entry(self, &tmp, n)) hists__inc_nr_entries(self, n); } diff --git a/tools/perf/util/hist.h b/tools/perf/util/hist.h index ee789856a8c9..889559b86492 100644 --- a/tools/perf/util/hist.h +++ b/tools/perf/util/hist.h @@ -77,6 +77,8 @@ struct hists { u64 event_stream; u32 type; u16 col_len[HISTC_NR_COLS]; + /* Best would be to reuse the session callchain cursor */ + struct callchain_cursor callchain_cursor; }; struct hist_entry *__hists__add_entry(struct hists *self, diff --git a/tools/perf/util/session.c b/tools/perf/util/session.c index 105f00bfd555..b58a48a5e5a9 100644 --- a/tools/perf/util/session.c +++ b/tools/perf/util/session.c @@ -242,17 +242,16 @@ static bool symbol__match_parent_regex(struct symbol *sym) return 0; } -struct map_symbol *perf_session__resolve_callchain(struct perf_session *self, - struct thread *thread, - struct ip_callchain *chain, - struct symbol **parent) +int perf_session__resolve_callchain(struct perf_session *self, + struct thread *thread, + struct ip_callchain *chain, + struct symbol **parent) { u8 cpumode = PERF_RECORD_MISC_USER; unsigned int i; - struct map_symbol *syms = calloc(chain->nr, sizeof(*syms)); + int err; - if (!syms) - return NULL; + callchain_cursor_reset(&self->callchain_cursor); for (i = 0; i < chain->nr; i++) { u64 ip = chain->ips[i]; @@ -281,12 +280,15 @@ struct map_symbol *perf_session__resolve_callchain(struct perf_session *self, *parent = al.sym; if (!symbol_conf.use_callchain) break; - syms[i].map = al.map; - syms[i].sym = al.sym; } + + err = 
callchain_cursor_append(&self->callchain_cursor, + ip, al.map, al.sym); + if (err) + return err; } - return syms; + return 0; } static int process_event_synth_stub(event_t *event __used, diff --git a/tools/perf/util/session.h b/tools/perf/util/session.h index decd83f274fd..e815468eb888 100644 --- a/tools/perf/util/session.h +++ b/tools/perf/util/session.h @@ -51,7 +51,8 @@ struct perf_session { int cwdlen; char *cwd; struct ordered_samples ordered_samples; - char filename[0]; + struct callchain_cursor callchain_cursor; + char filename[0]; }; struct perf_event_ops; @@ -94,10 +95,10 @@ int __perf_session__process_events(struct perf_session *self, int perf_session__process_events(struct perf_session *self, struct perf_event_ops *event_ops); -struct map_symbol *perf_session__resolve_callchain(struct perf_session *self, - struct thread *thread, - struct ip_callchain *chain, - struct symbol **parent); +int perf_session__resolve_callchain(struct perf_session *self, + struct thread *thread, + struct ip_callchain *chain, + struct symbol **parent); bool perf_session__has_traces(struct perf_session *self, const char *msg); From f08c3154ac439c4b5762a40107d84e839e08fbc5 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Fri, 14 Jan 2011 04:51:59 +0100 Subject: [PATCH 027/706] perf callchain: Rename cumul_hits into callchain_cumul_hits That makes the callchain API naming more consistent and reduce potential naming clashes. Cc: Ingo Molnar Cc: Paul Mackerras Cc: Peter Zijlstra LKML-Reference: <1294977121-5700-3-git-send-email-fweisbec@gmail.com> Signed-off-by: Frederic Weisbecker Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/callchain.c | 10 +++++----- tools/perf/util/callchain.h | 2 +- tools/perf/util/hist.c | 2 +- tools/perf/util/ui/browsers/hists.c | 2 +- 4 files changed, 8 insertions(+), 8 deletions(-) diff --git a/tools/perf/util/callchain.c b/tools/perf/util/callchain.c index 53a49e0cfc6c..4c6360fc2d5b 100644 --- a/tools/perf/util/callchain.c +++ b/tools/perf/util/callchain.c @@ -38,14 +38,14 @@ rb_insert_callchain(struct rb_root *root, struct callchain_node *chain, struct rb_node **p = &root->rb_node; struct rb_node *parent = NULL; struct callchain_node *rnode; - u64 chain_cumul = cumul_hits(chain); + u64 chain_cumul = callchain_cumul_hits(chain); while (*p) { u64 rnode_cumul; parent = *p; rnode = rb_entry(parent, struct callchain_node, rb_node); - rnode_cumul = cumul_hits(rnode); + rnode_cumul = callchain_cumul_hits(rnode); switch (mode) { case CHAIN_FLAT: @@ -104,7 +104,7 @@ static void __sort_chain_graph_abs(struct callchain_node *node, chain_for_each_child(child, node) { __sort_chain_graph_abs(child, min_hit); - if (cumul_hits(child) >= min_hit) + if (callchain_cumul_hits(child) >= min_hit) rb_insert_callchain(&node->rb_root, child, CHAIN_GRAPH_ABS); } @@ -129,7 +129,7 @@ static void __sort_chain_graph_rel(struct callchain_node *node, chain_for_each_child(child, node) { __sort_chain_graph_rel(child, min_percent); - if (cumul_hits(child) >= min_hit) + if (callchain_cumul_hits(child) >= min_hit) rb_insert_callchain(&node->rb_root, child, CHAIN_GRAPH_REL); } @@ -270,7 +270,7 @@ split_add_child(struct callchain_node *parent, /* split the hits */ new->hit = parent->hit; new->children_hit = parent->children_hit; - parent->children_hit = cumul_hits(new); + parent->children_hit = callchain_cumul_hits(new); new->val_nr = parent->val_nr - idx_local; parent->val_nr = idx_local; diff --git a/tools/perf/util/callchain.h b/tools/perf/util/callchain.h index d74a19af4a44..07f71e3e0a71 
100644 --- a/tools/perf/util/callchain.h +++ b/tools/perf/util/callchain.h @@ -82,7 +82,7 @@ static inline void callchain_init(struct callchain_root *root) root->max_depth = 0; } -static inline u64 cumul_hits(struct callchain_node *node) +static inline u64 callchain_cumul_hits(struct callchain_node *node) { return node->hit + node->children_hit; } diff --git a/tools/perf/util/hist.c b/tools/perf/util/hist.c index a438a0652d23..02ed318d7312 100644 --- a/tools/perf/util/hist.c +++ b/tools/perf/util/hist.c @@ -430,7 +430,7 @@ static size_t __callchain__fprintf_graph(FILE *fp, struct callchain_node *self, u64 cumul; child = rb_entry(node, struct callchain_node, rb_node); - cumul = cumul_hits(child); + cumul = callchain_cumul_hits(child); remaining -= cumul; /* diff --git a/tools/perf/util/ui/browsers/hists.c b/tools/perf/util/ui/browsers/hists.c index 60c463c16028..86428239fa65 100644 --- a/tools/perf/util/ui/browsers/hists.c +++ b/tools/perf/util/ui/browsers/hists.c @@ -377,7 +377,7 @@ static int hist_browser__show_callchain_node_rb_tree(struct hist_browser *self, while (node) { struct callchain_node *child = rb_entry(node, struct callchain_node, rb_node); struct rb_node *next = rb_next(node); - u64 cumul = cumul_hits(child); + u64 cumul = callchain_cumul_hits(child); struct callchain_list *chain; char folded_sign = ' '; int first = true; From 16537f1355017a285b904bfb6bf767464293e69c Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Fri, 14 Jan 2011 04:52:00 +0100 Subject: [PATCH 028/706] perf callchain: Rename register_callchain_param into callchain_register_param To make the callchain API naming more consistent. Cc: Ingo Molnar Cc: Paul Mackerras Cc: Peter Zijlstra LKML-Reference: <1294977121-5700-4-git-send-email-fweisbec@gmail.com> Signed-off-by: Frederic Weisbecker Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-report.c | 4 ++-- tools/perf/util/callchain.c | 2 +- tools/perf/util/callchain.h | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/tools/perf/builtin-report.c b/tools/perf/builtin-report.c index c95599a82f9e..f6a43493d1d0 100644 --- a/tools/perf/builtin-report.c +++ b/tools/perf/builtin-report.c @@ -221,7 +221,7 @@ static int perf_session__setup_sample_type(struct perf_session *self) } else if (!dont_use_callchains && callchain_param.mode != CHAIN_NONE && !symbol_conf.use_callchain) { symbol_conf.use_callchain = true; - if (register_callchain_param(&callchain_param) < 0) { + if (callchain_register_param(&callchain_param) < 0) { fprintf(stderr, "Can't register callchain" " params\n"); return -EINVAL; @@ -423,7 +423,7 @@ parse_callchain_opt(const struct option *opt __used, const char *arg, if (tok2) callchain_param.print_limit = strtod(tok2, &endptr); setup: - if (register_callchain_param(&callchain_param) < 0) { + if (callchain_register_param(&callchain_param) < 0) { fprintf(stderr, "Can't register callchain params\n"); return -1; } diff --git a/tools/perf/util/callchain.c b/tools/perf/util/callchain.c index 4c6360fc2d5b..5b3f09dd137d 100644 --- a/tools/perf/util/callchain.c +++ b/tools/perf/util/callchain.c @@ -143,7 +143,7 @@ sort_chain_graph_rel(struct rb_root *rb_root, struct callchain_root *chain_root, rb_root->rb_node = chain_root->node.rb_root.rb_node; } -int register_callchain_param(struct callchain_param *param) +int callchain_register_param(struct callchain_param *param) { switch (param->mode) { case CHAIN_GRAPH_ABS: diff --git a/tools/perf/util/callchain.h b/tools/perf/util/callchain.h index 07f71e3e0a71..2bb5403010d2 100644 --- 
a/tools/perf/util/callchain.h +++ b/tools/perf/util/callchain.h @@ -87,7 +87,7 @@ static inline u64 callchain_cumul_hits(struct callchain_node *node) return node->hit + node->children_hit; } -int register_callchain_param(struct callchain_param *param); +int callchain_register_param(struct callchain_param *param); int callchain_append(struct callchain_root *root, struct callchain_cursor *cursor, u64 period); From 529363b76929beb85b81439c61063130af046a21 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Fri, 14 Jan 2011 04:52:01 +0100 Subject: [PATCH 029/706] perf callchain: Don't give arbitrary gender to callchain tree nodes Some little callchain tree nodes shyly asked me if they can have sisters. How cute! Cc: Ingo Molnar Cc: Paul Mackerras Cc: Peter Zijlstra LKML-Reference: <1294977121-5700-5-git-send-email-fweisbec@gmail.com> Signed-off-by: Frederic Weisbecker Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/callchain.c | 8 ++++---- tools/perf/util/callchain.h | 4 ++-- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/tools/perf/util/callchain.c b/tools/perf/util/callchain.c index 5b3f09dd137d..f8c66d1435e0 100644 --- a/tools/perf/util/callchain.c +++ b/tools/perf/util/callchain.c @@ -26,10 +26,10 @@ bool ip_callchain__valid(struct ip_callchain *chain, const event_t *event) } #define chain_for_each_child(child, parent) \ - list_for_each_entry(child, &parent->children, brothers) + list_for_each_entry(child, &parent->children, siblings) #define chain_for_each_child_safe(child, next, parent) \ - list_for_each_entry_safe(child, next, &parent->children, brothers) + list_for_each_entry_safe(child, next, &parent->children, siblings) static void rb_insert_callchain(struct rb_root *root, struct callchain_node *chain, @@ -189,7 +189,7 @@ create_child(struct callchain_node *parent, bool inherit_children) chain_for_each_child(next, new) next->parent = new; } - list_add_tail(&new->brothers, &parent->children); + list_add_tail(&new->siblings, &parent->children); return new; } @@ -419,7 +419,7 @@ merge_chain_branch(struct callchain_cursor *cursor, if (err) break; - list_del(&child->brothers); + list_del(&child->siblings); free(child); } diff --git a/tools/perf/util/callchain.h b/tools/perf/util/callchain.h index 2bb5403010d2..67137256a1cd 100644 --- a/tools/perf/util/callchain.h +++ b/tools/perf/util/callchain.h @@ -16,7 +16,7 @@ enum chain_mode { struct callchain_node { struct callchain_node *parent; - struct list_head brothers; + struct list_head siblings; struct list_head children; struct list_head val; struct rb_node rb_node; /* to sort nodes in an rbtree */ @@ -72,7 +72,7 @@ struct callchain_cursor { static inline void callchain_init(struct callchain_root *root) { - INIT_LIST_HEAD(&root->node.brothers); + INIT_LIST_HEAD(&root->node.siblings); INIT_LIST_HEAD(&root->node.children); INIT_LIST_HEAD(&root->node.val); From b0e8572f3b29c0760b66ba5627a6d5426c88c97d Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Sun, 16 Jan 2011 17:39:15 -0200 Subject: [PATCH 030/706] perf top: Add native_safe_halt to skip symbols Cc: Frederic Weisbecker Cc: Ingo Molnar Cc: Mike Galbraith Cc: Paul Mackerras Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Tom Zanussi LKML-Reference: Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-top.c | 1 + 1 file changed, 1 insertion(+) diff --git a/tools/perf/builtin-top.c b/tools/perf/builtin-top.c index 58352ad807c7..31fbaf38d9c1 100644 --- a/tools/perf/builtin-top.c +++ b/tools/perf/builtin-top.c @@ -933,6 +933,7 @@ repeat: /* 
Tag samples to be skipped. */ static const char *skip_symbols[] = { "default_idle", + "native_safe_halt", "cpu_idle", "enter_idle", "exit_idle", From 4cc9cec636e7f78aba7f17606ac13cac07ea5787 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Thu, 13 Jan 2011 21:45:58 +0900 Subject: [PATCH 031/706] perf probe: Introduce lines walker interface Introduce die_walk_lines() for walking on the line list of given die, and use it in line_range finder and probe point finder. Cc: 2nddept-manager@sdl.hitachi.co.jp Cc: Franck Bui-Huu Cc: Ingo Molnar Cc: Paul Mackerras Cc: Peter Zijlstra Cc: Srikar Dronamraju Cc: Steven Rostedt LKML-Reference: <20110113124558.22426.48170.stgit@ltc236.sdl.hitachi.co.jp> Signed-off-by: Masami Hiramatsu [ committer note: s/%ld/%zd/ for a size_t nlines var that broke f14 x86 build] Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/probe-finder.c | 325 +++++++++++++++++---------------- 1 file changed, 169 insertions(+), 156 deletions(-) diff --git a/tools/perf/util/probe-finder.c b/tools/perf/util/probe-finder.c index ab83b6ac5d65..508c017f566a 100644 --- a/tools/perf/util/probe-finder.c +++ b/tools/perf/util/probe-finder.c @@ -458,6 +458,124 @@ static Dwarf_Die *die_find_inlinefunc(Dwarf_Die *sp_die, Dwarf_Addr addr, return die_find_child(sp_die, __die_find_inline_cb, &addr, die_mem); } +/* Walker on lines (Note: line number will not be sorted) */ +typedef int (* line_walk_handler_t) (const char *fname, int lineno, + Dwarf_Addr addr, void *data); + +struct __line_walk_param { + line_walk_handler_t handler; + void *data; + int retval; +}; + +/* Walk on decl lines in given DIE */ +static int __die_walk_funclines(Dwarf_Die *sp_die, + line_walk_handler_t handler, void *data) +{ + const char *fname; + Dwarf_Addr addr; + int lineno, ret = 0; + + /* Handle function declaration line */ + fname = dwarf_decl_file(sp_die); + if (fname && dwarf_decl_line(sp_die, &lineno) == 0 && + dwarf_entrypc(sp_die, &addr) == 0) { + ret = handler(fname, lineno, addr, data); + } + + return ret; +} + +static int __die_walk_culines_cb(Dwarf_Die *sp_die, void *data) +{ + struct __line_walk_param *lw = data; + + lw->retval = __die_walk_funclines(sp_die, lw->handler, lw->data); + if (lw->retval != 0) + return DWARF_CB_ABORT; + + return DWARF_CB_OK; +} + +/* + * Walk on lines inside given PDIE. If the PDIE is subprogram, walk only on + * the lines inside the subprogram, otherwise PDIE must be a CU DIE. + */ +static int die_walk_lines(Dwarf_Die *pdie, line_walk_handler_t handler, + void *data) +{ + Dwarf_Lines *lines; + Dwarf_Line *line; + Dwarf_Addr addr; + const char *fname; + int lineno, ret = 0; + Dwarf_Die die_mem, *cu_die; + size_t nlines, i; + + /* Get the CU die */ + if (dwarf_tag(pdie) == DW_TAG_subprogram) + cu_die = dwarf_diecu(pdie, &die_mem, NULL, NULL); + else + cu_die = pdie; + if (!cu_die) { + pr_debug2("Failed to get CU from subprogram\n"); + return -EINVAL; + } + + /* Get lines list in the CU */ + if (dwarf_getsrclines(cu_die, &lines, &nlines) != 0) { + pr_debug2("Failed to get source lines on this CU.\n"); + return -ENOENT; + } + pr_debug2("Get %zd lines from this CU\n", nlines); + + /* Walk on the lines on lines list */ + for (i = 0; i < nlines; i++) { + line = dwarf_onesrcline(lines, i); + if (line == NULL || + dwarf_lineno(line, &lineno) != 0 || + dwarf_lineaddr(line, &addr) != 0) { + pr_debug2("Failed to get line info. 
" + "Possible error in debuginfo.\n"); + continue; + } + /* Filter lines based on address */ + if (pdie != cu_die) + /* + * Address filtering + * The line is included in given function, and + * no inline block includes it. + */ + if (!dwarf_haspc(pdie, addr) || + die_find_inlinefunc(pdie, addr, &die_mem)) + continue; + /* Get source line */ + fname = dwarf_linesrc(line, NULL, NULL); + + ret = handler(fname, lineno, addr, data); + if (ret != 0) + return ret; + } + + /* + * Dwarf lines doesn't include function declarations and inlined + * subroutines. We have to check functions list or given function. + */ + if (pdie != cu_die) + ret = __die_walk_funclines(pdie, handler, data); + else { + struct __line_walk_param param = { + .handler = handler, + .data = data, + .retval = 0, + }; + dwarf_getfuncs(cu_die, __die_walk_culines_cb, ¶m, 0); + ret = param.retval; + } + + return ret; +} + struct __find_variable_param { const char *name; Dwarf_Addr addr; @@ -1050,43 +1168,26 @@ static int call_probe_finder(Dwarf_Die *sp_die, struct probe_finder *pf) return ret; } +static int probe_point_line_walker(const char *fname, int lineno, + Dwarf_Addr addr, void *data) +{ + struct probe_finder *pf = data; + int ret; + + if (lineno != pf->lno || strtailcmp(fname, pf->fname) != 0) + return 0; + + pf->addr = addr; + ret = call_probe_finder(NULL, pf); + + /* Continue if no error, because the line will be in inline function */ + return ret < 0 ?: 0; +} + /* Find probe point from its line number */ static int find_probe_point_by_line(struct probe_finder *pf) { - Dwarf_Lines *lines; - Dwarf_Line *line; - size_t nlines, i; - Dwarf_Addr addr; - int lineno; - int ret = 0; - - if (dwarf_getsrclines(&pf->cu_die, &lines, &nlines) != 0) { - pr_warning("No source lines found.\n"); - return -ENOENT; - } - - for (i = 0; i < nlines && ret == 0; i++) { - line = dwarf_onesrcline(lines, i); - if (dwarf_lineno(line, &lineno) != 0 || - lineno != pf->lno) - continue; - - /* TODO: Get fileno from line, but how? */ - if (strtailcmp(dwarf_linesrc(line, NULL, NULL), pf->fname) != 0) - continue; - - if (dwarf_lineaddr(line, &addr) != 0) { - pr_warning("Failed to get the address of the line.\n"); - return -ENOENT; - } - pr_debug("Probe line found: line[%d]:%d addr:0x%jx\n", - (int)i, lineno, (uintmax_t)addr); - pf->addr = addr; - - ret = call_probe_finder(NULL, pf); - /* Continuing, because target line might be inlined. 
*/ - } - return ret; + return die_walk_lines(&pf->cu_die, probe_point_line_walker, pf); } /* Find lines which match lazy pattern */ @@ -1140,15 +1241,31 @@ out_close: return nlines; } +static int probe_point_lazy_walker(const char *fname, int lineno, + Dwarf_Addr addr, void *data) +{ + struct probe_finder *pf = data; + int ret; + + if (!line_list__has_line(&pf->lcache, lineno) || + strtailcmp(fname, pf->fname) != 0) + return 0; + + pr_debug("Probe line found: line:%d addr:0x%llx\n", + lineno, (unsigned long long)addr); + pf->addr = addr; + ret = call_probe_finder(NULL, pf); + + /* + * Continue if no error, because the lazy pattern will match + * to other lines + */ + return ret < 0 ?: 0; +} + /* Find probe points from lazy pattern */ static int find_probe_point_lazy(Dwarf_Die *sp_die, struct probe_finder *pf) { - Dwarf_Lines *lines; - Dwarf_Line *line; - size_t nlines, i; - Dwarf_Addr addr; - Dwarf_Die die_mem; - int lineno; int ret = 0; if (list_empty(&pf->lcache)) { @@ -1162,45 +1279,7 @@ static int find_probe_point_lazy(Dwarf_Die *sp_die, struct probe_finder *pf) return ret; } - if (dwarf_getsrclines(&pf->cu_die, &lines, &nlines) != 0) { - pr_warning("No source lines found.\n"); - return -ENOENT; - } - - for (i = 0; i < nlines && ret >= 0; i++) { - line = dwarf_onesrcline(lines, i); - - if (dwarf_lineno(line, &lineno) != 0 || - !line_list__has_line(&pf->lcache, lineno)) - continue; - - /* TODO: Get fileno from line, but how? */ - if (strtailcmp(dwarf_linesrc(line, NULL, NULL), pf->fname) != 0) - continue; - - if (dwarf_lineaddr(line, &addr) != 0) { - pr_debug("Failed to get the address of line %d.\n", - lineno); - continue; - } - if (sp_die) { - /* Address filtering 1: does sp_die include addr? */ - if (!dwarf_haspc(sp_die, addr)) - continue; - /* Address filtering 2: No child include addr? */ - if (die_find_inlinefunc(sp_die, addr, &die_mem)) - continue; - } - - pr_debug("Probe line found: line[%d]:%d addr:0x%llx\n", - (int)i, lineno, (unsigned long long)addr); - pf->addr = addr; - - ret = call_probe_finder(sp_die, pf); - /* Continuing, because target line might be inlined. */ - } - /* TODO: deallocate lines, but how? 
*/ - return ret; + return die_walk_lines(sp_die, probe_point_lazy_walker, pf); } /* Callback parameter with return value */ @@ -1644,91 +1723,28 @@ static int line_range_add_line(const char *src, unsigned int lineno, return line_list__add_line(&lr->line_list, lineno); } -/* Search function declaration lines */ -static int line_range_funcdecl_cb(Dwarf_Die *sp_die, void *data) +static int line_range_walk_cb(const char *fname, int lineno, + Dwarf_Addr addr __used, + void *data) { - struct dwarf_callback_param *param = data; - struct line_finder *lf = param->data; - const char *src; - int lineno; + struct line_finder *lf = data; - src = dwarf_decl_file(sp_die); - if (src && strtailcmp(src, lf->fname) != 0) - return DWARF_CB_OK; - - if (dwarf_decl_line(sp_die, &lineno) != 0 || + if ((strtailcmp(fname, lf->fname) != 0) || (lf->lno_s > lineno || lf->lno_e < lineno)) - return DWARF_CB_OK; + return 0; - param->retval = line_range_add_line(src, lineno, lf->lr); - if (param->retval < 0) - return DWARF_CB_ABORT; - return DWARF_CB_OK; -} + if (line_range_add_line(fname, lineno, lf->lr) < 0) + return -EINVAL; -static int find_line_range_func_decl_lines(struct line_finder *lf) -{ - struct dwarf_callback_param param = {.data = (void *)lf, .retval = 0}; - dwarf_getfuncs(&lf->cu_die, line_range_funcdecl_cb, ¶m, 0); - return param.retval; + return 0; } /* Find line range from its line number */ static int find_line_range_by_line(Dwarf_Die *sp_die, struct line_finder *lf) { - Dwarf_Lines *lines; - Dwarf_Line *line; - size_t nlines, i; - Dwarf_Addr addr; - int lineno, ret = 0; - const char *src; - Dwarf_Die die_mem; + int ret; - line_list__init(&lf->lr->line_list); - if (dwarf_getsrclines(&lf->cu_die, &lines, &nlines) != 0) { - pr_warning("No source lines found.\n"); - return -ENOENT; - } - - /* Search probable lines on lines list */ - for (i = 0; i < nlines; i++) { - line = dwarf_onesrcline(lines, i); - if (dwarf_lineno(line, &lineno) != 0 || - (lf->lno_s > lineno || lf->lno_e < lineno)) - continue; - - if (sp_die) { - /* Address filtering 1: does sp_die include addr? */ - if (dwarf_lineaddr(line, &addr) != 0 || - !dwarf_haspc(sp_die, addr)) - continue; - - /* Address filtering 2: No child include addr? */ - if (die_find_inlinefunc(sp_die, addr, &die_mem)) - continue; - } - - /* TODO: Get fileno from line, but how? */ - src = dwarf_linesrc(line, NULL, NULL); - if (strtailcmp(src, lf->fname) != 0) - continue; - - ret = line_range_add_line(src, lineno, lf->lr); - if (ret < 0) - return ret; - } - - /* - * Dwarf lines doesn't include function declarations. We have to - * check functions list or given function. 
- */ - if (sp_die) { - src = dwarf_decl_file(sp_die); - if (src && dwarf_decl_line(sp_die, &lineno) == 0 && - (lf->lno_s <= lineno && lf->lno_e >= lineno)) - ret = line_range_add_line(src, lineno, lf->lr); - } else - ret = find_line_range_func_decl_lines(lf); + ret = die_walk_lines(sp_die ?: &lf->cu_die, line_range_walk_cb, lf); /* Update status */ if (ret >= 0) @@ -1758,9 +1774,6 @@ static int line_range_search_cb(Dwarf_Die *sp_die, void *data) struct line_finder *lf = param->data; struct line_range *lr = lf->lr; - pr_debug("find (%llx) %s\n", - (unsigned long long)dwarf_dieoffset(sp_die), - dwarf_diename(sp_die)); if (dwarf_tag(sp_die) == DW_TAG_subprogram && die_compare_name(sp_die, lr->function)) { lf->fname = dwarf_decl_file(sp_die); From 5069ed86be3c2f28bcdf7fae1374ec0c325aafba Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Thu, 13 Jan 2011 21:46:05 +0900 Subject: [PATCH 032/706] perf probe: Enable to put probe inline function call site Enable putting probes on inline function call sites. This increases line-based probe-ability. For example, here is the same command run first before and then after this patch: $ ./perf probe -L schedule:48 pre_schedule(rq, prev); 50 if (unlikely(!rq->nr_running)) idle_balance(cpu, rq); put_prev_task(rq, prev); next = pick_next_task(rq); 56 if (likely(prev != next)) { sched_info_switch(prev, next); trace_sched_switch_out(prev, next); perf_event_task_sched_out(prev, next); $ ./perf probe -L schedule:48 48 pre_schedule(rq, prev); 50 if (unlikely(!rq->nr_running)) 51 idle_balance(cpu, rq); 53 put_prev_task(rq, prev); 54 next = pick_next_task(rq); 56 if (likely(prev != next)) { 57 sched_info_switch(prev, next); 58 trace_sched_switch_out(prev, next); 59 perf_event_task_sched_out(prev, next); Cc: 2nddept-manager@sdl.hitachi.co.jp Cc: Franck Bui-Huu Cc: Ingo Molnar Cc: Paul Mackerras Cc: Peter Zijlstra Cc: Srikar Dronamraju Cc: Steven Rostedt LKML-Reference: <20110113124604.22426.48873.stgit@ltc236.sdl.hitachi.co.jp> Signed-off-by: Masami Hiramatsu Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/probe-finder.c | 56 +++++++++++++++++++++++++++++----- 1 file changed, 48 insertions(+), 8 deletions(-) diff --git a/tools/perf/util/probe-finder.c b/tools/perf/util/probe-finder.c index 508c017f566a..69215bff17e9 100644 --- a/tools/perf/util/probe-finder.c +++ b/tools/perf/util/probe-finder.c @@ -280,6 +280,19 @@ static bool die_compare_name(Dwarf_Die *dw_die, const char *tname) return name ? 
(strcmp(tname, name) == 0) : false; } +/* Get callsite line number of inline-function instance */ +static int die_get_call_lineno(Dwarf_Die *in_die) +{ + Dwarf_Attribute attr; + Dwarf_Word ret; + + if (!dwarf_attr(in_die, DW_AT_call_line, &attr)) + return -ENOENT; + + dwarf_formudata(&attr, &ret); + return (int)ret; +} + /* Get type die */ static Dwarf_Die *die_get_type(Dwarf_Die *vr_die, Dwarf_Die *die_mem) { @@ -463,27 +476,54 @@ typedef int (* line_walk_handler_t) (const char *fname, int lineno, Dwarf_Addr addr, void *data); struct __line_walk_param { + const char *fname; line_walk_handler_t handler; void *data; int retval; }; -/* Walk on decl lines in given DIE */ +static int __die_walk_funclines_cb(Dwarf_Die *in_die, void *data) +{ + struct __line_walk_param *lw = data; + Dwarf_Addr addr; + int lineno; + + if (dwarf_tag(in_die) == DW_TAG_inlined_subroutine) { + lineno = die_get_call_lineno(in_die); + if (lineno > 0 && dwarf_entrypc(in_die, &addr) == 0) { + lw->retval = lw->handler(lw->fname, lineno, addr, + lw->data); + if (lw->retval != 0) + return DIE_FIND_CB_FOUND; + } + } + return DIE_FIND_CB_SIBLING; +} + +/* Walk on lines of blocks included in given DIE */ static int __die_walk_funclines(Dwarf_Die *sp_die, line_walk_handler_t handler, void *data) { - const char *fname; + struct __line_walk_param lw = { + .handler = handler, + .data = data, + .retval = 0, + }; + Dwarf_Die die_mem; Dwarf_Addr addr; - int lineno, ret = 0; + int lineno; /* Handle function declaration line */ - fname = dwarf_decl_file(sp_die); - if (fname && dwarf_decl_line(sp_die, &lineno) == 0 && + lw.fname = dwarf_decl_file(sp_die); + if (lw.fname && dwarf_decl_line(sp_die, &lineno) == 0 && dwarf_entrypc(sp_die, &addr) == 0) { - ret = handler(fname, lineno, addr, data); + lw.retval = handler(lw.fname, lineno, addr, data); + if (lw.retval != 0) + goto done; } - - return ret; + die_find_child(sp_die, __die_walk_funclines_cb, &lw, &die_mem); +done: + return lw.retval; } static int __die_walk_culines_cb(Dwarf_Die *sp_die, void *data) From e80711ca8512c8586da0c3e18e2f1caf73c88731 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Thu, 13 Jan 2011 21:46:11 +0900 Subject: [PATCH 033/706] perf probe: Add --funcs to show available functions in symtab Add --funcs to show available functions in symtab. Originally this feature came from Srikar's uprobes patches ( http://lkml.org/lkml/2010/8/27/244 ) e.g. ... __ablkcipher_walk_complete __absent_pages_in_range __account_scheduler_latency __add_pages __alloc_pages_nodemask __alloc_percpu __alloc_reserved_percpu __alloc_skb __alloc_workqueue_key __any_online_cpu __ata_ehi_push_desc ... This also supports symbols in module, e.g. ... cleanup_module cpuid_maxphyaddr emulate_clts emulate_instruction emulate_int_real emulate_invlpg emulator_get_dr emulator_set_dr emulator_task_switch emulator_write_emulated emulator_write_phys fx_init ... 
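Put together, a hedged usage sketch (the module name is only an example; '-F/--funcs' is what this patch adds, while '-m/--module' is the pre-existing option for selecting a module, as suggested by the params.target_module plumbing below):

$ ./perf probe -F
$ ./perf probe -m kvm --funcs

The first form lists probe-able symbols from the kernel map, the second from the kvm module's map; both end up in show_available_funcs().
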
Original-patch-from: Srikar Dronamraju Cc: 2nddept-manager@sdl.hitachi.co.jp Cc: Franck Bui-Huu Cc: Ingo Molnar Cc: Paul Mackerras Cc: Peter Zijlstra Cc: Srikar Dronamraju Cc: Steven Rostedt LKML-Reference: <20110113124611.22426.10835.stgit@ltc236.sdl.hitachi.co.jp> Signed-off-by: Masami Hiramatsu [ committer note: Add missing elf.h for STB_GLOBAL that broke a RHEL4 build ] Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/Documentation/perf-probe.txt | 4 ++ tools/perf/builtin-probe.c | 29 ++++++++++- tools/perf/util/probe-event.c | 68 ++++++++++++++++++++++++- tools/perf/util/probe-event.h | 1 + 4 files changed, 99 insertions(+), 3 deletions(-) diff --git a/tools/perf/Documentation/perf-probe.txt b/tools/perf/Documentation/perf-probe.txt index 86b797a35aa6..fcc51fe0195c 100644 --- a/tools/perf/Documentation/perf-probe.txt +++ b/tools/perf/Documentation/perf-probe.txt @@ -73,6 +73,10 @@ OPTIONS (Only for --vars) Show external defined variables in addition to local variables. +-F:: +--funcs:: + Show available functions in given module or kernel. + -f:: --force:: Forcibly add events with existing name. diff --git a/tools/perf/builtin-probe.c b/tools/perf/builtin-probe.c index add163c9f0e7..6cf708aba7c9 100644 --- a/tools/perf/builtin-probe.c +++ b/tools/perf/builtin-probe.c @@ -52,6 +52,7 @@ static struct { bool show_lines; bool show_vars; bool show_ext_vars; + bool show_funcs; bool mod_events; int nevents; struct perf_probe_event events[MAX_PROBES]; @@ -221,6 +222,8 @@ static const struct option options[] = { OPT__DRY_RUN(&probe_event_dry_run), OPT_INTEGER('\0', "max-probes", ¶ms.max_probe_points, "Set how many probe points can be found for a probe."), + OPT_BOOLEAN('F', "funcs", ¶ms.show_funcs, + "Show potential probe-able functions."), OPT_END() }; @@ -246,7 +249,7 @@ int cmd_probe(int argc, const char **argv, const char *prefix __used) params.max_probe_points = MAX_PROBES; if ((!params.nevents && !params.dellist && !params.list_events && - !params.show_lines)) + !params.show_lines && !params.show_funcs)) usage_with_options(probe_usage, options); /* @@ -267,12 +270,36 @@ int cmd_probe(int argc, const char **argv, const char *prefix __used) pr_err(" Error: Don't use --list with --vars.\n"); usage_with_options(probe_usage, options); } + if (params.show_funcs) { + pr_err(" Error: Don't use --list with --funcs.\n"); + usage_with_options(probe_usage, options); + } ret = show_perf_probe_events(); if (ret < 0) pr_err(" Error: Failed to show event list. (%d)\n", ret); return ret; } + if (params.show_funcs) { + if (params.nevents != 0 || params.dellist) { + pr_err(" Error: Don't use --funcs with" + " --add/--del.\n"); + usage_with_options(probe_usage, options); + } + if (params.show_lines) { + pr_err(" Error: Don't use --funcs with --line.\n"); + usage_with_options(probe_usage, options); + } + if (params.show_vars) { + pr_err(" Error: Don't use --funcs with --vars.\n"); + usage_with_options(probe_usage, options); + } + ret = show_available_funcs(params.target_module); + if (ret < 0) + pr_err(" Error: Failed to show functions." 
+ " (%d)\n", ret); + return ret; + } #ifdef DWARF_SUPPORT if (params.show_lines) { diff --git a/tools/perf/util/probe-event.c b/tools/perf/util/probe-event.c index 6e29d9c9dccc..859d377a3df3 100644 --- a/tools/perf/util/probe-event.c +++ b/tools/perf/util/probe-event.c @@ -31,6 +31,7 @@ #include #include #include +#include #undef _GNU_SOURCE #include "util.h" @@ -111,7 +112,25 @@ static struct symbol *__find_kernel_function_by_name(const char *name, NULL); } -const char *kernel_get_module_path(const char *module) +static struct map *kernel_get_module_map(const char *module) +{ + struct rb_node *nd; + struct map_groups *grp = &machine.kmaps; + + if (!module) + module = "kernel"; + + for (nd = rb_first(&grp->maps[MAP__FUNCTION]); nd; nd = rb_next(nd)) { + struct map *pos = rb_entry(nd, struct map, rb_node); + if (strncmp(pos->dso->short_name + 1, module, + pos->dso->short_name_len - 2) == 0) { + return pos; + } + } + return NULL; +} + +static struct dso *kernel_get_module_dso(const char *module) { struct dso *dso; struct map *map; @@ -141,7 +160,13 @@ const char *kernel_get_module_path(const char *module) } } found: - return dso->long_name; + return dso; +} + +const char *kernel_get_module_path(const char *module) +{ + struct dso *dso = kernel_get_module_dso(module); + return (dso) ? dso->long_name : NULL; } #ifdef DWARF_SUPPORT @@ -1913,3 +1938,42 @@ int del_perf_probe_events(struct strlist *dellist) return ret; } +/* + * If a symbol corresponds to a function with global binding return 0. + * For all others return 1. + */ +static int filter_non_global_functions(struct map *map __unused, + struct symbol *sym) +{ + if (sym->binding != STB_GLOBAL) + return 1; + + return 0; +} + +int show_available_funcs(const char *module) +{ + struct map *map; + int ret; + + setup_pager(); + + ret = init_vmlinux(); + if (ret < 0) + return ret; + + map = kernel_get_module_map(module); + if (!map) { + pr_err("Failed to find %s map.\n", (module) ? : "kernel"); + return -EINVAL; + } + if (map__load(map, filter_non_global_functions)) { + pr_err("Failed to load map.\n"); + return -EINVAL; + } + if (!dso__sorted_by_name(map->dso, map->type)) + dso__sort_by_name(map->dso, map->type); + + dso__fprintf_symbols_by_name(map->dso, map->type, stdout); + return 0; +} diff --git a/tools/perf/util/probe-event.h b/tools/perf/util/probe-event.h index 5accbedfea37..1fb4f18337d3 100644 --- a/tools/perf/util/probe-event.h +++ b/tools/perf/util/probe-event.h @@ -127,6 +127,7 @@ extern int show_line_range(struct line_range *lr, const char *module); extern int show_available_vars(struct perf_probe_event *pevs, int npevs, int max_probe_points, const char *module, bool externs); +extern int show_available_funcs(const char *module); /* Maximum index number of event-name postfix */ From d7065adb9b4f3384c2615f0a3dbdb6c3aae1eb18 Mon Sep 17 00:00:00 2001 From: Franck Bui-Huu Date: Sun, 16 Jan 2011 17:14:45 +0100 Subject: [PATCH 034/706] perf record: auto detect when stdout is a pipe This patch gives the ability to 'perf record' to detect when its stdout has been redirected to a pipe. There's now no more need to add '-o -' switch in this case. However '-o ' option has always precedence, that is if specified and stdout has been connected via a pipe then the output will go into the specified output. 
LKML-Reference: Signed-off-by: Franck Bui-Huu Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-record.c | 32 ++++++++++++++++++++------------ 1 file changed, 20 insertions(+), 12 deletions(-) diff --git a/tools/perf/builtin-record.c b/tools/perf/builtin-record.c index 45a3689f9ed6..1346d4230bc0 100644 --- a/tools/perf/builtin-record.c +++ b/tools/perf/builtin-record.c @@ -48,7 +48,7 @@ static unsigned int user_freq = UINT_MAX; static int freq = 1000; static int output; static int pipe_output = 0; -static const char *output_name = "perf.data"; +static const char *output_name = NULL; static int group = 0; static int realtime_prio = 0; static bool nodelay = false; @@ -497,18 +497,26 @@ static int __cmd_record(int argc, const char **argv) exit(-1); } - if (!strcmp(output_name, "-")) - pipe_output = 1; - else if (!stat(output_name, &st) && st.st_size) { - if (write_mode == WRITE_FORCE) { - char oldname[PATH_MAX]; - snprintf(oldname, sizeof(oldname), "%s.old", - output_name); - unlink(oldname); - rename(output_name, oldname); + if (!output_name) { + if (!fstat(STDOUT_FILENO, &st) && S_ISFIFO(st.st_mode)) + pipe_output = 1; + else + output_name = "perf.data"; + } + if (output_name) { + if (!strcmp(output_name, "-")) + pipe_output = 1; + else if (!stat(output_name, &st) && st.st_size) { + if (write_mode == WRITE_FORCE) { + char oldname[PATH_MAX]; + snprintf(oldname, sizeof(oldname), "%s.old", + output_name); + unlink(oldname); + rename(output_name, oldname); + } + } else if (write_mode == WRITE_APPEND) { + write_mode = WRITE_FORCE; } - } else if (write_mode == WRITE_APPEND) { - write_mode = WRITE_FORCE; } flags = O_CREAT|O_RDWR; From 17ea1b70a87e28457821318341bead2b45563092 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Mon, 17 Jan 2011 14:40:46 -0200 Subject: [PATCH 035/706] perf tools: Pass the struct opt to the wildcard parsing routine This is needed because the wildcard routine calls parse_events() for each tracepoint name that matches, and we pass the perf_evlist via opt->value. The problem was introduced in 4503fdd, where my assumption that opt is always non-NULL made me overlook the callers of parse_events() outside builtin-*.c. 
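For context, a hedged example of what exercises this path: a glob in the tracepoint name makes parse_multiple_tracepoint_event() re-enter parse_events() once per matching event, and every one of those nested calls needs the same opt to reach the evlist:

$ ./perf record -e 'sched:sched_*' sleep 1
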
Cc: Frederic Weisbecker Cc: Ingo Molnar Cc: Mike Galbraith Cc: Paul Mackerras Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Tom Zanussi LKML-Reference: Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/parse-events.c | 20 +++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) diff --git a/tools/perf/util/parse-events.c b/tools/perf/util/parse-events.c index d3086cecd2dd..cf082daa43e3 100644 --- a/tools/perf/util/parse-events.c +++ b/tools/perf/util/parse-events.c @@ -446,8 +446,8 @@ parse_single_tracepoint_event(char *sys_name, /* sys + ':' + event + ':' + flags*/ #define MAX_EVOPT_LEN (MAX_EVENT_LENGTH * 2 + 2 + 128) static enum event_result -parse_multiple_tracepoint_event(char *sys_name, const char *evt_exp, - char *flags) +parse_multiple_tracepoint_event(const struct option *opt, char *sys_name, + const char *evt_exp, char *flags) { char evt_path[MAXPATHLEN]; struct dirent *evt_ent; @@ -480,15 +480,16 @@ parse_multiple_tracepoint_event(char *sys_name, const char *evt_exp, if (len < 0) return EVT_FAILED; - if (parse_events(NULL, event_opt, 0)) + if (parse_events(opt, event_opt, 0)) return EVT_FAILED; } return EVT_HANDLED_ALL; } -static enum event_result parse_tracepoint_event(const char **strp, - struct perf_event_attr *attr) +static enum event_result +parse_tracepoint_event(const struct option *opt, const char **strp, + struct perf_event_attr *attr) { const char *evt_name; char *flags = NULL, *comma_loc; @@ -527,7 +528,7 @@ static enum event_result parse_tracepoint_event(const char **strp, return EVT_FAILED; if (strpbrk(evt_name, "*?")) { *strp += strlen(sys_name) + evt_length + 1; /* 1 == the ':' */ - return parse_multiple_tracepoint_event(sys_name, evt_name, + return parse_multiple_tracepoint_event(opt, sys_name, evt_name, flags); } else { return parse_single_tracepoint_event(sys_name, evt_name, @@ -737,11 +738,12 @@ parse_event_modifier(const char **strp, struct perf_event_attr *attr) * Symbolic names are (almost) exactly matched. */ static enum event_result -parse_event_symbols(const char **str, struct perf_event_attr *attr) +parse_event_symbols(const struct option *opt, const char **str, + struct perf_event_attr *attr) { enum event_result ret; - ret = parse_tracepoint_event(str, attr); + ret = parse_tracepoint_event(opt, str, attr); if (ret != EVT_FAILED) goto modifier; @@ -783,7 +785,7 @@ int parse_events(const struct option *opt, const char *str, int unset __used) for (;;) { memset(&attr, 0, sizeof(attr)); - ret = parse_event_symbols(&str, &attr); + ret = parse_event_symbols(opt, &str, &attr); if (ret == EVT_FAILED) return -1; From fd78260b5376173faeb17127bd63b3c99a8e8bfb Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Tue, 18 Jan 2011 15:15:24 -0200 Subject: [PATCH 036/706] perf threads: Move thread_map to separate file To untangle it from struct thread handling, that is tied to symbols, etc. Right now in the python bindings I'm working on I need just a subset of the util/ files, untangling it allows me to do that. 
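As a quick orientation, a minimal sketch of using the split-out API (declarations as in the new util/thread_map.h added below; the helper function and the printf are illustrative, not part of the patch):

#include <stdio.h>
#include <sys/types.h>
#include "thread_map.h"

/* Enumerate the tasks of a pid via the now self-contained thread_map API. */
static void print_tasks(pid_t pid)
{
	struct thread_map *threads = thread_map__new(pid, -1);
	int i;

	if (threads == NULL)
		return;
	for (i = 0; i < threads->nr; i++)
		printf("task %d\n", threads->map[i]);
	thread_map__delete(threads);
}

Note that thread_map__new() falls back to a single-tid map when pid is -1, which is what lets record, stat and top share one entry point.
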
Cc: Frederic Weisbecker Cc: Ingo Molnar Cc: Mike Galbraith Cc: Paul Mackerras Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Tom Zanussi LKML-Reference: Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/Makefile | 2 ++ tools/perf/builtin-record.c | 1 + tools/perf/builtin-stat.c | 1 + tools/perf/builtin-test.c | 2 +- tools/perf/builtin-top.c | 1 + tools/perf/util/evsel.c | 2 +- tools/perf/util/thread.c | 55 ------------------------------- tools/perf/util/thread.h | 14 -------- tools/perf/util/thread_map.c | 64 ++++++++++++++++++++++++++++++++++++ tools/perf/util/thread_map.h | 15 +++++++++ 10 files changed, 86 insertions(+), 71 deletions(-) create mode 100644 tools/perf/util/thread_map.c create mode 100644 tools/perf/util/thread_map.h diff --git a/tools/perf/Makefile b/tools/perf/Makefile index f20bc6f85611..638e8e146bb9 100644 --- a/tools/perf/Makefile +++ b/tools/perf/Makefile @@ -426,6 +426,7 @@ LIB_H += util/values.h LIB_H += util/sort.h LIB_H += util/hist.h LIB_H += util/thread.h +LIB_H += util/thread_map.h LIB_H += util/trace-event.h LIB_H += util/probe-finder.h LIB_H += util/probe-event.h @@ -471,6 +472,7 @@ LIB_OBJS += $(OUTPUT)util/map.o LIB_OBJS += $(OUTPUT)util/pstack.o LIB_OBJS += $(OUTPUT)util/session.o LIB_OBJS += $(OUTPUT)util/thread.o +LIB_OBJS += $(OUTPUT)util/thread_map.o LIB_OBJS += $(OUTPUT)util/trace-event-parse.o LIB_OBJS += $(OUTPUT)util/trace-event-read.o LIB_OBJS += $(OUTPUT)util/trace-event-info.o diff --git a/tools/perf/builtin-record.c b/tools/perf/builtin-record.c index 1346d4230bc0..d7886307f6f4 100644 --- a/tools/perf/builtin-record.c +++ b/tools/perf/builtin-record.c @@ -24,6 +24,7 @@ #include "util/session.h" #include "util/symbol.h" #include "util/cpumap.h" +#include "util/thread_map.h" #include #include diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c index e2a2d02c5c43..8906adfdbd8e 100644 --- a/tools/perf/builtin-stat.c +++ b/tools/perf/builtin-stat.c @@ -49,6 +49,7 @@ #include "util/header.h" #include "util/cpumap.h" #include "util/thread.h" +#include "util/thread_map.h" #include #include diff --git a/tools/perf/builtin-test.c b/tools/perf/builtin-test.c index 4fd34537c01d..dc91ee06a37c 100644 --- a/tools/perf/builtin-test.c +++ b/tools/perf/builtin-test.c @@ -12,7 +12,7 @@ #include "util/parse-events.h" #include "util/session.h" #include "util/symbol.h" -#include "util/thread.h" +#include "util/thread_map.h" static long page_size; diff --git a/tools/perf/builtin-top.c b/tools/perf/builtin-top.c index 31fbaf38d9c1..d0b16d905405 100644 --- a/tools/perf/builtin-top.c +++ b/tools/perf/builtin-top.c @@ -26,6 +26,7 @@ #include "util/session.h" #include "util/symbol.h" #include "util/thread.h" +#include "util/thread_map.h" #include "util/util.h" #include #include "util/parse-options.h" diff --git a/tools/perf/util/evsel.c b/tools/perf/util/evsel.c index ee490356c817..9a6d94299ab8 100644 --- a/tools/perf/util/evsel.c +++ b/tools/perf/util/evsel.c @@ -3,7 +3,7 @@ #include "../perf.h" #include "util.h" #include "cpumap.h" -#include "thread.h" +#include "thread_map.h" #include #include diff --git a/tools/perf/util/thread.c b/tools/perf/util/thread.c index 00f4eade2e3e..d5d3b22250f3 100644 --- a/tools/perf/util/thread.c +++ b/tools/perf/util/thread.c @@ -7,61 +7,6 @@ #include "util.h" #include "debug.h" -/* Skip "." and ".." 
directories */ -static int filter(const struct dirent *dir) -{ - if (dir->d_name[0] == '.') - return 0; - else - return 1; -} - -struct thread_map *thread_map__new_by_pid(pid_t pid) -{ - struct thread_map *threads; - char name[256]; - int items; - struct dirent **namelist = NULL; - int i; - - sprintf(name, "/proc/%d/task", pid); - items = scandir(name, &namelist, filter, NULL); - if (items <= 0) - return NULL; - - threads = malloc(sizeof(*threads) + sizeof(pid_t) * items); - if (threads != NULL) { - for (i = 0; i < items; i++) - threads->map[i] = atoi(namelist[i]->d_name); - threads->nr = items; - } - - for (i=0; imap[0] = tid; - threads->nr = 1; - } - - return threads; -} - -struct thread_map *thread_map__new(pid_t pid, pid_t tid) -{ - if (pid != -1) - return thread_map__new_by_pid(pid); - return thread_map__new_by_tid(tid); -} - static struct thread *thread__new(pid_t pid) { struct thread *self = zalloc(sizeof(*self)); diff --git a/tools/perf/util/thread.h b/tools/perf/util/thread.h index d7574101054a..e5f2401c1b5e 100644 --- a/tools/perf/util/thread.h +++ b/tools/perf/util/thread.h @@ -18,24 +18,10 @@ struct thread { int comm_len; }; -struct thread_map { - int nr; - int map[]; -}; - struct perf_session; void thread__delete(struct thread *self); -struct thread_map *thread_map__new_by_pid(pid_t pid); -struct thread_map *thread_map__new_by_tid(pid_t tid); -struct thread_map *thread_map__new(pid_t pid, pid_t tid); - -static inline void thread_map__delete(struct thread_map *threads) -{ - free(threads); -} - int thread__set_comm(struct thread *self, const char *comm); int thread__comm_len(struct thread *self); struct thread *perf_session__findnew(struct perf_session *self, pid_t pid); diff --git a/tools/perf/util/thread_map.c b/tools/perf/util/thread_map.c new file mode 100644 index 000000000000..a5df131b77c3 --- /dev/null +++ b/tools/perf/util/thread_map.c @@ -0,0 +1,64 @@ +#include +#include +#include +#include "thread_map.h" + +/* Skip "." and ".." 
directories */ +static int filter(const struct dirent *dir) +{ + if (dir->d_name[0] == '.') + return 0; + else + return 1; +} + +struct thread_map *thread_map__new_by_pid(pid_t pid) +{ + struct thread_map *threads; + char name[256]; + int items; + struct dirent **namelist = NULL; + int i; + + sprintf(name, "/proc/%d/task", pid); + items = scandir(name, &namelist, filter, NULL); + if (items <= 0) + return NULL; + + threads = malloc(sizeof(*threads) + sizeof(pid_t) * items); + if (threads != NULL) { + for (i = 0; i < items; i++) + threads->map[i] = atoi(namelist[i]->d_name); + threads->nr = items; + } + + for (i=0; imap[0] = tid; + threads->nr = 1; + } + + return threads; +} + +struct thread_map *thread_map__new(pid_t pid, pid_t tid) +{ + if (pid != -1) + return thread_map__new_by_pid(pid); + return thread_map__new_by_tid(tid); +} + +void thread_map__delete(struct thread_map *threads) +{ + free(threads); +} diff --git a/tools/perf/util/thread_map.h b/tools/perf/util/thread_map.h new file mode 100644 index 000000000000..3cb907311409 --- /dev/null +++ b/tools/perf/util/thread_map.h @@ -0,0 +1,15 @@ +#ifndef __PERF_THREAD_MAP_H +#define __PERF_THREAD_MAP_H + +#include + +struct thread_map { + int nr; + int map[]; +}; + +struct thread_map *thread_map__new_by_pid(pid_t pid); +struct thread_map *thread_map__new_by_tid(pid_t tid); +struct thread_map *thread_map__new(pid_t pid, pid_t tid); +void thread_map__delete(struct thread_map *threads); +#endif /* __PERF_THREAD_MAP_H */ From d0dd74e853a0a6f37e8061d6d50be41c7034c54c Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Fri, 21 Jan 2011 13:46:41 -0200 Subject: [PATCH 037/706] perf tools: Move event__parse_sample to evsel.c To avoid linking more stuff in the python binding I'm working on, future csets will make the sample type be taken from the evsel itself, but for that we need to first have one file per cpu and per sample_type, not a single perf.data file. Cc: Frederic Weisbecker Cc: Ingo Molnar Cc: Mike Galbraith Cc: Paul Mackerras Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Tom Zanussi LKML-Reference: Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-test.c | 11 +--- tools/perf/builtin-top.c | 2 +- tools/perf/util/event.c | 125 -------------------------------------- tools/perf/util/event.h | 5 +- tools/perf/util/evsel.c | 118 +++++++++++++++++++++++++++++++++++ tools/perf/util/session.c | 4 +- tools/perf/util/session.h | 9 +++ 7 files changed, 134 insertions(+), 140 deletions(-) diff --git a/tools/perf/builtin-test.c b/tools/perf/builtin-test.c index dc91ee06a37c..231e3e21810c 100644 --- a/tools/perf/builtin-test.c +++ b/tools/perf/builtin-test.c @@ -10,7 +10,6 @@ #include "util/evlist.h" #include "util/parse-options.h" #include "util/parse-events.h" -#include "util/session.h" #include "util/symbol.h" #include "util/thread_map.h" @@ -457,7 +456,6 @@ static int test__basic_mmap(void) int err = -1; event_t *event; struct thread_map *threads; - struct perf_session session; struct cpu_map *cpus; struct perf_evlist *evlist; struct perf_event_attr attr = { @@ -521,13 +519,6 @@ static int test__basic_mmap(void) attr.wakeup_events = 1; attr.sample_period = 1; - /* - * FIXME: use evsel->attr.sample_type in event__parse_sample. - * This will nicely remove the requirement that we have - * all the events with the same sample_type. 
- */ - session.sample_type = attr.sample_type; - for (i = 0; i < nsyscalls; ++i) { attr.config = ids[i]; evsels[i] = perf_evsel__new(&attr, i); @@ -567,7 +558,7 @@ static int test__basic_mmap(void) goto out_munmap; } - event__parse_sample(event, &session, &sample); + event__parse_sample(event, attr.sample_type, false, &sample); evsel = perf_evlist__id2evsel(evlist, sample.id); if (evsel == NULL) { pr_debug("event with id %" PRIu64 diff --git a/tools/perf/builtin-top.c b/tools/perf/builtin-top.c index d0b16d905405..ce2e50c891c7 100644 --- a/tools/perf/builtin-top.c +++ b/tools/perf/builtin-top.c @@ -1106,7 +1106,7 @@ static void perf_session__mmap_read_cpu(struct perf_session *self, int cpu) event_t *event; while ((event = perf_evlist__read_on_cpu(evsel_list, cpu)) != NULL) { - event__parse_sample(event, self, &sample); + perf_session__parse_sample(self, event, &sample); if (event->header.type == PERF_RECORD_SAMPLE) event__process_sample(event, &sample, self); diff --git a/tools/perf/util/event.c b/tools/perf/util/event.c index 1478ab4ee222..e4db8b888546 100644 --- a/tools/perf/util/event.c +++ b/tools/perf/util/event.c @@ -826,128 +826,3 @@ out_filtered: al->filtered = true; return 0; } - -static int event__parse_id_sample(const event_t *event, - struct perf_session *session, - struct sample_data *sample) -{ - const u64 *array; - u64 type; - - sample->cpu = sample->pid = sample->tid = -1; - sample->stream_id = sample->id = sample->time = -1ULL; - - if (!session->sample_id_all) - return 0; - - array = event->sample.array; - array += ((event->header.size - - sizeof(event->header)) / sizeof(u64)) - 1; - type = session->sample_type; - - if (type & PERF_SAMPLE_CPU) { - u32 *p = (u32 *)array; - sample->cpu = *p; - array--; - } - - if (type & PERF_SAMPLE_STREAM_ID) { - sample->stream_id = *array; - array--; - } - - if (type & PERF_SAMPLE_ID) { - sample->id = *array; - array--; - } - - if (type & PERF_SAMPLE_TIME) { - sample->time = *array; - array--; - } - - if (type & PERF_SAMPLE_TID) { - u32 *p = (u32 *)array; - sample->pid = p[0]; - sample->tid = p[1]; - } - - return 0; -} - -int event__parse_sample(const event_t *event, struct perf_session *session, - struct sample_data *data) -{ - const u64 *array; - u64 type; - - if (event->header.type != PERF_RECORD_SAMPLE) - return event__parse_id_sample(event, session, data); - - array = event->sample.array; - type = session->sample_type; - - if (type & PERF_SAMPLE_IP) { - data->ip = event->ip.ip; - array++; - } - - if (type & PERF_SAMPLE_TID) { - u32 *p = (u32 *)array; - data->pid = p[0]; - data->tid = p[1]; - array++; - } - - if (type & PERF_SAMPLE_TIME) { - data->time = *array; - array++; - } - - if (type & PERF_SAMPLE_ADDR) { - data->addr = *array; - array++; - } - - data->id = -1ULL; - if (type & PERF_SAMPLE_ID) { - data->id = *array; - array++; - } - - if (type & PERF_SAMPLE_STREAM_ID) { - data->stream_id = *array; - array++; - } - - if (type & PERF_SAMPLE_CPU) { - u32 *p = (u32 *)array; - data->cpu = *p; - array++; - } else - data->cpu = -1; - - if (type & PERF_SAMPLE_PERIOD) { - data->period = *array; - array++; - } - - if (type & PERF_SAMPLE_READ) { - pr_debug("PERF_SAMPLE_READ is unsuported for now\n"); - return -1; - } - - if (type & PERF_SAMPLE_CALLCHAIN) { - data->callchain = (struct ip_callchain *)array; - array += 1 + data->callchain->nr; - } - - if (type & PERF_SAMPLE_RAW) { - u32 *p = (u32 *)array; - data->raw_size = *p; - p++; - data->raw_data = p; - } - - return 0; -} diff --git a/tools/perf/util/event.h b/tools/perf/util/event.h index 
2b7e91902f10..d79e4edd82f9 100644 --- a/tools/perf/util/event.h +++ b/tools/perf/util/event.h @@ -169,9 +169,10 @@ struct addr_location; int event__preprocess_sample(const event_t *self, struct perf_session *session, struct addr_location *al, struct sample_data *data, symbol_filter_t filter); -int event__parse_sample(const event_t *event, struct perf_session *session, - struct sample_data *sample); const char *event__get_event_name(unsigned int id); +int event__parse_sample(const event_t *event, u64 type, bool sample_id_all, + struct sample_data *sample); + #endif /* __PERF_RECORD_H */ diff --git a/tools/perf/util/evsel.c b/tools/perf/util/evsel.c index 9a6d94299ab8..a85ae12845ea 100644 --- a/tools/perf/util/evsel.c +++ b/tools/perf/util/evsel.c @@ -355,3 +355,121 @@ out_unmap: } return -1; } + +static int event__parse_id_sample(const event_t *event, u64 type, + struct sample_data *sample) +{ + const u64 *array = event->sample.array; + + array += ((event->header.size - + sizeof(event->header)) / sizeof(u64)) - 1; + + if (type & PERF_SAMPLE_CPU) { + u32 *p = (u32 *)array; + sample->cpu = *p; + array--; + } + + if (type & PERF_SAMPLE_STREAM_ID) { + sample->stream_id = *array; + array--; + } + + if (type & PERF_SAMPLE_ID) { + sample->id = *array; + array--; + } + + if (type & PERF_SAMPLE_TIME) { + sample->time = *array; + array--; + } + + if (type & PERF_SAMPLE_TID) { + u32 *p = (u32 *)array; + sample->pid = p[0]; + sample->tid = p[1]; + } + + return 0; +} + +int event__parse_sample(const event_t *event, u64 type, bool sample_id_all, + struct sample_data *data) +{ + const u64 *array; + + data->cpu = data->pid = data->tid = -1; + data->stream_id = data->id = data->time = -1ULL; + + if (event->header.type != PERF_RECORD_SAMPLE) { + if (!sample_id_all) + return 0; + return event__parse_id_sample(event, type, data); + } + + array = event->sample.array; + + if (type & PERF_SAMPLE_IP) { + data->ip = event->ip.ip; + array++; + } + + if (type & PERF_SAMPLE_TID) { + u32 *p = (u32 *)array; + data->pid = p[0]; + data->tid = p[1]; + array++; + } + + if (type & PERF_SAMPLE_TIME) { + data->time = *array; + array++; + } + + if (type & PERF_SAMPLE_ADDR) { + data->addr = *array; + array++; + } + + data->id = -1ULL; + if (type & PERF_SAMPLE_ID) { + data->id = *array; + array++; + } + + if (type & PERF_SAMPLE_STREAM_ID) { + data->stream_id = *array; + array++; + } + + if (type & PERF_SAMPLE_CPU) { + u32 *p = (u32 *)array; + data->cpu = *p; + array++; + } + + if (type & PERF_SAMPLE_PERIOD) { + data->period = *array; + array++; + } + + if (type & PERF_SAMPLE_READ) { + fprintf(stderr, "PERF_SAMPLE_READ is unsuported for now\n"); + return -1; + } + + if (type & PERF_SAMPLE_CALLCHAIN) { + data->callchain = (struct ip_callchain *)array; + array += 1 + data->callchain->nr; + } + + if (type & PERF_SAMPLE_RAW) { + u32 *p = (u32 *)array; + data->raw_size = *p; + p++; + data->raw_data = p; + } + + return 0; +} diff --git a/tools/perf/util/session.c b/tools/perf/util/session.c index b58a48a5e5a9..e6a07408669e 100644 --- a/tools/perf/util/session.c +++ b/tools/perf/util/session.c @@ -496,7 +496,7 @@ static void flush_sample_queue(struct perf_session *s, if (iter->timestamp > limit) break; - event__parse_sample(iter->event, s, &sample); + perf_session__parse_sample(s, iter->event, &sample); perf_session_deliver_event(s, iter->event, &sample, ops, iter->file_offset); @@ -806,7 +806,7 @@ static int perf_session__process_event(struct perf_session *session, /* * For all kernel events we get the sample data */ - 
event__parse_sample(event, session, &sample); + perf_session__parse_sample(session, event, &sample); /* Preprocess sample records - precheck callchains */ if (perf_session__preprocess_sample(session, event, &sample)) diff --git a/tools/perf/util/session.h b/tools/perf/util/session.h index e815468eb888..78239767011e 100644 --- a/tools/perf/util/session.h +++ b/tools/perf/util/session.h @@ -155,4 +155,13 @@ size_t perf_session__fprintf_nr_events(struct perf_session *self, FILE *fp) { return hists__fprintf_nr_events(&self->hists, fp); } + +static inline int perf_session__parse_sample(struct perf_session *session, + const event_t *event, + struct sample_data *sample) +{ + return event__parse_sample(event, session->sample_type, + session->sample_id_all, sample); +} + #endif /* __PERF_SESSION_H */ From ef1d1af28ca37fdbc2745da040529cd2953c1af5 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Tue, 18 Jan 2011 21:41:45 -0200 Subject: [PATCH 038/706] perf evsel: Introduce perf_evsel__{in,ex}it Out of the {con,des}structor, as in interpreted language bindings we will need to go back from the wrapper object to the real thing. In that case using container_of will save us to have an extra pointer in the perf_evsel struct. Cc: Frederic Weisbecker Cc: Ingo Molnar Cc: Mike Galbraith Cc: Paul Mackerras Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Tom Zanussi LKML-Reference: Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/evlist.c | 29 ++++++++++++++++++++--------- tools/perf/util/evlist.h | 2 ++ tools/perf/util/evsel.c | 22 ++++++++++++++++------ tools/perf/util/evsel.h | 3 +++ 4 files changed, 41 insertions(+), 15 deletions(-) diff --git a/tools/perf/util/evlist.c b/tools/perf/util/evlist.c index 4b3b84cd71a1..df0610e9c61b 100644 --- a/tools/perf/util/evlist.c +++ b/tools/perf/util/evlist.c @@ -6,17 +6,21 @@ #include #include +void perf_evlist__init(struct perf_evlist *evlist) +{ + int i; + + for (i = 0; i < PERF_EVLIST__HLIST_SIZE; ++i) + INIT_HLIST_HEAD(&evlist->heads[i]); + INIT_LIST_HEAD(&evlist->entries); +} + struct perf_evlist *perf_evlist__new(void) { struct perf_evlist *evlist = zalloc(sizeof(*evlist)); - if (evlist != NULL) { - int i; - - for (i = 0; i < PERF_EVLIST__HLIST_SIZE; ++i) - INIT_HLIST_HEAD(&evlist->heads[i]); - INIT_LIST_HEAD(&evlist->entries); - } + if (evlist != NULL) + perf_evlist__init(evlist); return evlist; } @@ -33,11 +37,18 @@ static void perf_evlist__purge(struct perf_evlist *evlist) evlist->nr_entries = 0; } +void perf_evlist__exit(struct perf_evlist *evlist) +{ + free(evlist->mmap); + free(evlist->pollfd); + evlist->mmap = NULL; + evlist->pollfd = NULL; +} + void perf_evlist__delete(struct perf_evlist *evlist) { perf_evlist__purge(evlist); - free(evlist->mmap); - free(evlist->pollfd); + perf_evlist__exit(evlist); free(evlist); } diff --git a/tools/perf/util/evlist.h b/tools/perf/util/evlist.h index 28712063db97..acbe48eac608 100644 --- a/tools/perf/util/evlist.h +++ b/tools/perf/util/evlist.h @@ -24,6 +24,8 @@ struct perf_evlist { struct perf_evsel; struct perf_evlist *perf_evlist__new(void); +void perf_evlist__init(struct perf_evlist *evlist); +void perf_evlist__exit(struct perf_evlist *evlist); void perf_evlist__delete(struct perf_evlist *evlist); void perf_evlist__add(struct perf_evlist *evlist, struct perf_evsel *entry); diff --git a/tools/perf/util/evsel.c b/tools/perf/util/evsel.c index a85ae12845ea..76ab553637d6 100644 --- a/tools/perf/util/evsel.c +++ b/tools/perf/util/evsel.c @@ -14,15 +14,20 @@ #define FD(e, x, y) (*(int 
*)xyarray__entry(e->fd, x, y)) #define SID(e, x, y) xyarray__entry(e->id, x, y) +void perf_evsel__init(struct perf_evsel *evsel, + struct perf_event_attr *attr, int idx) +{ + evsel->idx = idx; + evsel->attr = *attr; + INIT_LIST_HEAD(&evsel->node); +} + struct perf_evsel *perf_evsel__new(struct perf_event_attr *attr, int idx) { struct perf_evsel *evsel = zalloc(sizeof(*evsel)); - if (evsel != NULL) { - evsel->idx = idx; - evsel->attr = *attr; - INIT_LIST_HEAD(&evsel->node); - } + if (evsel != NULL) + perf_evsel__init(evsel, attr, idx); return evsel; } @@ -87,11 +92,16 @@ int perf_evlist__alloc_mmap(struct perf_evlist *evlist, int ncpus) return evlist->mmap != NULL ? 0 : -ENOMEM; } -void perf_evsel__delete(struct perf_evsel *evsel) +void perf_evsel__exit(struct perf_evsel *evsel) { assert(list_empty(&evsel->node)); xyarray__delete(evsel->fd); xyarray__delete(evsel->id); +} + +void perf_evsel__delete(struct perf_evsel *evsel) +{ + perf_evsel__exit(evsel); free(evsel); } diff --git a/tools/perf/util/evsel.h b/tools/perf/util/evsel.h index 667ee4e2e35e..7962e7587dea 100644 --- a/tools/perf/util/evsel.h +++ b/tools/perf/util/evsel.h @@ -52,6 +52,9 @@ struct thread_map; struct perf_evlist; struct perf_evsel *perf_evsel__new(struct perf_event_attr *attr, int idx); +void perf_evsel__init(struct perf_evsel *evsel, + struct perf_event_attr *attr, int idx); +void perf_evsel__exit(struct perf_evsel *evsel); void perf_evsel__delete(struct perf_evsel *evsel); int perf_evsel__alloc_fd(struct perf_evsel *evsel, int ncpus, int nthreads); From 9599ec0471deae24044241e2173090d2cbc0e899 Mon Sep 17 00:00:00 2001 From: Fenghua Yu Date: Mon, 17 Jan 2011 17:39:15 -0800 Subject: [PATCH 039/706] x86-64, mem: Convert memmove() to assembly file and fix return value bug memmove_64.c only implements memmove() function which is completely written in inline assembly code. Therefore it doesn't make sense to keep the assembly code in .c file. Currently memmove() doesn't store return value to rax. This may cause issue if caller uses the return value. The patch fixes this issue. Signed-off-by: Fenghua Yu LKML-Reference: <1295314755-6625-1-git-send-email-fenghua.yu@intel.com> Signed-off-by: H. Peter Anvin --- arch/x86/kernel/x8664_ksyms_64.c | 1 + arch/x86/lib/memmove_64.S | 197 +++++++++++++++++++++++++++++++ arch/x86/lib/memmove_64.c | 192 ------------------------------ 3 files changed, 198 insertions(+), 192 deletions(-) create mode 100644 arch/x86/lib/memmove_64.S delete mode 100644 arch/x86/lib/memmove_64.c diff --git a/arch/x86/kernel/x8664_ksyms_64.c b/arch/x86/kernel/x8664_ksyms_64.c index 1b950d151e58..9796c2f3d074 100644 --- a/arch/x86/kernel/x8664_ksyms_64.c +++ b/arch/x86/kernel/x8664_ksyms_64.c @@ -52,6 +52,7 @@ extern void *__memcpy(void *, const void *, __kernel_size_t); EXPORT_SYMBOL(memset); EXPORT_SYMBOL(memcpy); EXPORT_SYMBOL(__memcpy); +EXPORT_SYMBOL(memmove); EXPORT_SYMBOL(empty_zero_page); #ifndef CONFIG_PARAVIRT diff --git a/arch/x86/lib/memmove_64.S b/arch/x86/lib/memmove_64.S new file mode 100644 index 000000000000..0ecb8433e5a8 --- /dev/null +++ b/arch/x86/lib/memmove_64.S @@ -0,0 +1,197 @@ +/* + * Normally compiler builtins are used, but sometimes the compiler calls out + * of line code. Based on asm-i386/string.h. + * + * This assembly file is re-written from memmove_64.c file. + * - Copyright 2011 Fenghua Yu + */ +#define _STRING_C +#include +#include + +#undef memmove + +/* + * Implement memmove(). This can handle overlap between src and dst. 
+ * + * Input: + * rdi: dest + * rsi: src + * rdx: count + * + * Output: + * rax: dest + */ +ENTRY(memmove) + CFI_STARTPROC + /* Handle more 32bytes in loop */ + mov %rdi, %rax + cmp $0x20, %rdx + jb 1f + + /* Decide forward/backward copy mode */ + cmp %rdi, %rsi + jb 2f + + /* + * movsq instruction have many startup latency + * so we handle small size by general register. + */ + cmp $680, %rdx + jb 3f + /* + * movsq instruction is only good for aligned case. + */ + + cmpb %dil, %sil + je 4f +3: + sub $0x20, %rdx + /* + * We gobble 32byts forward in each loop. + */ +5: + sub $0x20, %rdx + movq 0*8(%rsi), %r11 + movq 1*8(%rsi), %r10 + movq 2*8(%rsi), %r9 + movq 3*8(%rsi), %r8 + leaq 4*8(%rsi), %rsi + + movq %r11, 0*8(%rdi) + movq %r10, 1*8(%rdi) + movq %r9, 2*8(%rdi) + movq %r8, 3*8(%rdi) + leaq 4*8(%rdi), %rdi + jae 5b + addq $0x20, %rdx + jmp 1f + /* + * Handle data forward by movsq. + */ + .p2align 4 +4: + movq %rdx, %rcx + movq -8(%rsi, %rdx), %r11 + lea -8(%rdi, %rdx), %r10 + shrq $3, %rcx + rep movsq + movq %r11, (%r10) + jmp 13f + /* + * Handle data backward by movsq. + */ + .p2align 4 +7: + movq %rdx, %rcx + movq (%rsi), %r11 + movq %rdi, %r10 + leaq -8(%rsi, %rdx), %rsi + leaq -8(%rdi, %rdx), %rdi + shrq $3, %rcx + std + rep movsq + cld + movq %r11, (%r10) + jmp 13f + + /* + * Start to prepare for backward copy. + */ + .p2align 4 +2: + cmp $680, %rdx + jb 6f + cmp %dil, %sil + je 7b +6: + /* + * Calculate copy position to tail. + */ + addq %rdx, %rsi + addq %rdx, %rdi + subq $0x20, %rdx + /* + * We gobble 32byts backward in each loop. + */ +8: + subq $0x20, %rdx + movq -1*8(%rsi), %r11 + movq -2*8(%rsi), %r10 + movq -3*8(%rsi), %r9 + movq -4*8(%rsi), %r8 + leaq -4*8(%rsi), %rsi + + movq %r11, -1*8(%rdi) + movq %r10, -2*8(%rdi) + movq %r9, -3*8(%rdi) + movq %r8, -4*8(%rdi) + leaq -4*8(%rdi), %rdi + jae 8b + /* + * Calculate copy position to head. + */ + addq $0x20, %rdx + subq %rdx, %rsi + subq %rdx, %rdi +1: + cmpq $16, %rdx + jb 9f + /* + * Move data from 16 bytes to 31 bytes. + */ + movq 0*8(%rsi), %r11 + movq 1*8(%rsi), %r10 + movq -2*8(%rsi, %rdx), %r9 + movq -1*8(%rsi, %rdx), %r8 + movq %r11, 0*8(%rdi) + movq %r10, 1*8(%rdi) + movq %r9, -2*8(%rdi, %rdx) + movq %r8, -1*8(%rdi, %rdx) + jmp 13f + .p2align 4 +9: + cmpq $8, %rdx + jb 10f + /* + * Move data from 8 bytes to 15 bytes. + */ + movq 0*8(%rsi), %r11 + movq -1*8(%rsi, %rdx), %r10 + movq %r11, 0*8(%rdi) + movq %r10, -1*8(%rdi, %rdx) + jmp 13f +10: + cmpq $4, %rdx + jb 11f + /* + * Move data from 4 bytes to 7 bytes. + */ + movl (%rsi), %r11d + movl -4(%rsi, %rdx), %r10d + movl %r11d, (%rdi) + movl %r10d, -4(%rdi, %rdx) + jmp 13f +11: + cmp $2, %rdx + jb 12f + /* + * Move data from 2 bytes to 3 bytes. + */ + movw (%rsi), %r11w + movw -2(%rsi, %rdx), %r10w + movw %r11w, (%rdi) + movw %r10w, -2(%rdi, %rdx) + jmp 13f +12: + cmp $1, %rdx + jb 13f + /* + * Move data for 1 byte. + */ + movb (%rsi), %r11b + movb %r11b, (%rdi) +13: + retq + CFI_ENDPROC +ENDPROC(memmove) diff --git a/arch/x86/lib/memmove_64.c b/arch/x86/lib/memmove_64.c deleted file mode 100644 index 6d0f0ec41b34..000000000000 --- a/arch/x86/lib/memmove_64.c +++ /dev/null @@ -1,192 +0,0 @@ -/* Normally compiler builtins are used, but sometimes the compiler calls out - of line code. Based on asm-i386/string.h. 
- */ -#define _STRING_C -#include -#include - -#undef memmove -void *memmove(void *dest, const void *src, size_t count) -{ - unsigned long d0,d1,d2,d3,d4,d5,d6,d7; - char *ret; - - __asm__ __volatile__( - /* Handle more 32bytes in loop */ - "mov %2, %3\n\t" - "cmp $0x20, %0\n\t" - "jb 1f\n\t" - - /* Decide forward/backward copy mode */ - "cmp %2, %1\n\t" - "jb 2f\n\t" - - /* - * movsq instruction have many startup latency - * so we handle small size by general register. - */ - "cmp $680, %0\n\t" - "jb 3f\n\t" - /* - * movsq instruction is only good for aligned case. - */ - "cmpb %%dil, %%sil\n\t" - "je 4f\n\t" - "3:\n\t" - "sub $0x20, %0\n\t" - /* - * We gobble 32byts forward in each loop. - */ - "5:\n\t" - "sub $0x20, %0\n\t" - "movq 0*8(%1), %4\n\t" - "movq 1*8(%1), %5\n\t" - "movq 2*8(%1), %6\n\t" - "movq 3*8(%1), %7\n\t" - "leaq 4*8(%1), %1\n\t" - - "movq %4, 0*8(%2)\n\t" - "movq %5, 1*8(%2)\n\t" - "movq %6, 2*8(%2)\n\t" - "movq %7, 3*8(%2)\n\t" - "leaq 4*8(%2), %2\n\t" - "jae 5b\n\t" - "addq $0x20, %0\n\t" - "jmp 1f\n\t" - /* - * Handle data forward by movsq. - */ - ".p2align 4\n\t" - "4:\n\t" - "movq %0, %8\n\t" - "movq -8(%1, %0), %4\n\t" - "lea -8(%2, %0), %5\n\t" - "shrq $3, %8\n\t" - "rep movsq\n\t" - "movq %4, (%5)\n\t" - "jmp 13f\n\t" - /* - * Handle data backward by movsq. - */ - ".p2align 4\n\t" - "7:\n\t" - "movq %0, %8\n\t" - "movq (%1), %4\n\t" - "movq %2, %5\n\t" - "leaq -8(%1, %0), %1\n\t" - "leaq -8(%2, %0), %2\n\t" - "shrq $3, %8\n\t" - "std\n\t" - "rep movsq\n\t" - "cld\n\t" - "movq %4, (%5)\n\t" - "jmp 13f\n\t" - - /* - * Start to prepare for backward copy. - */ - ".p2align 4\n\t" - "2:\n\t" - "cmp $680, %0\n\t" - "jb 6f \n\t" - "cmp %%dil, %%sil\n\t" - "je 7b \n\t" - "6:\n\t" - /* - * Calculate copy position to tail. - */ - "addq %0, %1\n\t" - "addq %0, %2\n\t" - "subq $0x20, %0\n\t" - /* - * We gobble 32byts backward in each loop. - */ - "8:\n\t" - "subq $0x20, %0\n\t" - "movq -1*8(%1), %4\n\t" - "movq -2*8(%1), %5\n\t" - "movq -3*8(%1), %6\n\t" - "movq -4*8(%1), %7\n\t" - "leaq -4*8(%1), %1\n\t" - - "movq %4, -1*8(%2)\n\t" - "movq %5, -2*8(%2)\n\t" - "movq %6, -3*8(%2)\n\t" - "movq %7, -4*8(%2)\n\t" - "leaq -4*8(%2), %2\n\t" - "jae 8b\n\t" - /* - * Calculate copy position to head. - */ - "addq $0x20, %0\n\t" - "subq %0, %1\n\t" - "subq %0, %2\n\t" - "1:\n\t" - "cmpq $16, %0\n\t" - "jb 9f\n\t" - /* - * Move data from 16 bytes to 31 bytes. - */ - "movq 0*8(%1), %4\n\t" - "movq 1*8(%1), %5\n\t" - "movq -2*8(%1, %0), %6\n\t" - "movq -1*8(%1, %0), %7\n\t" - "movq %4, 0*8(%2)\n\t" - "movq %5, 1*8(%2)\n\t" - "movq %6, -2*8(%2, %0)\n\t" - "movq %7, -1*8(%2, %0)\n\t" - "jmp 13f\n\t" - ".p2align 4\n\t" - "9:\n\t" - "cmpq $8, %0\n\t" - "jb 10f\n\t" - /* - * Move data from 8 bytes to 15 bytes. - */ - "movq 0*8(%1), %4\n\t" - "movq -1*8(%1, %0), %5\n\t" - "movq %4, 0*8(%2)\n\t" - "movq %5, -1*8(%2, %0)\n\t" - "jmp 13f\n\t" - "10:\n\t" - "cmpq $4, %0\n\t" - "jb 11f\n\t" - /* - * Move data from 4 bytes to 7 bytes. - */ - "movl (%1), %4d\n\t" - "movl -4(%1, %0), %5d\n\t" - "movl %4d, (%2)\n\t" - "movl %5d, -4(%2, %0)\n\t" - "jmp 13f\n\t" - "11:\n\t" - "cmp $2, %0\n\t" - "jb 12f\n\t" - /* - * Move data from 2 bytes to 3 bytes. - */ - "movw (%1), %4w\n\t" - "movw -2(%1, %0), %5w\n\t" - "movw %4w, (%2)\n\t" - "movw %5w, -2(%2, %0)\n\t" - "jmp 13f\n\t" - "12:\n\t" - "cmp $1, %0\n\t" - "jb 13f\n\t" - /* - * Move data for 1 byte. 
- */ - "movb (%1), %4b\n\t" - "movb %4b, (%2)\n\t" - "13:\n\t" - : "=&d" (d0), "=&S" (d1), "=&D" (d2), "=&a" (ret) , - "=r"(d3), "=r"(d4), "=r"(d5), "=r"(d6), "=&c" (d7) - :"0" (count), - "1" (src), - "2" (dest) - :"memory"); - - return ret; - -} -EXPORT_SYMBOL(memmove); From b3d7336db553d318e7ec042eb50a70d307013339 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Fri, 21 Jan 2011 15:29:44 -0800 Subject: [PATCH 040/706] x86: Move llc_shared_map out of cpu_info cpu_info is already with per_cpu, We can take llc_shared_map out of cpu_info, and declare it as per_cpu variable directly. So later referencing could be simple and directly instead of diving to find cpu_info at first. Also could make smp_store_cpu_info() much simple to avoid to do save and restore trick. Signed-off-by: Yinghai Lu Cc: Hans Rosenfeld Cc: Alok N Kataria Cc: Stephen Hemminger Cc: Hans J. Koch Cc: Tejun Heo Cc: Borislav Petkov Cc: Andreas Herrmann Cc: Robert Richter Cc: Suresh Siddha LKML-Reference: <4D3A16E8.5020608@kernel.org> Signed-off-by: Ingo Molnar --- arch/x86/include/asm/processor.h | 4 --- arch/x86/include/asm/smp.h | 7 +++++ arch/x86/kernel/cpu/intel_cacheinfo.c | 4 +-- arch/x86/kernel/cpu/mcheck/mce_amd.c | 7 ++--- arch/x86/kernel/smpboot.c | 38 +++++++-------------------- 5 files changed, 21 insertions(+), 39 deletions(-) diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index 45636cefa186..4c25ab48257b 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -94,10 +94,6 @@ struct cpuinfo_x86 { int x86_cache_alignment; /* In bytes */ int x86_power; unsigned long loops_per_jiffy; -#ifdef CONFIG_SMP - /* cpus sharing the last level cache: */ - cpumask_var_t llc_shared_map; -#endif /* cpuid returned max cores value: */ u16 x86_max_cores; u16 apicid; diff --git a/arch/x86/include/asm/smp.h b/arch/x86/include/asm/smp.h index 4c2f63c7fc1b..3597825a3112 100644 --- a/arch/x86/include/asm/smp.h +++ b/arch/x86/include/asm/smp.h @@ -23,6 +23,8 @@ extern unsigned int num_processors; DECLARE_PER_CPU(cpumask_var_t, cpu_sibling_map); DECLARE_PER_CPU(cpumask_var_t, cpu_core_map); +/* cpus sharing the last level cache: */ +DECLARE_PER_CPU(cpumask_var_t, cpu_llc_shared_map); DECLARE_PER_CPU(u16, cpu_llc_id); DECLARE_PER_CPU(int, cpu_number); @@ -36,6 +38,11 @@ static inline struct cpumask *cpu_core_mask(int cpu) return per_cpu(cpu_core_map, cpu); } +static inline struct cpumask *cpu_llc_shared_mask(int cpu) +{ + return per_cpu(cpu_llc_shared_map, cpu); +} + DECLARE_EARLY_PER_CPU(u16, x86_cpu_to_apicid); DECLARE_EARLY_PER_CPU(u16, x86_bios_cpu_apicid); diff --git a/arch/x86/kernel/cpu/intel_cacheinfo.c b/arch/x86/kernel/cpu/intel_cacheinfo.c index ec2c19a7b8ef..5419a263ebd1 100644 --- a/arch/x86/kernel/cpu/intel_cacheinfo.c +++ b/arch/x86/kernel/cpu/intel_cacheinfo.c @@ -732,11 +732,11 @@ static void __cpuinit cache_shared_cpu_map_setup(unsigned int cpu, int index) struct cpuinfo_x86 *c = &cpu_data(cpu); if ((index == 3) && (c->x86_vendor == X86_VENDOR_AMD)) { - for_each_cpu(i, c->llc_shared_map) { + for_each_cpu(i, cpu_llc_shared_mask(cpu)) { if (!per_cpu(ici_cpuid4_info, i)) continue; this_leaf = CPUID4_INFO_IDX(i, index); - for_each_cpu(sibling, c->llc_shared_map) { + for_each_cpu(sibling, cpu_llc_shared_mask(cpu)) { if (!cpu_online(sibling)) continue; set_bit(sibling, this_leaf->shared_cpu_map); diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd.c b/arch/x86/kernel/cpu/mcheck/mce_amd.c index 5bf2fac52aca..167f97b5596e 100644 --- 
a/arch/x86/kernel/cpu/mcheck/mce_amd.c +++ b/arch/x86/kernel/cpu/mcheck/mce_amd.c @@ -527,15 +527,12 @@ static __cpuinit int threshold_create_bank(unsigned int cpu, unsigned int bank) int i, err = 0; struct threshold_bank *b = NULL; char name[32]; -#ifdef CONFIG_SMP - struct cpuinfo_x86 *c = &cpu_data(cpu); -#endif sprintf(name, "threshold_bank%i", bank); #ifdef CONFIG_SMP if (cpu_data(cpu).cpu_core_id && shared_bank[bank]) { /* symlink */ - i = cpumask_first(c->llc_shared_map); + i = cpumask_first(cpu_llc_shared_mask(cpu)); /* first core not up yet */ if (cpu_data(i).cpu_core_id) @@ -555,7 +552,7 @@ static __cpuinit int threshold_create_bank(unsigned int cpu, unsigned int bank) if (err) goto out; - cpumask_copy(b->cpus, c->llc_shared_map); + cpumask_copy(b->cpus, cpu_llc_shared_mask(cpu)); per_cpu(threshold_banks, cpu)[bank] = b; goto out; diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 0cbe8c0b35ed..d396155f436c 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -130,6 +130,8 @@ EXPORT_PER_CPU_SYMBOL(cpu_sibling_map); DEFINE_PER_CPU(cpumask_var_t, cpu_core_map); EXPORT_PER_CPU_SYMBOL(cpu_core_map); +DEFINE_PER_CPU(cpumask_var_t, cpu_llc_shared_map); + /* Per CPU bogomips and other parameters */ DEFINE_PER_CPU_SHARED_ALIGNED(struct cpuinfo_x86, cpu_info); EXPORT_PER_CPU_SYMBOL(cpu_info); @@ -355,23 +357,6 @@ notrace static void __cpuinit start_secondary(void *unused) cpu_idle(); } -#ifdef CONFIG_CPUMASK_OFFSTACK -/* In this case, llc_shared_map is a pointer to a cpumask. */ -static inline void copy_cpuinfo_x86(struct cpuinfo_x86 *dst, - const struct cpuinfo_x86 *src) -{ - struct cpumask *llc = dst->llc_shared_map; - *dst = *src; - dst->llc_shared_map = llc; -} -#else -static inline void copy_cpuinfo_x86(struct cpuinfo_x86 *dst, - const struct cpuinfo_x86 *src) -{ - *dst = *src; -} -#endif /* CONFIG_CPUMASK_OFFSTACK */ - /* * The bootstrap kernel entry code has set these up. 
Save them for * a given CPU @@ -381,7 +366,7 @@ void __cpuinit smp_store_cpu_info(int id) { struct cpuinfo_x86 *c = &cpu_data(id); - copy_cpuinfo_x86(c, &boot_cpu_data); + *c = boot_cpu_data; c->cpu_index = id; if (id != 0) identify_secondary_cpu(c); @@ -389,15 +374,12 @@ void __cpuinit smp_store_cpu_info(int id) static void __cpuinit link_thread_siblings(int cpu1, int cpu2) { - struct cpuinfo_x86 *c1 = &cpu_data(cpu1); - struct cpuinfo_x86 *c2 = &cpu_data(cpu2); - cpumask_set_cpu(cpu1, cpu_sibling_mask(cpu2)); cpumask_set_cpu(cpu2, cpu_sibling_mask(cpu1)); cpumask_set_cpu(cpu1, cpu_core_mask(cpu2)); cpumask_set_cpu(cpu2, cpu_core_mask(cpu1)); - cpumask_set_cpu(cpu1, c2->llc_shared_map); - cpumask_set_cpu(cpu2, c1->llc_shared_map); + cpumask_set_cpu(cpu1, cpu_llc_shared_mask(cpu2)); + cpumask_set_cpu(cpu2, cpu_llc_shared_mask(cpu1)); } @@ -425,7 +407,7 @@ void __cpuinit set_cpu_sibling_map(int cpu) cpumask_set_cpu(cpu, cpu_sibling_mask(cpu)); } - cpumask_set_cpu(cpu, c->llc_shared_map); + cpumask_set_cpu(cpu, cpu_llc_shared_mask(cpu)); if (__this_cpu_read(cpu_info.x86_max_cores) == 1) { cpumask_copy(cpu_core_mask(cpu), cpu_sibling_mask(cpu)); @@ -436,8 +418,8 @@ void __cpuinit set_cpu_sibling_map(int cpu) for_each_cpu(i, cpu_sibling_setup_mask) { if (per_cpu(cpu_llc_id, cpu) != BAD_APICID && per_cpu(cpu_llc_id, cpu) == per_cpu(cpu_llc_id, i)) { - cpumask_set_cpu(i, c->llc_shared_map); - cpumask_set_cpu(cpu, cpu_data(i).llc_shared_map); + cpumask_set_cpu(i, cpu_llc_shared_mask(cpu)); + cpumask_set_cpu(cpu, cpu_llc_shared_mask(i)); } if (c->phys_proc_id == cpu_data(i).phys_proc_id) { cpumask_set_cpu(i, cpu_core_mask(cpu)); @@ -476,7 +458,7 @@ const struct cpumask *cpu_coregroup_mask(int cpu) !(cpu_has(c, X86_FEATURE_AMD_DCM))) return cpu_core_mask(cpu); else - return c->llc_shared_map; + return cpu_llc_shared_mask(cpu); } static void impress_friends(void) @@ -1103,7 +1085,7 @@ void __init native_smp_prepare_cpus(unsigned int max_cpus) for_each_possible_cpu(i) { zalloc_cpumask_var(&per_cpu(cpu_sibling_map, i), GFP_KERNEL); zalloc_cpumask_var(&per_cpu(cpu_core_map, i), GFP_KERNEL); - zalloc_cpumask_var(&cpu_data(i).llc_shared_map, GFP_KERNEL); + zalloc_cpumask_var(&per_cpu(cpu_llc_shared_map, i), GFP_KERNEL); } set_cpu_sibling_map(0); From 792363d2beceb1c7d865e517fa9939c8b8c1442a Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Fri, 21 Jan 2011 15:29:54 -0800 Subject: [PATCH 041/706] x86: Don't copy per_cpu cpuinfo for BSP two times smp_store_cpu_info(0) will do that. 
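(As an illustrative aside, simplified from the smpboot.c code shown in
the previous patch and not the exact kernel source: smp_store_cpu_info()
now begins with a full structure copy, which is exactly what the removed
memcpy() used to do.)

	void smp_store_cpu_info(int id)
	{
		struct cpuinfo_x86 *c = &cpu_data(id);

		*c = boot_cpu_data;	/* full copy of the boot CPU's data */
		c->cpu_index = id;
		if (id != 0)
			identify_secondary_cpu(c);
	}

So the memcpy(__this_cpu_ptr(&cpu_info), &boot_cpu_data, sizeof(cpu_info))
right before the call copied the same bytes twice.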
Signed-off-by: Yinghai Lu Cc: Suresh Siddha Cc: Tejun Heo Cc: Borislav Petkov LKML-Reference: <4D3A16F2.5090902@kernel.org> Signed-off-by: Ingo Molnar --- arch/x86/kernel/smpboot.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index d396155f436c..a7a3d1acc632 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -1071,13 +1071,13 @@ void __init native_smp_prepare_cpus(unsigned int max_cpus) preempt_disable(); smp_cpu_index_default(); - memcpy(__this_cpu_ptr(&cpu_info), &boot_cpu_data, sizeof(cpu_info)); - cpumask_copy(cpu_callin_mask, cpumask_of(0)); - mb(); + /* * Setup boot CPU information */ smp_store_cpu_info(0); /* Final full version of the data */ + cpumask_copy(cpu_callin_mask, cpumask_of(0)); + mb(); #ifdef CONFIG_X86_32 boot_cpu_logical_apicid = logical_smp_processor_id(); #endif From 6d5ab2932a21ea54406ab95c43ecff90a3eddfda Mon Sep 17 00:00:00 2001 From: Paul Turner Date: Fri, 21 Jan 2011 20:45:01 -0800 Subject: [PATCH 042/706] sched: Simplify update_cfs_shares parameters Re-visiting this: Since update_cfs_shares will now only ever re-weight an entity that is a relative parent of the current entity in enqueue_entity; we can safely issue the account_entity_enqueue relative to that cfs_rq and avoid the requirement for special handling of the enqueue case in update_cfs_shares. Signed-off-by: Paul Turner Signed-off-by: Peter Zijlstra LKML-Reference: <20110122044851.915214637@google.com> Signed-off-by: Ingo Molnar --- kernel/sched.c | 2 +- kernel/sched_fair.c | 30 ++++++++++++++---------------- 2 files changed, 15 insertions(+), 17 deletions(-) diff --git a/kernel/sched.c b/kernel/sched.c index 18d38e4ec7ba..e0fa3ff7f194 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -8510,7 +8510,7 @@ int sched_group_set_shares(struct task_group *tg, unsigned long shares) /* Propagate contribution to hierarchy */ raw_spin_lock_irqsave(&rq->lock, flags); for_each_sched_entity(se) - update_cfs_shares(group_cfs_rq(se), 0); + update_cfs_shares(group_cfs_rq(se)); raw_spin_unlock_irqrestore(&rq->lock, flags); } diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 0c26e2df450e..0c550c841eee 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -540,7 +540,7 @@ static u64 sched_vslice(struct cfs_rq *cfs_rq, struct sched_entity *se) } static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update); -static void update_cfs_shares(struct cfs_rq *cfs_rq, long weight_delta); +static void update_cfs_shares(struct cfs_rq *cfs_rq); /* * Update the current task's runtime statistics. 
Skip current tasks that @@ -763,16 +763,15 @@ static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update) list_del_leaf_cfs_rq(cfs_rq); } -static long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg, - long weight_delta) +static long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg) { long load_weight, load, shares; - load = cfs_rq->load.weight + weight_delta; + load = cfs_rq->load.weight; load_weight = atomic_read(&tg->load_weight); - load_weight -= cfs_rq->load_contribution; load_weight += load; + load_weight -= cfs_rq->load_contribution; shares = (tg->shares * load); if (load_weight) @@ -790,7 +789,7 @@ static void update_entity_shares_tick(struct cfs_rq *cfs_rq) { if (cfs_rq->load_unacc_exec_time > sysctl_sched_shares_window) { update_cfs_load(cfs_rq, 0); - update_cfs_shares(cfs_rq, 0); + update_cfs_shares(cfs_rq); } } # else /* CONFIG_SMP */ @@ -798,8 +797,7 @@ static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update) { } -static inline long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg, - long weight_delta) +static inline long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg) { return tg->shares; } @@ -824,7 +822,7 @@ static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, account_entity_enqueue(cfs_rq, se); } -static void update_cfs_shares(struct cfs_rq *cfs_rq, long weight_delta) +static void update_cfs_shares(struct cfs_rq *cfs_rq) { struct task_group *tg; struct sched_entity *se; @@ -838,7 +836,7 @@ static void update_cfs_shares(struct cfs_rq *cfs_rq, long weight_delta) if (likely(se->load.weight == tg->shares)) return; #endif - shares = calc_cfs_shares(cfs_rq, tg, weight_delta); + shares = calc_cfs_shares(cfs_rq, tg); reweight_entity(cfs_rq_of(se), se, shares); } @@ -847,7 +845,7 @@ static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update) { } -static inline void update_cfs_shares(struct cfs_rq *cfs_rq, long weight_delta) +static inline void update_cfs_shares(struct cfs_rq *cfs_rq) { } @@ -978,8 +976,8 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) */ update_curr(cfs_rq); update_cfs_load(cfs_rq, 0); - update_cfs_shares(cfs_rq, se->load.weight); account_entity_enqueue(cfs_rq, se); + update_cfs_shares(cfs_rq); if (flags & ENQUEUE_WAKEUP) { place_entity(cfs_rq, se, 0); @@ -1041,7 +1039,7 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) update_cfs_load(cfs_rq, 0); account_entity_dequeue(cfs_rq, se); update_min_vruntime(cfs_rq); - update_cfs_shares(cfs_rq, 0); + update_cfs_shares(cfs_rq); /* * Normalize the entity after updating the min_vruntime because the @@ -1282,7 +1280,7 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) struct cfs_rq *cfs_rq = cfs_rq_of(se); update_cfs_load(cfs_rq, 0); - update_cfs_shares(cfs_rq, 0); + update_cfs_shares(cfs_rq); } hrtick_update(rq); @@ -1312,7 +1310,7 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) struct cfs_rq *cfs_rq = cfs_rq_of(se); update_cfs_load(cfs_rq, 0); - update_cfs_shares(cfs_rq, 0); + update_cfs_shares(cfs_rq); } hrtick_update(rq); @@ -2123,7 +2121,7 @@ static int update_shares_cpu(struct task_group *tg, int cpu) * We need to update shares after updating tg->load_weight in * order to adjust the weight of groups with long running tasks. 
 */
-	update_cfs_shares(cfs_rq, 0);
+	update_cfs_shares(cfs_rq);

 	raw_spin_unlock_irqrestore(&rq->lock, flags);

From f07333bf6ee66d9b49286cec4371cf375e745b7a Mon Sep 17 00:00:00 2001
From: Paul Turner
Date: Fri, 21 Jan 2011 20:45:03 -0800
Subject: [PATCH 043/706] sched: Avoid expensive initial update_cfs_load()

Since cfs->{load_stamp,load_last} are zero-initialized, the initial load
update will consider the delta to be 'since the beginning of time'.

This results in a lot of pointless divisions to bring this large period
to be within the sysctl_sched_shares_window.

Fix this by initializing load_stamp to be 1 at cfs_rq initialization;
this allows for an initial load_stamp > load_last, which then lets
standard idle truncation proceed.

We avoid spinning (and slightly improve consistency) by fixing delta to
be [period - 1] in this path, resulting in a slightly more predictable
shares ramp. (Previously the amount of idle time preserved by the
overflow would range between [period/2,period-1].)

Signed-off-by: Paul Turner
Signed-off-by: Peter Zijlstra
LKML-Reference: <20110122044852.102126037@google.com>
Signed-off-by: Ingo Molnar
---
 kernel/sched.c      | 2 ++
 kernel/sched_fair.c | 1 +
 2 files changed, 3 insertions(+)

diff --git a/kernel/sched.c b/kernel/sched.c
index e0fa3ff7f194..6820b5b3a969 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -7796,6 +7796,8 @@ static void init_cfs_rq(struct cfs_rq *cfs_rq, struct rq *rq)
 	INIT_LIST_HEAD(&cfs_rq->tasks);
 #ifdef CONFIG_FAIR_GROUP_SCHED
 	cfs_rq->rq = rq;
+	/* allow initial update_cfs_load() to truncate */
+	cfs_rq->load_stamp = 1;
 #endif
 	cfs_rq->min_vruntime = (u64)(-(1LL << 20));
 }
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index 0c550c841eee..4cbc9121094c 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -733,6 +733,7 @@ static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update)
 	    now - cfs_rq->load_last > 4 * period) {
 		cfs_rq->load_period = 0;
 		cfs_rq->load_avg = 0;
+		delta = period - 1;
 	}

 	cfs_rq->load_stamp = now;

From 4dd53d891ca46dcc1fde0376a33540d3fd83cb9a Mon Sep 17 00:00:00 2001
From: Venkatesh Pallipadi
Date: Tue, 21 Dec 2010 17:09:00 -0800
Subject: [PATCH 044/706] softirqs: Free up pf flag PF_KSOFTIRQD

Cleanup patch, freeing up PF_KSOFTIRQD and using a per_cpu ksoftirqd
pointer instead, as suggested by Eric Dumazet.

Tested-by: Shaun Ruffell
Signed-off-by: Venkatesh Pallipadi
Signed-off-by: Peter Zijlstra
LKML-Reference: <1292980144-28796-2-git-send-email-venki@google.com>
Signed-off-by: Ingo Molnar
---
 include/linux/interrupt.h | 7 +++++++
 include/linux/sched.h     | 1 -
 kernel/sched.c            | 2 +-
 kernel/softirq.c          | 3 +--
 4 files changed, 9 insertions(+), 4 deletions(-)

diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h
index 55e0d4253e49..a1382b9b5813 100644
--- a/include/linux/interrupt.h
+++ b/include/linux/interrupt.h
@@ -426,6 +426,13 @@ extern void raise_softirq(unsigned int nr);
  */
 DECLARE_PER_CPU(struct list_head [NR_SOFTIRQS], softirq_work_list);

+DECLARE_PER_CPU(struct task_struct *, ksoftirqd);
+
+static inline struct task_struct *this_cpu_ksoftirqd(void)
+{
+	return this_cpu_read(ksoftirqd);
+}
+
 /* Try to send a softirq to a remote cpu.  If this cannot be done, the
  * work will be queued to the local cpu.
*/ diff --git a/include/linux/sched.h b/include/linux/sched.h index d747f948b34e..af6e15fbfb78 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1715,7 +1715,6 @@ extern void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t * /* * Per process flags */ -#define PF_KSOFTIRQD 0x00000001 /* I am ksoftirqd */ #define PF_STARTING 0x00000002 /* being created */ #define PF_EXITING 0x00000004 /* getting shut down */ #define PF_EXITPIDONE 0x00000008 /* pi exit done on shut down */ diff --git a/kernel/sched.c b/kernel/sched.c index 6820b5b3a969..8b89b3bba565 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -1880,7 +1880,7 @@ void account_system_vtime(struct task_struct *curr) */ if (hardirq_count()) __this_cpu_add(cpu_hardirq_time, delta); - else if (in_serving_softirq() && !(curr->flags & PF_KSOFTIRQD)) + else if (in_serving_softirq() && curr != this_cpu_ksoftirqd()) __this_cpu_add(cpu_softirq_time, delta); irq_time_write_end(); diff --git a/kernel/softirq.c b/kernel/softirq.c index 68eb5efec388..0cee50487629 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -54,7 +54,7 @@ EXPORT_SYMBOL(irq_stat); static struct softirq_action softirq_vec[NR_SOFTIRQS] __cacheline_aligned_in_smp; -static DEFINE_PER_CPU(struct task_struct *, ksoftirqd); +DEFINE_PER_CPU(struct task_struct *, ksoftirqd); char *softirq_to_name[NR_SOFTIRQS] = { "HI", "TIMER", "NET_TX", "NET_RX", "BLOCK", "BLOCK_IOPOLL", @@ -721,7 +721,6 @@ static int run_ksoftirqd(void * __bind_cpu) { set_current_state(TASK_INTERRUPTIBLE); - current->flags |= PF_KSOFTIRQD; while (!kthread_should_stop()) { preempt_disable(); if (!local_softirq_pending()) { From a1dabb6bfffccb897eff3e1d102dacf2a4bedf3b Mon Sep 17 00:00:00 2001 From: Venkatesh Pallipadi Date: Tue, 21 Dec 2010 17:09:01 -0800 Subject: [PATCH 045/706] time: Add nsecs_to_cputime64 interface for asm-generic Add nsecs_to_cputime64 interface. This is used in following patches that updates cpu irq stat based on ns granularity info in IRQ_TIME_ACCOUNTING. 
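(Illustration, not the kernel implementation: conceptually the conversion
is one division, since a jiffy lasts NSEC_PER_SEC/HZ nanoseconds. The
sketch below assumes NSEC_PER_SEC % HZ == 0, e.g. HZ=250; the real code
in this patch also handles the non-dividing HZ values without overflow.)

	static u64 nsecs_to_jiffies64_sketch(u64 n)
	{
		return n / (NSEC_PER_SEC / HZ);	/* divisor = ns per jiffy */
	}

Since the asm-generic cputime64_t is jiffies64 based, nsecs_to_cputime64()
can simply map onto nsecs_to_jiffies64().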
Tested-by: Shaun Ruffell Signed-off-by: Venkatesh Pallipadi Signed-off-by: Peter Zijlstra LKML-Reference: <1292980144-28796-3-git-send-email-venki@google.com> Signed-off-by: Ingo Molnar --- include/asm-generic/cputime.h | 3 +++ include/linux/jiffies.h | 1 + kernel/time.c | 23 +++++++++++++++++++++-- 3 files changed, 25 insertions(+), 2 deletions(-) diff --git a/include/asm-generic/cputime.h b/include/asm-generic/cputime.h index 2bcc5c7c22a6..61e03dd7939e 100644 --- a/include/asm-generic/cputime.h +++ b/include/asm-generic/cputime.h @@ -30,6 +30,9 @@ typedef u64 cputime64_t; #define cputime64_to_jiffies64(__ct) (__ct) #define jiffies64_to_cputime64(__jif) (__jif) #define cputime_to_cputime64(__ct) ((u64) __ct) +#define cputime64_gt(__a, __b) ((__a) > (__b)) + +#define nsecs_to_cputime64(__ct) nsecs_to_jiffies64(__ct) /* diff --git a/include/linux/jiffies.h b/include/linux/jiffies.h index 6811f4bfc6e7..922aa313c9f9 100644 --- a/include/linux/jiffies.h +++ b/include/linux/jiffies.h @@ -307,6 +307,7 @@ extern clock_t jiffies_to_clock_t(long x); extern unsigned long clock_t_to_jiffies(unsigned long x); extern u64 jiffies_64_to_clock_t(u64 x); extern u64 nsec_to_clock_t(u64 x); +extern u64 nsecs_to_jiffies64(u64 n); extern unsigned long nsecs_to_jiffies(u64 n); #define TIMESTAMP_SIZE 30 diff --git a/kernel/time.c b/kernel/time.c index 32174359576f..55337a816b20 100644 --- a/kernel/time.c +++ b/kernel/time.c @@ -645,7 +645,7 @@ u64 nsec_to_clock_t(u64 x) } /** - * nsecs_to_jiffies - Convert nsecs in u64 to jiffies + * nsecs_to_jiffies64 - Convert nsecs in u64 to jiffies64 * * @n: nsecs in u64 * @@ -657,7 +657,7 @@ u64 nsec_to_clock_t(u64 x) * NSEC_PER_SEC = 10^9 = (5^9 * 2^9) = (1953125 * 512) * ULLONG_MAX ns = 18446744073.709551615 secs = about 584 years */ -unsigned long nsecs_to_jiffies(u64 n) +u64 nsecs_to_jiffies64(u64 n) { #if (NSEC_PER_SEC % HZ) == 0 /* Common case, HZ = 100, 128, 200, 250, 256, 500, 512, 1000 etc. */ @@ -674,6 +674,25 @@ unsigned long nsecs_to_jiffies(u64 n) #endif } + +/** + * nsecs_to_jiffies - Convert nsecs in u64 to jiffies + * + * @n: nsecs in u64 + * + * Unlike {m,u}secs_to_jiffies, type of input is not unsigned int but u64. + * And this doesn't return MAX_JIFFY_OFFSET since this function is designed + * for scheduler, not for use in device drivers to calculate timeout value. + * + * note: + * NSEC_PER_SEC = 10^9 = (5^9 * 2^9) = (1953125 * 512) + * ULLONG_MAX ns = 18446744073.709551615 secs = about 584 years + */ +unsigned long nsecs_to_jiffies(u64 n) +{ + return (unsigned long)nsecs_to_jiffies64(n); +} + #if (BITS_PER_LONG < 64) u64 get_jiffies_64(void) { From 70a89a6620f658d47a1488515bada4b8ee6291d8 Mon Sep 17 00:00:00 2001 From: Venkatesh Pallipadi Date: Tue, 21 Dec 2010 17:09:02 -0800 Subject: [PATCH 046/706] sched: Refactor account_system_time separating id-update Refactor account_system_time, to separate out the logic of identifying the update needed and code that does actual update. This is used by following patch for IRQ_TIME_ACCOUNTING, which has different identification logic and same update logic. 
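(The shape of the split, as a sketch with abridged bodies; the real code
is in the diff below. Guest time handling is omitted here.)

	/* shared update: per-task bookkeeping plus one cpustat field */
	static inline void __account_system_time(struct task_struct *p,
			cputime_t cputime, cputime_t cputime_scaled,
			cputime64_t *target_cputime64);

	void account_system_time(struct task_struct *p, int hardirq_offset,
				 cputime_t cputime, cputime_t cputime_scaled)
	{
		struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
		cputime64_t *target_cputime64;

		/* identification: pick the cpustat field to charge */
		if (hardirq_count() - hardirq_offset)
			target_cputime64 = &cpustat->irq;
		else if (in_serving_softirq())
			target_cputime64 = &cpustat->softirq;
		else
			target_cputime64 = &cpustat->system;

		/* update: one shared path */
		__account_system_time(p, cputime, cputime_scaled,
				      target_cputime64);
	}

The IRQ_TIME_ACCOUNTING patch that follows reuses __account_system_time()
with its own identification step.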
Tested-by: Shaun Ruffell Signed-off-by: Venkatesh Pallipadi Signed-off-by: Peter Zijlstra LKML-Reference: <1292980144-28796-4-git-send-email-venki@google.com> Signed-off-by: Ingo Molnar --- kernel/sched.c | 46 +++++++++++++++++++++++++++++++--------------- 1 file changed, 31 insertions(+), 15 deletions(-) diff --git a/kernel/sched.c b/kernel/sched.c index 8b89b3bba565..e3fa92106ed7 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -3567,6 +3567,32 @@ static void account_guest_time(struct task_struct *p, cputime_t cputime, } } +/* + * Account system cpu time to a process and desired cpustat field + * @p: the process that the cpu time gets accounted to + * @cputime: the cpu time spent in kernel space since the last update + * @cputime_scaled: cputime scaled by cpu frequency + * @target_cputime64: pointer to cpustat field that has to be updated + */ +static inline +void __account_system_time(struct task_struct *p, cputime_t cputime, + cputime_t cputime_scaled, cputime64_t *target_cputime64) +{ + cputime64_t tmp = cputime_to_cputime64(cputime); + + /* Add system time to process. */ + p->stime = cputime_add(p->stime, cputime); + p->stimescaled = cputime_add(p->stimescaled, cputime_scaled); + account_group_system_time(p, cputime); + + /* Add system time to cpustat. */ + *target_cputime64 = cputime64_add(*target_cputime64, tmp); + cpuacct_update_stats(p, CPUACCT_STAT_SYSTEM, cputime); + + /* Account for system time used */ + acct_update_integrals(p); +} + /* * Account system cpu time to a process. * @p: the process that the cpu time gets accounted to @@ -3578,31 +3604,21 @@ void account_system_time(struct task_struct *p, int hardirq_offset, cputime_t cputime, cputime_t cputime_scaled) { struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; - cputime64_t tmp; + cputime64_t *target_cputime64; if ((p->flags & PF_VCPU) && (irq_count() - hardirq_offset == 0)) { account_guest_time(p, cputime, cputime_scaled); return; } - /* Add system time to process. */ - p->stime = cputime_add(p->stime, cputime); - p->stimescaled = cputime_add(p->stimescaled, cputime_scaled); - account_group_system_time(p, cputime); - - /* Add system time to cpustat. */ - tmp = cputime_to_cputime64(cputime); if (hardirq_count() - hardirq_offset) - cpustat->irq = cputime64_add(cpustat->irq, tmp); + target_cputime64 = &cpustat->irq; else if (in_serving_softirq()) - cpustat->softirq = cputime64_add(cpustat->softirq, tmp); + target_cputime64 = &cpustat->softirq; else - cpustat->system = cputime64_add(cpustat->system, tmp); + target_cputime64 = &cpustat->system; - cpuacct_update_stats(p, CPUACCT_STAT_SYSTEM, cputime); - - /* Account for system time used */ - acct_update_integrals(p); + __account_system_time(p, cputime, cputime_scaled, target_cputime64); } /* From abb74cefa9c682fb38ba86c17ca3c86fed6cc464 Mon Sep 17 00:00:00 2001 From: Venkatesh Pallipadi Date: Tue, 21 Dec 2010 17:09:03 -0800 Subject: [PATCH 047/706] sched: Export ns irqtimes through /proc/stat CONFIG_IRQ_TIME_ACCOUNTING adds ns granularity irq time on each CPU. This info is already used in scheduler to do proper task chargeback (earlier patches). This patch retro-fits this ns granularity hardirq and softirq information to /proc/stat irq and softirq fields. The update is still done on timer tick, where we look at accumulated ns hardirq/softirq time and account the tick to user/system/irq/hardirq/guest accordingly. No new interface added. Earlier versions looked at adding this as new fields in some /proc files. 
This one seems to be the best in terms of impact to existing apps, even though it has somewhat more kernel code than earlier versions. Tested-by: Shaun Ruffell Signed-off-by: Venkatesh Pallipadi Signed-off-by: Peter Zijlstra LKML-Reference: <1292980144-28796-5-git-send-email-venki@google.com> Signed-off-by: Ingo Molnar --- kernel/sched.c | 102 +++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 102 insertions(+) diff --git a/kernel/sched.c b/kernel/sched.c index e3fa92106ed7..2a3c9799d76b 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -1920,8 +1920,40 @@ static void update_rq_clock_task(struct rq *rq, s64 delta) sched_rt_avg_update(rq, irq_delta); } +static int irqtime_account_hi_update(void) +{ + struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; + unsigned long flags; + u64 latest_ns; + int ret = 0; + + local_irq_save(flags); + latest_ns = this_cpu_read(cpu_hardirq_time); + if (cputime64_gt(nsecs_to_cputime64(latest_ns), cpustat->irq)) + ret = 1; + local_irq_restore(flags); + return ret; +} + +static int irqtime_account_si_update(void) +{ + struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; + unsigned long flags; + u64 latest_ns; + int ret = 0; + + local_irq_save(flags); + latest_ns = this_cpu_read(cpu_softirq_time); + if (cputime64_gt(nsecs_to_cputime64(latest_ns), cpustat->softirq)) + ret = 1; + local_irq_restore(flags); + return ret; +} + #else /* CONFIG_IRQ_TIME_ACCOUNTING */ +#define sched_clock_irqtime (0) + static void update_rq_clock_task(struct rq *rq, s64 delta) { rq->clock_task += delta; @@ -3621,6 +3653,65 @@ void account_system_time(struct task_struct *p, int hardirq_offset, __account_system_time(p, cputime, cputime_scaled, target_cputime64); } +#ifdef CONFIG_IRQ_TIME_ACCOUNTING +/* + * Account a tick to a process and cpustat + * @p: the process that the cpu time gets accounted to + * @user_tick: is the tick from userspace + * @rq: the pointer to rq + * + * Tick demultiplexing follows the order + * - pending hardirq update + * - pending softirq update + * - user_time + * - idle_time + * - system time + * - check for guest_time + * - else account as system_time + * + * Check for hardirq is done both for system and user time as there is + * no timer going off while we are on hardirq and hence we may never get an + * opportunity to update it solely in system time. + * p->stime and friends are only updated on system time and not on irq + * softirq as those do not count in task exec_runtime any more. 
+ */ +static void irqtime_account_process_tick(struct task_struct *p, int user_tick, + struct rq *rq) +{ + cputime_t one_jiffy_scaled = cputime_to_scaled(cputime_one_jiffy); + cputime64_t tmp = cputime_to_cputime64(cputime_one_jiffy); + struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; + + if (irqtime_account_hi_update()) { + cpustat->irq = cputime64_add(cpustat->irq, tmp); + } else if (irqtime_account_si_update()) { + cpustat->softirq = cputime64_add(cpustat->softirq, tmp); + } else if (user_tick) { + account_user_time(p, cputime_one_jiffy, one_jiffy_scaled); + } else if (p == rq->idle) { + account_idle_time(cputime_one_jiffy); + } else if (p->flags & PF_VCPU) { /* System time or guest time */ + account_guest_time(p, cputime_one_jiffy, one_jiffy_scaled); + } else { + __account_system_time(p, cputime_one_jiffy, one_jiffy_scaled, + &cpustat->system); + } +} + +static void irqtime_account_idle_ticks(int ticks) +{ + int i; + struct rq *rq = this_rq(); + + for (i = 0; i < ticks; i++) + irqtime_account_process_tick(current, 0, rq); +} +#else +static void irqtime_account_idle_ticks(int ticks) {} +static void irqtime_account_process_tick(struct task_struct *p, int user_tick, + struct rq *rq) {} +#endif + /* * Account for involuntary wait time. * @steal: the cpu time spent in involuntary wait @@ -3661,6 +3752,11 @@ void account_process_tick(struct task_struct *p, int user_tick) cputime_t one_jiffy_scaled = cputime_to_scaled(cputime_one_jiffy); struct rq *rq = this_rq(); + if (sched_clock_irqtime) { + irqtime_account_process_tick(p, user_tick, rq); + return; + } + if (user_tick) account_user_time(p, cputime_one_jiffy, one_jiffy_scaled); else if ((p != rq->idle) || (irq_count() != HARDIRQ_OFFSET)) @@ -3686,6 +3782,12 @@ void account_steal_ticks(unsigned long ticks) */ void account_idle_ticks(unsigned long ticks) { + + if (sched_clock_irqtime) { + irqtime_account_idle_ticks(ticks); + return; + } + account_idle_time(jiffies_to_cputime(ticks)); } From 414bee9ba613adb3804965e2d84db32d0599f9c6 Mon Sep 17 00:00:00 2001 From: Venkatesh Pallipadi Date: Tue, 21 Dec 2010 17:09:04 -0800 Subject: [PATCH 048/706] softirqs: Account ksoftirqd time as cpustat softirq softirq time in ksoftirqd context is not accounted in ns granularity per cpu softirq stats, as we want that to be a part of ksoftirqd exec_runtime. Accounting them as softirq on /proc/stat separately. Tested-by: Shaun Ruffell Signed-off-by: Venkatesh Pallipadi Signed-off-by: Peter Zijlstra LKML-Reference: <1292980144-28796-6-git-send-email-venki@google.com> Signed-off-by: Ingo Molnar --- kernel/sched.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/kernel/sched.c b/kernel/sched.c index 2a3c9799d76b..8b718b59b09f 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -3686,6 +3686,14 @@ static void irqtime_account_process_tick(struct task_struct *p, int user_tick, cpustat->irq = cputime64_add(cpustat->irq, tmp); } else if (irqtime_account_si_update()) { cpustat->softirq = cputime64_add(cpustat->softirq, tmp); + } else if (this_cpu_ksoftirqd() == p) { + /* + * ksoftirqd time do not get accounted in cpu_softirq_time. + * So, we have to handle it separately here. + * Also, p->stime needs to be updated for ksoftirqd. 
+ */ + __account_system_time(p, cputime_one_jiffy, one_jiffy_scaled, + &cpustat->softirq); } else if (user_tick) { account_user_time(p, cputime_one_jiffy, one_jiffy_scaled); } else if (p == rq->idle) { From a8941d7ec81678fb69aea7183338175f112f3e0d Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 25 Jan 2011 16:30:03 +0100 Subject: [PATCH 049/706] sched: Simplify the idle scheduling class Since commit 48c5ccae88dcd (sched: Simplify cpu-hot-unplug task migration) this should no longer happen, so remove the code. Signed-off-by: Peter Zijlstra LKML-Reference: Signed-off-by: Ingo Molnar --- kernel/sched_idletask.c | 23 ++++------------------- 1 file changed, 4 insertions(+), 19 deletions(-) diff --git a/kernel/sched_idletask.c b/kernel/sched_idletask.c index 9fa0f402c87c..41eb62a0808b 100644 --- a/kernel/sched_idletask.c +++ b/kernel/sched_idletask.c @@ -52,31 +52,16 @@ static void set_curr_task_idle(struct rq *rq) { } -static void switched_to_idle(struct rq *rq, struct task_struct *p, - int running) +static void +switched_to_idle(struct rq *rq, struct task_struct *p, int running) { - /* Can this actually happen?? */ - if (running) - resched_task(rq->curr); - else - check_preempt_curr(rq, p, 0); + BUG(); } static void prio_changed_idle(struct rq *rq, struct task_struct *p, int oldprio, int running) { - /* This can happen for hot plug CPUS */ - - /* - * Reschedule if we are currently running on this runqueue and - * our priority decreased, or if we are not currently running on - * this runqueue and our priority is higher than the current's - */ - if (running) { - if (p->prio > oldprio) - resched_task(rq->curr); - } else - check_preempt_curr(rq, p, 0); + BUG(); } static unsigned int get_rr_interval_idle(struct rq *rq, struct task_struct *task) From da7a735e51f9622eb3e1672594d4a41da01d7e4f Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 17 Jan 2011 17:03:27 +0100 Subject: [PATCH 050/706] sched: Fix switch_from_fair() When a task is taken out of the fair class we must ensure the vruntime is properly normalized because when we put it back in it will assume to be normalized. The case that goes wrong is when changing away from the fair class while sleeping. Sleeping tasks have non-normalized vruntime in order to make sleeper-fairness work. So treat the switch away from fair as a wakeup and preserve the relative vruntime. Also update sysrq-n to call the ->switch_{to,from} methods. 
Reported-by: Onkalo Samu Signed-off-by: Peter Zijlstra LKML-Reference: Signed-off-by: Ingo Molnar --- include/linux/sched.h | 8 +++----- kernel/sched.c | 25 +++++++++++++----------- kernel/sched_fair.c | 42 +++++++++++++++++++++++++++++++++++------ kernel/sched_idletask.c | 7 +++---- kernel/sched_rt.c | 19 ++++++++++--------- kernel/sched_stoptask.c | 7 +++---- 6 files changed, 69 insertions(+), 39 deletions(-) diff --git a/include/linux/sched.h b/include/linux/sched.h index af6e15fbfb78..0542774914d4 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1084,12 +1084,10 @@ struct sched_class { void (*task_tick) (struct rq *rq, struct task_struct *p, int queued); void (*task_fork) (struct task_struct *p); - void (*switched_from) (struct rq *this_rq, struct task_struct *task, - int running); - void (*switched_to) (struct rq *this_rq, struct task_struct *task, - int running); + void (*switched_from) (struct rq *this_rq, struct task_struct *task); + void (*switched_to) (struct rq *this_rq, struct task_struct *task); void (*prio_changed) (struct rq *this_rq, struct task_struct *task, - int oldprio, int running); + int oldprio); unsigned int (*get_rr_interval) (struct rq *rq, struct task_struct *task); diff --git a/kernel/sched.c b/kernel/sched.c index 8b718b59b09f..78fa75394011 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -2057,14 +2057,14 @@ inline int task_curr(const struct task_struct *p) static inline void check_class_changed(struct rq *rq, struct task_struct *p, const struct sched_class *prev_class, - int oldprio, int running) + int oldprio) { if (prev_class != p->sched_class) { if (prev_class->switched_from) - prev_class->switched_from(rq, p, running); - p->sched_class->switched_to(rq, p, running); - } else - p->sched_class->prio_changed(rq, p, oldprio, running); + prev_class->switched_from(rq, p); + p->sched_class->switched_to(rq, p); + } else if (oldprio != p->prio) + p->sched_class->prio_changed(rq, p, oldprio); } static void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags) @@ -2598,6 +2598,7 @@ static void __sched_fork(struct task_struct *p) p->se.sum_exec_runtime = 0; p->se.prev_sum_exec_runtime = 0; p->se.nr_migrations = 0; + p->se.vruntime = 0; #ifdef CONFIG_SCHEDSTATS memset(&p->se.statistics, 0, sizeof(p->se.statistics)); @@ -4696,11 +4697,10 @@ void rt_mutex_setprio(struct task_struct *p, int prio) if (running) p->sched_class->set_curr_task(rq); - if (on_rq) { + if (on_rq) enqueue_task(rq, p, oldprio < prio ? 
ENQUEUE_HEAD : 0); - check_class_changed(rq, p, prev_class, oldprio, running); - } + check_class_changed(rq, p, prev_class, oldprio); task_rq_unlock(rq, &flags); } @@ -5028,11 +5028,10 @@ recheck: if (running) p->sched_class->set_curr_task(rq); - if (on_rq) { + if (on_rq) activate_task(rq, p, 0); - check_class_changed(rq, p, prev_class, oldprio, running); - } + check_class_changed(rq, p, prev_class, oldprio); __task_rq_unlock(rq); raw_spin_unlock_irqrestore(&p->pi_lock, flags); @@ -8237,6 +8236,8 @@ EXPORT_SYMBOL(__might_sleep); #ifdef CONFIG_MAGIC_SYSRQ static void normalize_task(struct rq *rq, struct task_struct *p) { + const struct sched_class *prev_class = p->sched_class; + int old_prio = p->prio; int on_rq; on_rq = p->se.on_rq; @@ -8247,6 +8248,8 @@ static void normalize_task(struct rq *rq, struct task_struct *p) activate_task(rq, p, 0); resched_task(rq->curr); } + + check_class_changed(rq, p, prev_class, old_prio); } void normalize_rt_tasks(void) diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 4cbc9121094c..55040f3938d8 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -4078,33 +4078,62 @@ static void task_fork_fair(struct task_struct *p) * Priority of the task has changed. Check to see if we preempt * the current task. */ -static void prio_changed_fair(struct rq *rq, struct task_struct *p, - int oldprio, int running) +static void +prio_changed_fair(struct rq *rq, struct task_struct *p, int oldprio) { + if (!p->se.on_rq) + return; + /* * Reschedule if we are currently running on this runqueue and * our priority decreased, or if we are not currently running on * this runqueue and our priority is higher than the current's */ - if (running) { + if (rq->curr == p) { if (p->prio > oldprio) resched_task(rq->curr); } else check_preempt_curr(rq, p, 0); } +static void switched_from_fair(struct rq *rq, struct task_struct *p) +{ + struct sched_entity *se = &p->se; + struct cfs_rq *cfs_rq = cfs_rq_of(se); + + /* + * Ensure the task's vruntime is normalized, so that when its + * switched back to the fair class the enqueue_entity(.flags=0) will + * do the right thing. + * + * If it was on_rq, then the dequeue_entity(.flags=0) will already + * have normalized the vruntime, if it was !on_rq, then only when + * the task is sleeping will it still have non-normalized vruntime. + */ + if (!se->on_rq && p->state != TASK_RUNNING) { + /* + * Fix up our vruntime so that the current sleep doesn't + * cause 'unlimited' sleep bonus. + */ + place_entity(cfs_rq, se, 0); + se->vruntime -= cfs_rq->min_vruntime; + } +} + /* * We switched to the sched_fair class. */ -static void switched_to_fair(struct rq *rq, struct task_struct *p, - int running) +static void switched_to_fair(struct rq *rq, struct task_struct *p) { + if (!p->se.on_rq) + return; + /* * We were most likely switched from sched_rt, so * kick off the schedule if running, otherwise just see * if we can still preempt the current task. 
*/ - if (running) + if (rq->curr == p) resched_task(rq->curr); else check_preempt_curr(rq, p, 0); @@ -4190,6 +4219,7 @@ static const struct sched_class fair_sched_class = { .task_fork = task_fork_fair, .prio_changed = prio_changed_fair, + .switched_from = switched_from_fair, .switched_to = switched_to_fair, .get_rr_interval = get_rr_interval_fair, diff --git a/kernel/sched_idletask.c b/kernel/sched_idletask.c index 41eb62a0808b..c82f26c1b7c3 100644 --- a/kernel/sched_idletask.c +++ b/kernel/sched_idletask.c @@ -52,14 +52,13 @@ static void set_curr_task_idle(struct rq *rq) { } -static void -switched_to_idle(struct rq *rq, struct task_struct *p, int running) +static void switched_to_idle(struct rq *rq, struct task_struct *p) { BUG(); } -static void prio_changed_idle(struct rq *rq, struct task_struct *p, - int oldprio, int running) +static void +prio_changed_idle(struct rq *rq, struct task_struct *p, int oldprio) { BUG(); } diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c index c914ec747ca6..c381fdc18c64 100644 --- a/kernel/sched_rt.c +++ b/kernel/sched_rt.c @@ -1595,8 +1595,7 @@ static void rq_offline_rt(struct rq *rq) * When switch from the rt queue, we bring ourselves to a position * that we might want to pull RT tasks from other runqueues. */ -static void switched_from_rt(struct rq *rq, struct task_struct *p, - int running) +static void switched_from_rt(struct rq *rq, struct task_struct *p) { /* * If there are other RT tasks then we will reschedule @@ -1605,7 +1604,7 @@ static void switched_from_rt(struct rq *rq, struct task_struct *p, * we may need to handle the pulling of RT tasks * now. */ - if (!rq->rt.rt_nr_running) + if (p->se.on_rq && !rq->rt.rt_nr_running) pull_rt_task(rq); } @@ -1624,8 +1623,7 @@ static inline void init_sched_rt_class(void) * with RT tasks. In this case we try to push them off to * other runqueues. */ -static void switched_to_rt(struct rq *rq, struct task_struct *p, - int running) +static void switched_to_rt(struct rq *rq, struct task_struct *p) { int check_resched = 1; @@ -1636,7 +1634,7 @@ static void switched_to_rt(struct rq *rq, struct task_struct *p, * If that current running task is also an RT task * then see if we can move to another run queue. */ - if (!running) { + if (p->se.on_rq && rq->curr != p) { #ifdef CONFIG_SMP if (rq->rt.overloaded && push_rt_task(rq) && /* Don't resched if we changed runqueues */ @@ -1652,10 +1650,13 @@ static void switched_to_rt(struct rq *rq, struct task_struct *p, * Priority of the task has changed. This may cause * us to initiate a push or pull. */ -static void prio_changed_rt(struct rq *rq, struct task_struct *p, - int oldprio, int running) +static void +prio_changed_rt(struct rq *rq, struct task_struct *p, int oldprio) { - if (running) { + if (!p->se.on_rq) + return; + + if (rq->curr == p) { #ifdef CONFIG_SMP /* * If our priority decreases while running, we diff --git a/kernel/sched_stoptask.c b/kernel/sched_stoptask.c index 2bf6b47058c1..84ec9bcf82d9 100644 --- a/kernel/sched_stoptask.c +++ b/kernel/sched_stoptask.c @@ -59,14 +59,13 @@ static void set_curr_task_stop(struct rq *rq) { } -static void switched_to_stop(struct rq *rq, struct task_struct *p, - int running) +static void switched_to_stop(struct rq *rq, struct task_struct *p) { BUG(); /* its impossible to change to this class */ } -static void prio_changed_stop(struct rq *rq, struct task_struct *p, - int oldprio, int running) +static void +prio_changed_stop(struct rq *rq, struct task_struct *p, int oldprio) { BUG(); /* how!?, what priority? 
 */
}

From ccaa8d657117bb1876d471bd91579d774106778d Mon Sep 17 00:00:00 2001
From: Arnd Bergmann
Date: Tue, 25 Jan 2011 23:17:32 +0100
Subject: [PATCH 051/706] rtmutex-tester: Remove BKL tests

The BKL is going away, no need to test it any more. I left the
definitions of the test case numbers in, so that the other tests do not
get renumbered.

Signed-off-by: Arnd Bergmann
Cc: Arjan van de Ven
Cc: Ingo Molnar
Cc: Andrew Morton
LKML-Reference: <1295993854-4971-19-git-send-email-arnd@arndb.de>
Signed-off-by: Thomas Gleixner
---
 kernel/rtmutex-tester.c | 39 ++++-----------------------------------
 1 file changed, 4 insertions(+), 35 deletions(-)

diff --git a/kernel/rtmutex-tester.c b/kernel/rtmutex-tester.c
index 66cb89bc5ef1..d5b543506cbc 100644
--- a/kernel/rtmutex-tester.c
+++ b/kernel/rtmutex-tester.c
@@ -9,7 +9,6 @@
 #include
 #include
 #include
-#include
 #include
 #include
 #include
@@ -27,7 +26,6 @@ struct test_thread_data {
 	int			opcode;
 	int			opdata;
 	int			mutexes[MAX_RT_TEST_MUTEXES];
-	int			bkl;
 	int			event;
 	struct sys_device	sysdev;
 };
@@ -46,8 +44,8 @@ enum test_opcodes {
 	RTTEST_LOCKINTNOWAIT,	/* 6 Lock interruptible no wait in wakeup, data = lockindex */
 	RTTEST_LOCKCONT,	/* 7 Continue locking after the wakeup delay */
 	RTTEST_UNLOCK,		/* 8 Unlock, data = lockindex */
-	RTTEST_LOCKBKL,		/* 9 Lock BKL */
-	RTTEST_UNLOCKBKL,	/* 10 Unlock BKL */
+	RTTEST_LOCKBKL,		/* 9 Was: Lock BKL */
+	RTTEST_UNLOCKBKL,	/* 10 Was: Unlock BKL */
 	RTTEST_SIGNAL,		/* 11 Signal other test thread, data = thread id */
 	RTTEST_RESETEVENT = 98,	/* 98 Reset event counter */
 	RTTEST_RESET = 99,	/* 99 Reset all pending operations */
@@ -74,13 +72,6 @@ static int handle_op(struct test_thread_data *td, int lockwakeup)
 				td->mutexes[i] = 0;
 			}
 		}
-
-		if (!lockwakeup && td->bkl == 4) {
-#ifdef CONFIG_LOCK_KERNEL
-			unlock_kernel();
-#endif
-			td->bkl = 0;
-		}
 		return 0;

 	case RTTEST_RESETEVENT:
@@ -131,25 +122,6 @@ static int handle_op(struct test_thread_data *td, int lockwakeup)
 		td->mutexes[id] = 0;
 		return 0;

-	case RTTEST_LOCKBKL:
-		if (td->bkl)
-			return 0;
-		td->bkl = 1;
-#ifdef CONFIG_LOCK_KERNEL
-		lock_kernel();
-#endif
-		td->bkl = 4;
-		return 0;
-
-	case RTTEST_UNLOCKBKL:
-		if (td->bkl != 4)
-			break;
-#ifdef CONFIG_LOCK_KERNEL
-		unlock_kernel();
-#endif
-		td->bkl = 0;
-		return 0;
-
 	default:
 		break;
 	}
@@ -196,7 +168,6 @@ void schedule_rt_mutex_test(struct rt_mutex *mutex)
 		td->event = atomic_add_return(1, &rttest_event);
 		break;

-	case RTTEST_LOCKBKL:
 	default:
 		break;
 	}
@@ -229,8 +200,6 @@ void schedule_rt_mutex_test(struct rt_mutex *mutex)
 		td->event = atomic_add_return(1, &rttest_event);
 		return;

-	case RTTEST_LOCKBKL:
-		return;
 	default:
 		return;
 	}
@@ -380,11 +349,11 @@ static ssize_t sysfs_test_status(struct sys_device *dev, struct sysdev_attribute
 	spin_lock(&rttest_lock);

 	curr += sprintf(curr,
-		"O: %4d, E:%8d, S: 0x%08lx, P: %4d, N: %4d, B: %p, K: %d, M:",
+		"O: %4d, E:%8d, S: 0x%08lx, P: %4d, N: %4d, B: %p, M:",
		td->opcode, td->event, tsk->state,
			(MAX_RT_PRIO - 1) - tsk->prio,
			(MAX_RT_PRIO - 1) - tsk->normal_prio,
-			tsk->pi_blocked_on, td->bkl);
+			tsk->pi_blocked_on);

 	for (i = MAX_RT_TEST_MUTEXES - 1; i >=0 ; i--)
 		curr += sprintf(curr, "%d", td->mutexes[i]);

From 61bb46082775acd18c712607615a8b7dbeff7873 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner
Date: Sun, 23 Jan 2011 15:10:51 +0100
Subject: [PATCH 052/706] alpha: Replace deprecated spinlock initialization

SPIN_LOCK_UNLOCKED is deprecated. Use the lockdep capable variant
instead.

Signed-off-by: Thomas Gleixner
Cc: Matt Turner
---
 arch/alpha/include/asm/rwsem.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/alpha/include/asm/rwsem.h b/arch/alpha/include/asm/rwsem.h
index 1570c0b54336..19c9cc7ed94b 100644
--- a/arch/alpha/include/asm/rwsem.h
+++ b/arch/alpha/include/asm/rwsem.h
@@ -39,7 +39,7 @@ struct rw_semaphore {
 };

 #define __RWSEM_INITIALIZER(name) \
-	{ RWSEM_UNLOCKED_VALUE, SPIN_LOCK_UNLOCKED, \
+	{ RWSEM_UNLOCKED_VALUE, __SPIN_LOCK_UNLOCKED((name).wait_lock), \
	LIST_HEAD_INIT((name).wait_list) }

 #define DECLARE_RWSEM(name) \

From e41c8ab174f47ed5ed10a365482d0d7b0e352beb Mon Sep 17 00:00:00 2001
From: Thomas Gleixner
Date: Sun, 23 Jan 2011 15:14:15 +0100
Subject: [PATCH 053/706] cris: Replace deprecated spinlock initialization

SPIN_LOCK_UNLOCKED is deprecated. Use the lockdep capable variant
instead.

Signed-off-by: Thomas Gleixner
Cc: Jesper Nilsson
---
 arch/cris/arch-v32/kernel/smp.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/arch/cris/arch-v32/kernel/smp.c b/arch/cris/arch-v32/kernel/smp.c
index 84fed3b4b079..4c9e3e1ba5d1 100644
--- a/arch/cris/arch-v32/kernel/smp.c
+++ b/arch/cris/arch-v32/kernel/smp.c
@@ -26,7 +26,9 @@
 #define FLUSH_ALL (void*)0xffffffff

 /* Vector of locks used for various atomic operations */
-spinlock_t cris_atomic_locks[] = { [0 ... LOCK_COUNT - 1] = SPIN_LOCK_UNLOCKED};
+spinlock_t cris_atomic_locks[] = {
+	[0 ... LOCK_COUNT - 1] = __SPIN_LOCK_UNLOCKED(cris_atomic_locks)
+};

 /* CPU masks */
 cpumask_t phys_cpu_present_map = CPU_MASK_NONE;

From 7424cdf77b1b2975d82619084f20f0055f715166 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner
Date: Sun, 23 Jan 2011 15:16:35 +0100
Subject: [PATCH 054/706] mips: Replace deprecated spinlock initialization

SPIN_LOCK_UNLOCKED is deprecated. Use the lockdep capable variant
instead.

Signed-off-by: Thomas Gleixner
Cc: Ralf Baechle
---
 arch/mips/kernel/vpe.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/arch/mips/kernel/vpe.c b/arch/mips/kernel/vpe.c
index 6a1fdfef8fde..ab52b7cf3b6b 100644
--- a/arch/mips/kernel/vpe.c
+++ b/arch/mips/kernel/vpe.c
@@ -148,9 +148,9 @@ struct {
 	spinlock_t tc_list_lock;
 	struct list_head tc_list;	/* Thread contexts */
 } vpecontrol = {
-	.vpe_list_lock	= SPIN_LOCK_UNLOCKED,
+	.vpe_list_lock	= __SPIN_LOCK_UNLOCKED(vpe_list_lock),
 	.vpe_list	= LIST_HEAD_INIT(vpecontrol.vpe_list),
-	.tc_list_lock	= SPIN_LOCK_UNLOCKED,
+	.tc_list_lock	= __SPIN_LOCK_UNLOCKED(tc_list_lock),
 	.tc_list	= LIST_HEAD_INIT(vpecontrol.tc_list)
 };

From 24774fbdeab8f6ac05a19e81bd645b0f7e5d2bb7 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner
Date: Sun, 23 Jan 2011 15:19:12 +0100
Subject: [PATCH 055/706] sparc: Replace deprecated spinlock initialization

SPIN_LOCK_UNLOCKED is deprecated. Use the lockdep capable variant
instead.

Signed-off-by: Thomas Gleixner
Acked-by: David S. Miller
---
 arch/sparc/lib/atomic32.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/sparc/lib/atomic32.c b/arch/sparc/lib/atomic32.c
index cbddeb38ffda..d3c7a12ad879 100644
--- a/arch/sparc/lib/atomic32.c
+++ b/arch/sparc/lib/atomic32.c
@@ -16,7 +16,7 @@
 #define ATOMIC_HASH(a) (&__atomic_hash[(((unsigned long)a)>>8) & (ATOMIC_HASH_SIZE-1)])

 spinlock_t __atomic_hash[ATOMIC_HASH_SIZE] = {
-	[0 ... (ATOMIC_HASH_SIZE-1)] = SPIN_LOCK_UNLOCKED
+	[0 ... (ATOMIC_HASH_SIZE-1)] = __SPIN_LOCK_UNLOCKED(__atomic_hash)
 };

 #else /* SMP */

From 22e650045899011b028e40625bc73df9f3260bac Mon Sep 17 00:00:00 2001
From: Thomas Gleixner
Date: Sun, 23 Jan 2011 15:21:25 +0100
Subject: [PATCH 056/706] um: Replace deprecated spinlock initialization

SPIN_LOCK_UNLOCKED is deprecated. Use the lockdep capable variant
instead.

Signed-off-by: Thomas Gleixner
Cc: Jeff Dike
---
 arch/um/drivers/ubd_kern.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/um/drivers/ubd_kern.c b/arch/um/drivers/ubd_kern.c
index ba4a98ba39c0..620f5b70957d 100644
--- a/arch/um/drivers/ubd_kern.c
+++ b/arch/um/drivers/ubd_kern.c
@@ -185,7 +185,7 @@ struct ubd {
 	.no_cow =	0, \
 	.shared =	0, \
 	.cow =		DEFAULT_COW, \
-	.lock =		SPIN_LOCK_UNLOCKED, \
+	.lock =		__SPIN_LOCK_UNLOCKED(ubd_devs.lock), \
 	.request =	NULL, \
 	.start_sg =	0, \
 	.end_sg =	0, \

From 235454c99851ba21038061b9acf38d1a636068c5 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner
Date: Sun, 23 Jan 2011 15:23:14 +0100
Subject: [PATCH 057/706] xtensa: Replace deprecated spinlock initialization

SPIN_LOCK_UNLOCKED is deprecated. Use the lockdep capable variant
instead.

Signed-off-by: Thomas Gleixner
Cc: Chris Zankel
---
 arch/xtensa/include/asm/rwsem.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/xtensa/include/asm/rwsem.h b/arch/xtensa/include/asm/rwsem.h
index e39edf5c86f2..9d32f6874393 100644
--- a/arch/xtensa/include/asm/rwsem.h
+++ b/arch/xtensa/include/asm/rwsem.h
@@ -38,7 +38,7 @@ struct rw_semaphore {
 };

 #define __RWSEM_INITIALIZER(name) \
-	{ RWSEM_UNLOCKED_VALUE, SPIN_LOCK_UNLOCKED, \
+	{ RWSEM_UNLOCKED_VALUE, __SPIN_LOCK_UNLOCKED((name).wait_lock), \
	LIST_HEAD_INIT((name).wait_list) }

 #define DECLARE_RWSEM(name) \

From 92578c0b8078f6919f9b47e7e16a1cf770bd127b Mon Sep 17 00:00:00 2001
From: Thomas Gleixner
Date: Sun, 23 Jan 2011 15:24:55 +0100
Subject: [PATCH 058/706] kthread: Replace deprecated spinlock initialization

SPIN_LOCK_UNLOCKED is deprecated. Use the lockdep capable variant
instead.

Signed-off-by: Thomas Gleixner
---
 include/linux/kthread.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/include/linux/kthread.h b/include/linux/kthread.h
index ce0775aa64c3..7ff16f7d3ed4 100644
--- a/include/linux/kthread.h
+++ b/include/linux/kthread.h
@@ -64,7 +64,7 @@ struct kthread_work {
 };

 #define KTHREAD_WORKER_INIT(worker)	{				\
-	.lock = SPIN_LOCK_UNLOCKED,					\
+	.lock = __SPIN_LOCK_UNLOCKED((worker).lock),			\
	.work_list = LIST_HEAD_INIT((worker).work_list),		\
	}

From 10389a15e25fd4784d42de7e0e3fc8c242f2011d Mon Sep 17 00:00:00 2001
From: Thomas Gleixner
Date: Sun, 23 Jan 2011 15:25:56 +0100
Subject: [PATCH 059/706] cred: Replace deprecated spinlock initialization

SPIN_LOCK_UNLOCKED is deprecated. Use the lockdep capable variant
instead.

Signed-off-by: Thomas Gleixner
---
 kernel/cred.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/kernel/cred.c b/kernel/cred.c
index 6a1aa004e376..b5496e81b0f7 100644
--- a/kernel/cred.c
+++ b/kernel/cred.c
@@ -35,7 +35,7 @@ static struct kmem_cache *cred_jar;
 static struct thread_group_cred init_tgcred = {
	.usage	= ATOMIC_INIT(2),
	.tgid	= 0,
-	.lock	= SPIN_LOCK_UNLOCKED,
+	.lock	= __SPIN_LOCK_UNLOCKED(init_cred.tgcred.lock),
 };
 #endif

From d04fa5a3ba06c3b7a1c4a6860d0fa4825507a755 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner
Date: Sun, 23 Jan 2011 15:30:09 +0100
Subject: [PATCH 060/706] locking: Remove deprecated lock initializers

Last users are gone. Remove the leftovers.
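(In miniature, and mirroring the Documentation text removed below: the
deprecated macro gave every lock the same static lockdep key, which
defeats lock class tracking, while the remaining initializers key each
lock individually.)

	static DEFINE_SPINLOCK(xxx_lock);		/* standalone lock */

	struct foo {
		spinlock_t lock;
	};
	static struct foo bar = {
		.lock = __SPIN_LOCK_UNLOCKED(bar.lock),	/* struct member */
	};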
Signed-off-by: Thomas Gleixner --- Documentation/spinlocks.txt | 24 +----------------------- include/linux/rwlock_types.h | 8 -------- include/linux/spinlock_types.h | 8 -------- scripts/checkpatch.pl | 5 ----- 4 files changed, 1 insertion(+), 44 deletions(-) diff --git a/Documentation/spinlocks.txt b/Documentation/spinlocks.txt index 178c831b907d..2e3c64b1a6a5 100644 --- a/Documentation/spinlocks.txt +++ b/Documentation/spinlocks.txt @@ -86,7 +86,7 @@ to change the variables it has to get an exclusive write lock. The routines look the same as above: - rwlock_t xxx_lock = RW_LOCK_UNLOCKED; + rwlock_t xxx_lock = __RW_LOCK_UNLOCKED(xxx_lock); unsigned long flags; @@ -196,25 +196,3 @@ appropriate: For static initialization, use DEFINE_SPINLOCK() / DEFINE_RWLOCK() or __SPIN_LOCK_UNLOCKED() / __RW_LOCK_UNLOCKED() as appropriate. - -SPIN_LOCK_UNLOCKED and RW_LOCK_UNLOCKED are deprecated. These interfere -with lockdep state tracking. - -Most of the time, you can simply turn: - static spinlock_t xxx_lock = SPIN_LOCK_UNLOCKED; -into: - static DEFINE_SPINLOCK(xxx_lock); - -Static structure member variables go from: - - struct foo bar { - .lock = SPIN_LOCK_UNLOCKED; - }; - -to: - - struct foo bar { - .lock = __SPIN_LOCK_UNLOCKED(bar.lock); - }; - -Declaration of static rw_locks undergo a similar transformation. diff --git a/include/linux/rwlock_types.h b/include/linux/rwlock_types.h index bd31808c7d8e..cc0072e93e36 100644 --- a/include/linux/rwlock_types.h +++ b/include/linux/rwlock_types.h @@ -43,14 +43,6 @@ typedef struct { RW_DEP_MAP_INIT(lockname) } #endif -/* - * RW_LOCK_UNLOCKED defeat lockdep state tracking and is hence - * deprecated. - * - * Please use DEFINE_RWLOCK() or __RW_LOCK_UNLOCKED() as appropriate. - */ -#define RW_LOCK_UNLOCKED __RW_LOCK_UNLOCKED(old_style_rw_init) - #define DEFINE_RWLOCK(x) rwlock_t x = __RW_LOCK_UNLOCKED(x) #endif /* __LINUX_RWLOCK_TYPES_H */ diff --git a/include/linux/spinlock_types.h b/include/linux/spinlock_types.h index 851b7783720d..73548eb13a5d 100644 --- a/include/linux/spinlock_types.h +++ b/include/linux/spinlock_types.h @@ -81,14 +81,6 @@ typedef struct spinlock { #define __SPIN_LOCK_UNLOCKED(lockname) \ (spinlock_t ) __SPIN_LOCK_INITIALIZER(lockname) -/* - * SPIN_LOCK_UNLOCKED defeats lockdep state tracking and is hence - * deprecated. - * Please use DEFINE_SPINLOCK() or __SPIN_LOCK_UNLOCKED() as - * appropriate. - */ -#define SPIN_LOCK_UNLOCKED __SPIN_LOCK_UNLOCKED(old_style_spin_init) - #define DEFINE_SPINLOCK(x) spinlock_t x = __SPIN_LOCK_UNLOCKED(x) #include diff --git a/scripts/checkpatch.pl b/scripts/checkpatch.pl index 4c0383da1c9a..58848e3e392c 100755 --- a/scripts/checkpatch.pl +++ b/scripts/checkpatch.pl @@ -2654,11 +2654,6 @@ sub process { WARN("Use of volatile is usually wrong: see Documentation/volatile-considered-harmful.txt\n" . $herecurr); } -# SPIN_LOCK_UNLOCKED & RW_LOCK_UNLOCKED are deprecated - if ($line =~ /\b(SPIN_LOCK_UNLOCKED|RW_LOCK_UNLOCKED)/) { - ERROR("Use of $1 is deprecated: see Documentation/spinlocks.txt\n" . $herecurr); - } - # warn about #if 0 if ($line =~ /^.\s*\#\s*if\s+0\b/) { CHK("if this code is redundant consider removing it\n" . From c16a87ce063f79e0ec7d25ce2950e1bc6db03c72 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 26 Jan 2011 20:05:50 +0000 Subject: [PATCH 061/706] rwsem: Cleanup includes All rwsem implementations include the same headers. 
Include them from include/linux/rwsem.h Signed-off-by: Thomas Gleixner Cc: Peter Zijlstra Cc: David Howells Cc: Benjamin Herrenschmidt Cc: Matt Turner Acked-by: Tony Luck Acked-by: Heiko Carstens Cc: Paul Mundt Acked-by: David Miller Cc: Chris Zankel LKML-Reference: <20110126195833.483520950@linutronix.de> --- arch/alpha/include/asm/rwsem.h | 4 ---- arch/ia64/include/asm/rwsem.h | 3 --- arch/powerpc/include/asm/rwsem.h | 5 ----- arch/s390/include/asm/rwsem.h | 5 ----- arch/sh/include/asm/rwsem.h | 5 ----- arch/sparc/include/asm/rwsem.h | 6 ------ arch/x86/include/asm/rwsem.h | 6 ------ arch/xtensa/include/asm/rwsem.h | 6 ------ include/linux/rwsem-spinlock.h | 8 -------- include/linux/rwsem.h | 3 +++ 10 files changed, 3 insertions(+), 48 deletions(-) diff --git a/arch/alpha/include/asm/rwsem.h b/arch/alpha/include/asm/rwsem.h index 19c9cc7ed94b..839a3fa20944 100644 --- a/arch/alpha/include/asm/rwsem.h +++ b/arch/alpha/include/asm/rwsem.h @@ -13,10 +13,6 @@ #ifdef __KERNEL__ #include -#include -#include - -struct rwsem_waiter; extern struct rw_semaphore *rwsem_down_read_failed(struct rw_semaphore *sem); extern struct rw_semaphore *rwsem_down_write_failed(struct rw_semaphore *sem); diff --git a/arch/ia64/include/asm/rwsem.h b/arch/ia64/include/asm/rwsem.h index 215d5454c7d3..9bcf0792b01c 100644 --- a/arch/ia64/include/asm/rwsem.h +++ b/arch/ia64/include/asm/rwsem.h @@ -25,9 +25,6 @@ #error "Please don't include directly, use instead." #endif -#include -#include - #include /* diff --git a/arch/powerpc/include/asm/rwsem.h b/arch/powerpc/include/asm/rwsem.h index 8447d89fbe72..c12abbe1d06f 100644 --- a/arch/powerpc/include/asm/rwsem.h +++ b/arch/powerpc/include/asm/rwsem.h @@ -13,11 +13,6 @@ * by Paul Mackerras . */ -#include -#include -#include -#include - /* * the semaphore definition */ diff --git a/arch/s390/include/asm/rwsem.h b/arch/s390/include/asm/rwsem.h index 423fdda2322d..9cc8592b33bb 100644 --- a/arch/s390/include/asm/rwsem.h +++ b/arch/s390/include/asm/rwsem.h @@ -43,11 +43,6 @@ #ifdef __KERNEL__ -#include -#include - -struct rwsem_waiter; - extern struct rw_semaphore *rwsem_down_read_failed(struct rw_semaphore *); extern struct rw_semaphore *rwsem_down_write_failed(struct rw_semaphore *); extern struct rw_semaphore *rwsem_wake(struct rw_semaphore *); diff --git a/arch/sh/include/asm/rwsem.h b/arch/sh/include/asm/rwsem.h index 06e2251a5e48..df6f34623c54 100644 --- a/arch/sh/include/asm/rwsem.h +++ b/arch/sh/include/asm/rwsem.h @@ -11,11 +11,6 @@ #endif #ifdef __KERNEL__ -#include -#include -#include -#include - /* * the semaphore definition */ diff --git a/arch/sparc/include/asm/rwsem.h b/arch/sparc/include/asm/rwsem.h index a2b4302869bc..902d36bf150d 100644 --- a/arch/sparc/include/asm/rwsem.h +++ b/arch/sparc/include/asm/rwsem.h @@ -12,12 +12,6 @@ #endif #ifdef __KERNEL__ - -#include -#include - -struct rwsem_waiter; - struct rw_semaphore { signed long count; #define RWSEM_UNLOCKED_VALUE 0x00000000L diff --git a/arch/x86/include/asm/rwsem.h b/arch/x86/include/asm/rwsem.h index d1e41b0f9b60..a626cff86041 100644 --- a/arch/x86/include/asm/rwsem.h +++ b/arch/x86/include/asm/rwsem.h @@ -37,14 +37,8 @@ #endif #ifdef __KERNEL__ - -#include -#include -#include #include -struct rwsem_waiter; - extern asmregparm struct rw_semaphore * rwsem_down_read_failed(struct rw_semaphore *sem); extern asmregparm struct rw_semaphore * diff --git a/arch/xtensa/include/asm/rwsem.h b/arch/xtensa/include/asm/rwsem.h index 9d32f6874393..1be2102c648e 100644 --- a/arch/xtensa/include/asm/rwsem.h +++ 
b/arch/xtensa/include/asm/rwsem.h @@ -16,12 +16,6 @@ #ifndef _LINUX_RWSEM_H #error "Please don't include directly, use instead." #endif - -#include -#include -#include -#include - /* * the semaphore definition */ diff --git a/include/linux/rwsem-spinlock.h b/include/linux/rwsem-spinlock.h index bdfcc2527970..8c0dc7fc07a4 100644 --- a/include/linux/rwsem-spinlock.h +++ b/include/linux/rwsem-spinlock.h @@ -12,15 +12,7 @@ #error "please don't include linux/rwsem-spinlock.h directly, use linux/rwsem.h instead" #endif -#include -#include - #ifdef __KERNEL__ - -#include - -struct rwsem_waiter; - /* * the rw-semaphore definition * - if activity is 0 then there are no active readers or writers diff --git a/include/linux/rwsem.h b/include/linux/rwsem.h index efd348fe8ca7..496296d12d62 100644 --- a/include/linux/rwsem.h +++ b/include/linux/rwsem.h @@ -11,6 +11,9 @@ #include #include +#include +#include + #include #include From bde11efbc21ea84c3351464a422b467eaefabb9a Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 26 Jan 2011 20:05:53 +0000 Subject: [PATCH 062/706] x86: Cleanup rwsem_count_t typedef Remove the typedef which has no real reason to be there. Signed-off-by: Thomas Gleixner Cc: Peter Zijlstra Cc: David Howells Cc: Benjamin Herrenschmidt Cc: Matt Turner Cc: Tony Luck Cc: Heiko Carstens Cc: Paul Mundt Cc: David Miller Cc: Chris Zankel LKML-Reference: <20110126195833.580335506@linutronix.de> --- arch/x86/include/asm/rwsem.h | 25 ++++++++++--------------- 1 file changed, 10 insertions(+), 15 deletions(-) diff --git a/arch/x86/include/asm/rwsem.h b/arch/x86/include/asm/rwsem.h index a626cff86041..c30206c2bbf9 100644 --- a/arch/x86/include/asm/rwsem.h +++ b/arch/x86/include/asm/rwsem.h @@ -68,10 +68,8 @@ extern asmregparm struct rw_semaphore * #define RWSEM_ACTIVE_READ_BIAS RWSEM_ACTIVE_BIAS #define RWSEM_ACTIVE_WRITE_BIAS (RWSEM_WAITING_BIAS + RWSEM_ACTIVE_BIAS) -typedef signed long rwsem_count_t; - struct rw_semaphore { - rwsem_count_t count; + long count; spinlock_t wait_lock; struct list_head wait_list; #ifdef CONFIG_DEBUG_LOCK_ALLOC @@ -127,7 +125,7 @@ static inline void __down_read(struct rw_semaphore *sem) */ static inline int __down_read_trylock(struct rw_semaphore *sem) { - rwsem_count_t result, tmp; + long result, tmp; asm volatile("# beginning __down_read_trylock\n\t" " mov %0,%1\n\t" "1:\n\t" @@ -149,7 +147,7 @@ static inline int __down_read_trylock(struct rw_semaphore *sem) */ static inline void __down_write_nested(struct rw_semaphore *sem, int subclass) { - rwsem_count_t tmp; + long tmp; asm volatile("# beginning down_write\n\t" LOCK_PREFIX " xadd %1,(%2)\n\t" /* adds 0xffff0001, returns the old value */ @@ -174,9 +172,8 @@ static inline void __down_write(struct rw_semaphore *sem) */ static inline int __down_write_trylock(struct rw_semaphore *sem) { - rwsem_count_t ret = cmpxchg(&sem->count, - RWSEM_UNLOCKED_VALUE, - RWSEM_ACTIVE_WRITE_BIAS); + long ret = cmpxchg(&sem->count, RWSEM_UNLOCKED_VALUE, + RWSEM_ACTIVE_WRITE_BIAS); if (ret == RWSEM_UNLOCKED_VALUE) return 1; return 0; @@ -187,7 +184,7 @@ static inline int __down_write_trylock(struct rw_semaphore *sem) */ static inline void __up_read(struct rw_semaphore *sem) { - rwsem_count_t tmp; + long tmp; asm volatile("# beginning __up_read\n\t" LOCK_PREFIX " xadd %1,(%2)\n\t" /* subtracts 1, returns the old value */ @@ -205,7 +202,7 @@ static inline void __up_read(struct rw_semaphore *sem) */ static inline void __up_write(struct rw_semaphore *sem) { - rwsem_count_t tmp; + long tmp; asm volatile("# beginning __up_write\n\t" 
LOCK_PREFIX " xadd %1,(%2)\n\t" /* subtracts 0xffff0001, returns the old value */ @@ -241,8 +238,7 @@ static inline void __downgrade_write(struct rw_semaphore *sem) /* * implement atomic add functionality */ -static inline void rwsem_atomic_add(rwsem_count_t delta, - struct rw_semaphore *sem) +static inline void rwsem_atomic_add(long delta, struct rw_semaphore *sem) { asm volatile(LOCK_PREFIX _ASM_ADD "%1,%0" : "+m" (sem->count) @@ -252,10 +248,9 @@ static inline void rwsem_atomic_add(rwsem_count_t delta, /* * implement exchange and add functionality */ -static inline rwsem_count_t rwsem_atomic_update(rwsem_count_t delta, - struct rw_semaphore *sem) +static inline long rwsem_atomic_update(long delta, struct rw_semaphore *sem) { - rwsem_count_t tmp = delta; + long tmp = delta; asm volatile(LOCK_PREFIX "xadd %0,%1" : "+r" (tmp), "+m" (sem->count) From 1c8ed640d918290ddc1de5ada02ef6686a733c9f Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 26 Jan 2011 20:05:56 +0000 Subject: [PATCH 063/706] rwsem: Move duplicate struct rwsem declaration to linux/rwsem.h The difference between these declarations is the data type of the count member and the lack of lockdep in some architectures. long is equivalent to signed long, and the #ifdef guarded dep_map member does not hurt anyone. Signed-off-by: Thomas Gleixner Cc: Peter Zijlstra Cc: David Howells Cc: Benjamin Herrenschmidt Cc: Matt Turner Acked-by: Tony Luck Acked-by: Heiko Carstens Cc: Paul Mundt Acked-by: David Miller Cc: Chris Zankel LKML-Reference: <20110126195833.679641914@linutronix.de> Signed-off-by: Thomas Gleixner --- arch/alpha/include/asm/rwsem.h | 8 -------- arch/ia64/include/asm/rwsem.h | 9 --------- arch/powerpc/include/asm/rwsem.h | 9 --------- arch/s390/include/asm/rwsem.h | 12 ------------ arch/sh/include/asm/rwsem.h | 12 +----------- arch/sparc/include/asm/rwsem.h | 9 +-------- arch/x86/include/asm/rwsem.h | 12 ------------ arch/xtensa/include/asm/rwsem.h | 9 +-------- include/linux/rwsem.h | 13 ++++++++++++- 9 files changed, 15 insertions(+), 78 deletions(-) diff --git a/arch/alpha/include/asm/rwsem.h b/arch/alpha/include/asm/rwsem.h index 839a3fa20944..3e5619ead29d 100644 --- a/arch/alpha/include/asm/rwsem.h +++ b/arch/alpha/include/asm/rwsem.h @@ -19,20 +19,12 @@ extern struct rw_semaphore *rwsem_down_write_failed(struct rw_semaphore *sem); extern struct rw_semaphore *rwsem_wake(struct rw_semaphore *); extern struct rw_semaphore *rwsem_downgrade_wake(struct rw_semaphore *sem); -/* - * the semaphore definition - */ -struct rw_semaphore { - long count; #define RWSEM_UNLOCKED_VALUE 0x0000000000000000L #define RWSEM_ACTIVE_BIAS 0x0000000000000001L #define RWSEM_ACTIVE_MASK 0x00000000ffffffffL #define RWSEM_WAITING_BIAS (-0x0000000100000000L) #define RWSEM_ACTIVE_READ_BIAS RWSEM_ACTIVE_BIAS #define RWSEM_ACTIVE_WRITE_BIAS (RWSEM_WAITING_BIAS + RWSEM_ACTIVE_BIAS) - spinlock_t wait_lock; - struct list_head wait_list; -}; #define __RWSEM_INITIALIZER(name) \ { RWSEM_UNLOCKED_VALUE, __SPIN_LOCK_UNLOCKED((name).wait_lock), \ diff --git a/arch/ia64/include/asm/rwsem.h b/arch/ia64/include/asm/rwsem.h index 9bcf0792b01c..1fe465804dc7 100644 --- a/arch/ia64/include/asm/rwsem.h +++ b/arch/ia64/include/asm/rwsem.h @@ -27,15 +27,6 @@ #include -/* - * the semaphore definition - */ -struct rw_semaphore { - signed long count; - spinlock_t wait_lock; - struct list_head wait_list; -}; - #define RWSEM_UNLOCKED_VALUE __IA64_UL_CONST(0x0000000000000000) #define RWSEM_ACTIVE_BIAS (1L) #define RWSEM_ACTIVE_MASK (0xffffffffL) diff --git
a/arch/powerpc/include/asm/rwsem.h b/arch/powerpc/include/asm/rwsem.h index c12abbe1d06f..bc1acc229223 100644 --- a/arch/powerpc/include/asm/rwsem.h +++ b/arch/powerpc/include/asm/rwsem.h @@ -28,15 +28,6 @@ #define RWSEM_ACTIVE_READ_BIAS RWSEM_ACTIVE_BIAS #define RWSEM_ACTIVE_WRITE_BIAS (RWSEM_WAITING_BIAS + RWSEM_ACTIVE_BIAS) -struct rw_semaphore { - long count; - spinlock_t wait_lock; - struct list_head wait_list; -#ifdef CONFIG_DEBUG_LOCK_ALLOC - struct lockdep_map dep_map; -#endif -}; - #ifdef CONFIG_DEBUG_LOCK_ALLOC # define __RWSEM_DEP_MAP_INIT(lockname) , .dep_map = { .name = #lockname } #else diff --git a/arch/s390/include/asm/rwsem.h b/arch/s390/include/asm/rwsem.h index 9cc8592b33bb..6e075f1d97b4 100644 --- a/arch/s390/include/asm/rwsem.h +++ b/arch/s390/include/asm/rwsem.h @@ -49,18 +49,6 @@ extern struct rw_semaphore *rwsem_wake(struct rw_semaphore *); extern struct rw_semaphore *rwsem_downgrade_wake(struct rw_semaphore *); extern struct rw_semaphore *rwsem_downgrade_write(struct rw_semaphore *); -/* - * the semaphore definition - */ -struct rw_semaphore { - signed long count; - spinlock_t wait_lock; - struct list_head wait_list; -#ifdef CONFIG_DEBUG_LOCK_ALLOC - struct lockdep_map dep_map; -#endif -}; - #ifndef __s390x__ #define RWSEM_UNLOCKED_VALUE 0x00000000 #define RWSEM_ACTIVE_BIAS 0x00000001 diff --git a/arch/sh/include/asm/rwsem.h b/arch/sh/include/asm/rwsem.h index df6f34623c54..dffc62589f79 100644 --- a/arch/sh/include/asm/rwsem.h +++ b/arch/sh/include/asm/rwsem.h @@ -11,23 +11,13 @@ #endif #ifdef __KERNEL__ -/* - * the semaphore definition - */ -struct rw_semaphore { - long count; + #define RWSEM_UNLOCKED_VALUE 0x00000000 #define RWSEM_ACTIVE_BIAS 0x00000001 #define RWSEM_ACTIVE_MASK 0x0000ffff #define RWSEM_WAITING_BIAS (-0x00010000) #define RWSEM_ACTIVE_READ_BIAS RWSEM_ACTIVE_BIAS #define RWSEM_ACTIVE_WRITE_BIAS (RWSEM_WAITING_BIAS + RWSEM_ACTIVE_BIAS) - spinlock_t wait_lock; - struct list_head wait_list; -#ifdef CONFIG_DEBUG_LOCK_ALLOC - struct lockdep_map dep_map; -#endif -}; #ifdef CONFIG_DEBUG_LOCK_ALLOC # define __RWSEM_DEP_MAP_INIT(lockname) , .dep_map = { .name = #lockname } diff --git a/arch/sparc/include/asm/rwsem.h b/arch/sparc/include/asm/rwsem.h index 902d36bf150d..4c16d1de2ab5 100644 --- a/arch/sparc/include/asm/rwsem.h +++ b/arch/sparc/include/asm/rwsem.h @@ -12,20 +12,13 @@ #endif #ifdef __KERNEL__ -struct rw_semaphore { - signed long count; + #define RWSEM_UNLOCKED_VALUE 0x00000000L #define RWSEM_ACTIVE_BIAS 0x00000001L #define RWSEM_ACTIVE_MASK 0xffffffffL #define RWSEM_WAITING_BIAS (-RWSEM_ACTIVE_MASK-1) #define RWSEM_ACTIVE_READ_BIAS RWSEM_ACTIVE_BIAS #define RWSEM_ACTIVE_WRITE_BIAS (RWSEM_WAITING_BIAS + RWSEM_ACTIVE_BIAS) - spinlock_t wait_lock; - struct list_head wait_list; -#ifdef CONFIG_DEBUG_LOCK_ALLOC - struct lockdep_map dep_map; -#endif -}; #ifdef CONFIG_DEBUG_LOCK_ALLOC # define __RWSEM_DEP_MAP_INIT(lockname) , .dep_map = { .name = #lockname } diff --git a/arch/x86/include/asm/rwsem.h b/arch/x86/include/asm/rwsem.h index c30206c2bbf9..995cfe4c985d 100644 --- a/arch/x86/include/asm/rwsem.h +++ b/arch/x86/include/asm/rwsem.h @@ -49,8 +49,6 @@ extern asmregparm struct rw_semaphore * rwsem_downgrade_wake(struct rw_semaphore *sem); /* - * the semaphore definition - * * The bias values and the counter type limits the number of * potential readers/writers to 32767 for 32 bits and 2147483647 * for 64 bits. 
@@ -68,22 +66,12 @@ extern asmregparm struct rw_semaphore * #define RWSEM_ACTIVE_READ_BIAS RWSEM_ACTIVE_BIAS #define RWSEM_ACTIVE_WRITE_BIAS (RWSEM_WAITING_BIAS + RWSEM_ACTIVE_BIAS) -struct rw_semaphore { - long count; - spinlock_t wait_lock; - struct list_head wait_list; -#ifdef CONFIG_DEBUG_LOCK_ALLOC - struct lockdep_map dep_map; -#endif -}; - #ifdef CONFIG_DEBUG_LOCK_ALLOC # define __RWSEM_DEP_MAP_INIT(lockname) , .dep_map = { .name = #lockname } #else # define __RWSEM_DEP_MAP_INIT(lockname) #endif - #define __RWSEM_INITIALIZER(name) \ { \ RWSEM_UNLOCKED_VALUE, __SPIN_LOCK_UNLOCKED((name).wait_lock), \ diff --git a/arch/xtensa/include/asm/rwsem.h b/arch/xtensa/include/asm/rwsem.h index 1be2102c648e..585cab9b0bdf 100644 --- a/arch/xtensa/include/asm/rwsem.h +++ b/arch/xtensa/include/asm/rwsem.h @@ -16,20 +16,13 @@ #ifndef _LINUX_RWSEM_H #error "Please don't include directly, use instead." #endif -/* - * the semaphore definition - */ -struct rw_semaphore { - signed long count; + #define RWSEM_UNLOCKED_VALUE 0x00000000 #define RWSEM_ACTIVE_BIAS 0x00000001 #define RWSEM_ACTIVE_MASK 0x0000ffff #define RWSEM_WAITING_BIAS (-0x00010000) #define RWSEM_ACTIVE_READ_BIAS RWSEM_ACTIVE_BIAS #define RWSEM_ACTIVE_WRITE_BIAS (RWSEM_WAITING_BIAS + RWSEM_ACTIVE_BIAS) - spinlock_t wait_lock; - struct list_head wait_list; -}; #define __RWSEM_INITIALIZER(name) \ { RWSEM_UNLOCKED_VALUE, __SPIN_LOCK_UNLOCKED((name).wait_lock), \ diff --git a/include/linux/rwsem.h b/include/linux/rwsem.h index 496296d12d62..e8be18edb664 100644 --- a/include/linux/rwsem.h +++ b/include/linux/rwsem.h @@ -22,7 +22,18 @@ struct rw_semaphore; #ifdef CONFIG_RWSEM_GENERIC_SPINLOCK #include /* use a generic implementation */ #else -#include /* use an arch-specific implementation */ +/* All arch specific implementations share the same struct */ +struct rw_semaphore { + long count; + spinlock_t wait_lock; + struct list_head wait_list; +#ifdef CONFIG_DEBUG_LOCK_ALLOC + struct lockdep_map dep_map; +#endif +}; + +/* Include the arch specific part */ +#include #endif /* From 12249b34414dba7f386aadcf6be7ca36c6878300 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 26 Jan 2011 20:06:00 +0000 Subject: [PATCH 064/706] rwsem: Move duplicate init macros and functions to linux/rwsem.h The rwsem initializers and related macros and functions are mostly the same. Some of them lack the lockdep initializer, but having it in place does not matter for architectures which do not support lockdep. 
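To make the consolidation concrete, here is a rough expansion of the unified static initializer this series moves into include/linux/rwsem.h (an illustrative sketch only; my_sem is a hypothetical lock name):

/* DECLARE_RWSEM(my_sem) boils down to roughly: */
struct rw_semaphore my_sem = {
	RWSEM_UNLOCKED_VALUE,                    /* count: no readers or writers */
	__SPIN_LOCK_UNLOCKED(my_sem.wait_lock),  /* protects the waiter list */
	LIST_HEAD_INIT(my_sem.wait_list)         /* empty list of waiters */
	/* plus .dep_map = { .name = "my_sem" } when CONFIG_DEBUG_LOCK_ALLOC=y */
};

The per-arch effect of the move is summarized below.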
powerpc, sparc, x86: No functional change sh, s390: Removes the duplicate init_rwsem (inline and #define) alpha, ia64, xtensa: Use the lockdep capable init function in lib/rwsem.c which is just uninlining the init function for the LOCKDEP=n case Signed-off-by: Thomas Gleixner Cc: Peter Zijlstra Cc: David Howells Cc: Benjamin Herrenschmidt Cc: Matt Turner Acked-by: Tony Luck Acked-by: Heiko Carstens Cc: Paul Mundt Acked-by: David Miller Cc: Chris Zankel LKML-Reference: <20110126195833.771812729@linutronix.de> --- arch/alpha/include/asm/rwsem.h | 14 ------------- arch/ia64/include/asm/rwsem.h | 15 -------------- arch/powerpc/include/asm/rwsem.h | 27 ------------------------ arch/s390/include/asm/rwsem.h | 35 -------------------------------- arch/sh/include/asm/rwsem.h | 31 ---------------------------- arch/sparc/include/asm/rwsem.h | 23 --------------------- arch/x86/include/asm/rwsem.h | 25 ----------------------- arch/xtensa/include/asm/rwsem.h | 14 ------------- include/linux/rwsem-spinlock.h | 23 +-------------------- include/linux/rwsem.h | 25 +++++++++++++++++++++++ 10 files changed, 26 insertions(+), 206 deletions(-) diff --git a/arch/alpha/include/asm/rwsem.h b/arch/alpha/include/asm/rwsem.h index 3e5619ead29d..c9f5b90e926e 100644 --- a/arch/alpha/include/asm/rwsem.h +++ b/arch/alpha/include/asm/rwsem.h @@ -26,20 +26,6 @@ extern struct rw_semaphore *rwsem_downgrade_wake(struct rw_semaphore *sem); #define RWSEM_ACTIVE_READ_BIAS RWSEM_ACTIVE_BIAS #define RWSEM_ACTIVE_WRITE_BIAS (RWSEM_WAITING_BIAS + RWSEM_ACTIVE_BIAS) -#define __RWSEM_INITIALIZER(name) \ - { RWSEM_UNLOCKED_VALUE, __SPIN_LOCK_UNLOCKED((name).wait_lock), \ - LIST_HEAD_INIT((name).wait_list) } - -#define DECLARE_RWSEM(name) \ - struct rw_semaphore name = __RWSEM_INITIALIZER(name) - -static inline void init_rwsem(struct rw_semaphore *sem) -{ - sem->count = RWSEM_UNLOCKED_VALUE; - spin_lock_init(&sem->wait_lock); - INIT_LIST_HEAD(&sem->wait_list); -} - static inline void __down_read(struct rw_semaphore *sem) { long oldcount; diff --git a/arch/ia64/include/asm/rwsem.h b/arch/ia64/include/asm/rwsem.h index 1fe465804dc7..379854630e9d 100644 --- a/arch/ia64/include/asm/rwsem.h +++ b/arch/ia64/include/asm/rwsem.h @@ -34,26 +34,11 @@ #define RWSEM_ACTIVE_READ_BIAS RWSEM_ACTIVE_BIAS #define RWSEM_ACTIVE_WRITE_BIAS (RWSEM_WAITING_BIAS + RWSEM_ACTIVE_BIAS) -#define __RWSEM_INITIALIZER(name) \ - { RWSEM_UNLOCKED_VALUE, __SPIN_LOCK_UNLOCKED((name).wait_lock), \ - LIST_HEAD_INIT((name).wait_list) } - -#define DECLARE_RWSEM(name) \ - struct rw_semaphore name = __RWSEM_INITIALIZER(name) - extern struct rw_semaphore *rwsem_down_read_failed(struct rw_semaphore *sem); extern struct rw_semaphore *rwsem_down_write_failed(struct rw_semaphore *sem); extern struct rw_semaphore *rwsem_wake(struct rw_semaphore *sem); extern struct rw_semaphore *rwsem_downgrade_wake(struct rw_semaphore *sem); -static inline void -init_rwsem (struct rw_semaphore *sem) -{ - sem->count = RWSEM_UNLOCKED_VALUE; - spin_lock_init(&sem->wait_lock); - INIT_LIST_HEAD(&sem->wait_list); -} - /* * lock for reading */ diff --git a/arch/powerpc/include/asm/rwsem.h b/arch/powerpc/include/asm/rwsem.h index bc1acc229223..f86fdf743afb 100644 --- a/arch/powerpc/include/asm/rwsem.h +++ b/arch/powerpc/include/asm/rwsem.h @@ -28,38 +28,11 @@ #define RWSEM_ACTIVE_READ_BIAS RWSEM_ACTIVE_BIAS #define RWSEM_ACTIVE_WRITE_BIAS (RWSEM_WAITING_BIAS + RWSEM_ACTIVE_BIAS) -#ifdef CONFIG_DEBUG_LOCK_ALLOC -# define __RWSEM_DEP_MAP_INIT(lockname) , .dep_map = { .name = #lockname } -#else -# define 
__RWSEM_DEP_MAP_INIT(lockname) -#endif - -#define __RWSEM_INITIALIZER(name) \ -{ \ - RWSEM_UNLOCKED_VALUE, \ - __SPIN_LOCK_UNLOCKED((name).wait_lock), \ - LIST_HEAD_INIT((name).wait_list) \ - __RWSEM_DEP_MAP_INIT(name) \ -} - -#define DECLARE_RWSEM(name) \ - struct rw_semaphore name = __RWSEM_INITIALIZER(name) - extern struct rw_semaphore *rwsem_down_read_failed(struct rw_semaphore *sem); extern struct rw_semaphore *rwsem_down_write_failed(struct rw_semaphore *sem); extern struct rw_semaphore *rwsem_wake(struct rw_semaphore *sem); extern struct rw_semaphore *rwsem_downgrade_wake(struct rw_semaphore *sem); -extern void __init_rwsem(struct rw_semaphore *sem, const char *name, - struct lock_class_key *key); - -#define init_rwsem(sem) \ - do { \ - static struct lock_class_key __key; \ - \ - __init_rwsem((sem), #sem, &__key); \ - } while (0) - /* * lock for reading */ diff --git a/arch/s390/include/asm/rwsem.h b/arch/s390/include/asm/rwsem.h index 6e075f1d97b4..f0f527756ee1 100644 --- a/arch/s390/include/asm/rwsem.h +++ b/arch/s390/include/asm/rwsem.h @@ -63,41 +63,6 @@ extern struct rw_semaphore *rwsem_downgrade_write(struct rw_semaphore *); #define RWSEM_ACTIVE_READ_BIAS RWSEM_ACTIVE_BIAS #define RWSEM_ACTIVE_WRITE_BIAS (RWSEM_WAITING_BIAS + RWSEM_ACTIVE_BIAS) -/* - * initialisation - */ - -#ifdef CONFIG_DEBUG_LOCK_ALLOC -# define __RWSEM_DEP_MAP_INIT(lockname) , .dep_map = { .name = #lockname } -#else -# define __RWSEM_DEP_MAP_INIT(lockname) -#endif - -#define __RWSEM_INITIALIZER(name) \ - { RWSEM_UNLOCKED_VALUE, __SPIN_LOCK_UNLOCKED((name).wait.lock), \ - LIST_HEAD_INIT((name).wait_list) __RWSEM_DEP_MAP_INIT(name) } - -#define DECLARE_RWSEM(name) \ - struct rw_semaphore name = __RWSEM_INITIALIZER(name) - -static inline void init_rwsem(struct rw_semaphore *sem) -{ - sem->count = RWSEM_UNLOCKED_VALUE; - spin_lock_init(&sem->wait_lock); - INIT_LIST_HEAD(&sem->wait_list); -} - -extern void __init_rwsem(struct rw_semaphore *sem, const char *name, - struct lock_class_key *key); - -#define init_rwsem(sem) \ -do { \ - static struct lock_class_key __key; \ - \ - __init_rwsem((sem), #sem, &__key); \ -} while (0) - - /* * lock for reading */ diff --git a/arch/sh/include/asm/rwsem.h b/arch/sh/include/asm/rwsem.h index dffc62589f79..798699d0687b 100644 --- a/arch/sh/include/asm/rwsem.h +++ b/arch/sh/include/asm/rwsem.h @@ -19,42 +19,11 @@ #define RWSEM_ACTIVE_READ_BIAS RWSEM_ACTIVE_BIAS #define RWSEM_ACTIVE_WRITE_BIAS (RWSEM_WAITING_BIAS + RWSEM_ACTIVE_BIAS) -#ifdef CONFIG_DEBUG_LOCK_ALLOC -# define __RWSEM_DEP_MAP_INIT(lockname) , .dep_map = { .name = #lockname } -#else -# define __RWSEM_DEP_MAP_INIT(lockname) -#endif - -#define __RWSEM_INITIALIZER(name) \ - { RWSEM_UNLOCKED_VALUE, __SPIN_LOCK_UNLOCKED((name).wait_lock), \ - LIST_HEAD_INIT((name).wait_list) \ - __RWSEM_DEP_MAP_INIT(name) } - -#define DECLARE_RWSEM(name) \ - struct rw_semaphore name = __RWSEM_INITIALIZER(name) - extern struct rw_semaphore *rwsem_down_read_failed(struct rw_semaphore *sem); extern struct rw_semaphore *rwsem_down_write_failed(struct rw_semaphore *sem); extern struct rw_semaphore *rwsem_wake(struct rw_semaphore *sem); extern struct rw_semaphore *rwsem_downgrade_wake(struct rw_semaphore *sem); -extern void __init_rwsem(struct rw_semaphore *sem, const char *name, - struct lock_class_key *key); - -#define init_rwsem(sem) \ -do { \ - static struct lock_class_key __key; \ - \ - __init_rwsem((sem), #sem, &__key); \ -} while (0) - -static inline void init_rwsem(struct rw_semaphore *sem) -{ - sem->count = RWSEM_UNLOCKED_VALUE; - 
spin_lock_init(&sem->wait_lock); - INIT_LIST_HEAD(&sem->wait_list); -} - /* * lock for reading */ diff --git a/arch/sparc/include/asm/rwsem.h b/arch/sparc/include/asm/rwsem.h index 4c16d1de2ab5..79f7c1ca6f91 100644 --- a/arch/sparc/include/asm/rwsem.h +++ b/arch/sparc/include/asm/rwsem.h @@ -20,34 +20,11 @@ #define RWSEM_ACTIVE_READ_BIAS RWSEM_ACTIVE_BIAS #define RWSEM_ACTIVE_WRITE_BIAS (RWSEM_WAITING_BIAS + RWSEM_ACTIVE_BIAS) -#ifdef CONFIG_DEBUG_LOCK_ALLOC -# define __RWSEM_DEP_MAP_INIT(lockname) , .dep_map = { .name = #lockname } -#else -# define __RWSEM_DEP_MAP_INIT(lockname) -#endif - -#define __RWSEM_INITIALIZER(name) \ -{ RWSEM_UNLOCKED_VALUE, __SPIN_LOCK_UNLOCKED((name).wait_lock), \ - LIST_HEAD_INIT((name).wait_list) __RWSEM_DEP_MAP_INIT(name) } - -#define DECLARE_RWSEM(name) \ - struct rw_semaphore name = __RWSEM_INITIALIZER(name) - extern struct rw_semaphore *rwsem_down_read_failed(struct rw_semaphore *sem); extern struct rw_semaphore *rwsem_down_write_failed(struct rw_semaphore *sem); extern struct rw_semaphore *rwsem_wake(struct rw_semaphore *sem); extern struct rw_semaphore *rwsem_downgrade_wake(struct rw_semaphore *sem); -extern void __init_rwsem(struct rw_semaphore *sem, const char *name, - struct lock_class_key *key); - -#define init_rwsem(sem) \ -do { \ - static struct lock_class_key __key; \ - \ - __init_rwsem((sem), #sem, &__key); \ -} while (0) - /* * lock for reading */ diff --git a/arch/x86/include/asm/rwsem.h b/arch/x86/include/asm/rwsem.h index 995cfe4c985d..c0c91b4ecc81 100644 --- a/arch/x86/include/asm/rwsem.h +++ b/arch/x86/include/asm/rwsem.h @@ -66,31 +66,6 @@ extern asmregparm struct rw_semaphore * #define RWSEM_ACTIVE_READ_BIAS RWSEM_ACTIVE_BIAS #define RWSEM_ACTIVE_WRITE_BIAS (RWSEM_WAITING_BIAS + RWSEM_ACTIVE_BIAS) -#ifdef CONFIG_DEBUG_LOCK_ALLOC -# define __RWSEM_DEP_MAP_INIT(lockname) , .dep_map = { .name = #lockname } -#else -# define __RWSEM_DEP_MAP_INIT(lockname) -#endif - -#define __RWSEM_INITIALIZER(name) \ -{ \ - RWSEM_UNLOCKED_VALUE, __SPIN_LOCK_UNLOCKED((name).wait_lock), \ - LIST_HEAD_INIT((name).wait_list) __RWSEM_DEP_MAP_INIT(name) \ -} - -#define DECLARE_RWSEM(name) \ - struct rw_semaphore name = __RWSEM_INITIALIZER(name) - -extern void __init_rwsem(struct rw_semaphore *sem, const char *name, - struct lock_class_key *key); - -#define init_rwsem(sem) \ -do { \ - static struct lock_class_key __key; \ - \ - __init_rwsem((sem), #sem, &__key); \ -} while (0) - /* * lock for reading */ diff --git a/arch/xtensa/include/asm/rwsem.h b/arch/xtensa/include/asm/rwsem.h index 585cab9b0bdf..2fa420212353 100644 --- a/arch/xtensa/include/asm/rwsem.h +++ b/arch/xtensa/include/asm/rwsem.h @@ -24,25 +24,11 @@ #define RWSEM_ACTIVE_READ_BIAS RWSEM_ACTIVE_BIAS #define RWSEM_ACTIVE_WRITE_BIAS (RWSEM_WAITING_BIAS + RWSEM_ACTIVE_BIAS) -#define __RWSEM_INITIALIZER(name) \ - { RWSEM_UNLOCKED_VALUE, __SPIN_LOCK_UNLOCKED((name).wait_lock), \ - LIST_HEAD_INIT((name).wait_list) } - -#define DECLARE_RWSEM(name) \ - struct rw_semaphore name = __RWSEM_INITIALIZER(name) - extern struct rw_semaphore *rwsem_down_read_failed(struct rw_semaphore *sem); extern struct rw_semaphore *rwsem_down_write_failed(struct rw_semaphore *sem); extern struct rw_semaphore *rwsem_wake(struct rw_semaphore *sem); extern struct rw_semaphore *rwsem_downgrade_wake(struct rw_semaphore *sem); -static inline void init_rwsem(struct rw_semaphore *sem) -{ - sem->count = RWSEM_UNLOCKED_VALUE; - spin_lock_init(&sem->wait_lock); - INIT_LIST_HEAD(&sem->wait_list); -} - /* * lock for reading */ diff --git 
a/include/linux/rwsem-spinlock.h b/include/linux/rwsem-spinlock.h index 8c0dc7fc07a4..34701241b673 100644 --- a/include/linux/rwsem-spinlock.h +++ b/include/linux/rwsem-spinlock.h @@ -29,28 +29,7 @@ struct rw_semaphore { #endif }; -#ifdef CONFIG_DEBUG_LOCK_ALLOC -# define __RWSEM_DEP_MAP_INIT(lockname) , .dep_map = { .name = #lockname } -#else -# define __RWSEM_DEP_MAP_INIT(lockname) -#endif - -#define __RWSEM_INITIALIZER(name) \ -{ 0, __SPIN_LOCK_UNLOCKED(name.wait_lock), LIST_HEAD_INIT((name).wait_list) \ - __RWSEM_DEP_MAP_INIT(name) } - -#define DECLARE_RWSEM(name) \ - struct rw_semaphore name = __RWSEM_INITIALIZER(name) - -extern void __init_rwsem(struct rw_semaphore *sem, const char *name, - struct lock_class_key *key); - -#define init_rwsem(sem) \ -do { \ - static struct lock_class_key __key; \ - \ - __init_rwsem((sem), #sem, &__key); \ -} while (0) +#define RWSEM_UNLOCKED_VALUE 0x00000000 extern void __down_read(struct rw_semaphore *sem); extern int __down_read_trylock(struct rw_semaphore *sem); diff --git a/include/linux/rwsem.h b/include/linux/rwsem.h index e8be18edb664..8970c3e802e2 100644 --- a/include/linux/rwsem.h +++ b/include/linux/rwsem.h @@ -36,6 +36,31 @@ struct rw_semaphore { #include #endif +/* Common initializer macros and functions */ + +#ifdef CONFIG_DEBUG_LOCK_ALLOC +# define __RWSEM_DEP_MAP_INIT(lockname) , .dep_map = { .name = #lockname } +#else +# define __RWSEM_DEP_MAP_INIT(lockname) +#endif + +#define __RWSEM_INITIALIZER(name) \ + { RWSEM_UNLOCKED_VALUE, __SPIN_LOCK_UNLOCKED(name.wait_lock), \ + LIST_HEAD_INIT((name).wait_list) __RWSEM_DEP_MAP_INIT(name) } + +#define DECLARE_RWSEM(name) \ + struct rw_semaphore name = __RWSEM_INITIALIZER(name) + +extern void __init_rwsem(struct rw_semaphore *sem, const char *name, + struct lock_class_key *key); + +#define init_rwsem(sem) \ +do { \ + static struct lock_class_key __key; \ + \ + __init_rwsem((sem), #sem, &__key); \ +} while (0) + /* * lock for reading */ From 41e5887fa39ab272d9266a09cbefdef270e28b93 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 26 Jan 2011 20:06:03 +0000 Subject: [PATCH 065/706] rwsem: Unify the duplicate rwsem_is_locked() inlines Instead of having the same implementation in each architecture, move it to linux/rwsem.h and remove the duplicates. It's unlikely that an arch will ever implement something different, but we can deal with that when it happens. 
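As a usage sketch (not part of the patch), the generic helper lets callers test the lock state the same way on every architecture; demo_sem and demo_check() are hypothetical names:

#include <linux/rwsem.h>

static DECLARE_RWSEM(demo_sem);

static void demo_check(void)
{
	down_read(&demo_sem);                    /* count becomes non-zero */
	WARN_ON(!rwsem_is_locked(&demo_sem));    /* now holds on all arches */
	up_read(&demo_sem);                      /* count drops back to 0 */
}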
Signed-off-by: Thomas Gleixner Cc: Peter Zijlstra Cc: David Howells Cc: Benjamin Herrenschmidt Cc: Matt Turner Acked-by: Tony Luck Acked-by: Heiko Carstens Cc: Paul Mundt Acked-by: David Miller Cc: Chris Zankel LKML-Reference: <20110126195833.876773757@linutronix.de> Signed-off-by: Thomas Gleixner --- arch/alpha/include/asm/rwsem.h | 5 ----- arch/ia64/include/asm/rwsem.h | 5 ----- arch/powerpc/include/asm/rwsem.h | 5 ----- arch/s390/include/asm/rwsem.h | 5 ----- arch/sh/include/asm/rwsem.h | 5 ----- arch/sparc/include/asm/rwsem.h | 5 ----- arch/x86/include/asm/rwsem.h | 5 ----- arch/xtensa/include/asm/rwsem.h | 5 ----- include/linux/rwsem.h | 7 +++++++ 9 files changed, 7 insertions(+), 40 deletions(-) diff --git a/arch/alpha/include/asm/rwsem.h b/arch/alpha/include/asm/rwsem.h index c9f5b90e926e..fc7f54337c3a 100644 --- a/arch/alpha/include/asm/rwsem.h +++ b/arch/alpha/include/asm/rwsem.h @@ -224,10 +224,5 @@ static inline long rwsem_atomic_update(long val, struct rw_semaphore *sem) #endif } -static inline int rwsem_is_locked(struct rw_semaphore *sem) -{ - return (sem->count != 0); -} - #endif /* __KERNEL__ */ #endif /* _ALPHA_RWSEM_H */ diff --git a/arch/ia64/include/asm/rwsem.h b/arch/ia64/include/asm/rwsem.h index 379854630e9d..148b61a96b8c 100644 --- a/arch/ia64/include/asm/rwsem.h +++ b/arch/ia64/include/asm/rwsem.h @@ -147,9 +147,4 @@ __downgrade_write (struct rw_semaphore *sem) #define rwsem_atomic_add(delta, sem) atomic64_add(delta, (atomic64_t *)(&(sem)->count)) #define rwsem_atomic_update(delta, sem) atomic64_add_return(delta, (atomic64_t *)(&(sem)->count)) -static inline int rwsem_is_locked(struct rw_semaphore *sem) -{ - return (sem->count != 0); -} - #endif /* _ASM_IA64_RWSEM_H */ diff --git a/arch/powerpc/include/asm/rwsem.h b/arch/powerpc/include/asm/rwsem.h index f86fdf743afb..38e8e5c508d5 100644 --- a/arch/powerpc/include/asm/rwsem.h +++ b/arch/powerpc/include/asm/rwsem.h @@ -133,10 +133,5 @@ static inline long rwsem_atomic_update(long delta, struct rw_semaphore *sem) return atomic_long_add_return(delta, (atomic_long_t *)&sem->count); } -static inline int rwsem_is_locked(struct rw_semaphore *sem) -{ - return sem->count != 0; -} - #endif /* __KERNEL__ */ #endif /* _ASM_POWERPC_RWSEM_H */ diff --git a/arch/s390/include/asm/rwsem.h b/arch/s390/include/asm/rwsem.h index f0f527756ee1..80522dcb6cc2 100644 --- a/arch/s390/include/asm/rwsem.h +++ b/arch/s390/include/asm/rwsem.h @@ -325,10 +325,5 @@ static inline long rwsem_atomic_update(long delta, struct rw_semaphore *sem) return new; } -static inline int rwsem_is_locked(struct rw_semaphore *sem) -{ - return (sem->count != 0); -} - #endif /* __KERNEL__ */ #endif /* _S390_RWSEM_H */ diff --git a/arch/sh/include/asm/rwsem.h b/arch/sh/include/asm/rwsem.h index 798699d0687b..2f8cf9761eb5 100644 --- a/arch/sh/include/asm/rwsem.h +++ b/arch/sh/include/asm/rwsem.h @@ -133,10 +133,5 @@ static inline int rwsem_atomic_update(int delta, struct rw_semaphore *sem) return atomic_add_return(delta, (atomic_t *)(&sem->count)); } -static inline int rwsem_is_locked(struct rw_semaphore *sem) -{ - return (sem->count != 0); -} - #endif /* __KERNEL__ */ #endif /* _ASM_SH_RWSEM_H */ diff --git a/arch/sparc/include/asm/rwsem.h b/arch/sparc/include/asm/rwsem.h index 79f7c1ca6f91..4f4391fd5658 100644 --- a/arch/sparc/include/asm/rwsem.h +++ b/arch/sparc/include/asm/rwsem.h @@ -124,11 +124,6 @@ static inline long rwsem_atomic_update(long delta, struct rw_semaphore *sem) return atomic64_add_return(delta, (atomic64_t *)(&sem->count)); } -static inline int 
rwsem_is_locked(struct rw_semaphore *sem) -{ - return (sem->count != 0); -} - #endif /* __KERNEL__ */ #endif /* _SPARC64_RWSEM_H */ diff --git a/arch/x86/include/asm/rwsem.h b/arch/x86/include/asm/rwsem.h index c0c91b4ecc81..776d76fe2f90 100644 --- a/arch/x86/include/asm/rwsem.h +++ b/arch/x86/include/asm/rwsem.h @@ -222,10 +222,5 @@ static inline long rwsem_atomic_update(long delta, struct rw_semaphore *sem) return tmp + delta; } -static inline int rwsem_is_locked(struct rw_semaphore *sem) -{ - return (sem->count != 0); -} - #endif /* __KERNEL__ */ #endif /* _ASM_X86_RWSEM_H */ diff --git a/arch/xtensa/include/asm/rwsem.h b/arch/xtensa/include/asm/rwsem.h index 2fa420212353..da61633ccba8 100644 --- a/arch/xtensa/include/asm/rwsem.h +++ b/arch/xtensa/include/asm/rwsem.h @@ -133,9 +133,4 @@ static inline int rwsem_atomic_update(int delta, struct rw_semaphore *sem) return atomic_add_return(delta, (atomic_t *)(&sem->count)); } -static inline int rwsem_is_locked(struct rw_semaphore *sem) -{ - return (sem->count != 0); -} - #endif /* _XTENSA_RWSEM_H */ diff --git a/include/linux/rwsem.h b/include/linux/rwsem.h index 8970c3e802e2..8b9c7e9f6910 100644 --- a/include/linux/rwsem.h +++ b/include/linux/rwsem.h @@ -34,6 +34,13 @@ struct rw_semaphore { /* Include the arch specific part */ #include + +/* In all implementations count != 0 means locked */ +static inline int rwsem_is_locked(struct rw_semaphore *sem) +{ + return sem->count != 0; +} + #endif /* Common initializer macros and functions */ From aac72277fda6ef788bb8d5deaa502ce9b9b6e472 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 26 Jan 2011 20:06:06 +0000 Subject: [PATCH 066/706] rwsem: Move duplicate function prototypes to linux/rwsem.h All architecture specific rwsem headers carry the same function prototypes. Just x86 adds asmregparm, which is an empty define on all other architectures. S390 has a stale rwsem_downgrade_write() prototype. 
Remove the duplicates and add the prototypes to linux/rwsem.h Signed-off-by: Thomas Gleixner Cc: Peter Zijlstra Cc: David Howells Cc: Benjamin Herrenschmidt Cc: Richard Henderson Acked-by: Tony Luck Acked-by: Heiko Carstens Cc: Paul Mundt Acked-by: David Miller Cc: Chris Zankel LKML-Reference: <20110126195833.970840140@linutronix.de> Signed-off-by: Thomas Gleixner --- arch/alpha/include/asm/rwsem.h | 5 ----- arch/ia64/include/asm/rwsem.h | 5 ----- arch/powerpc/include/asm/rwsem.h | 5 ----- arch/s390/include/asm/rwsem.h | 6 ------ arch/sh/include/asm/rwsem.h | 5 ----- arch/sparc/include/asm/rwsem.h | 5 ----- arch/x86/include/asm/rwsem.h | 9 --------- arch/xtensa/include/asm/rwsem.h | 5 ----- include/linux/rwsem.h | 5 +++++ 9 files changed, 5 insertions(+), 45 deletions(-) diff --git a/arch/alpha/include/asm/rwsem.h b/arch/alpha/include/asm/rwsem.h index fc7f54337c3a..a83bbea62c67 100644 --- a/arch/alpha/include/asm/rwsem.h +++ b/arch/alpha/include/asm/rwsem.h @@ -14,11 +14,6 @@ #include -extern struct rw_semaphore *rwsem_down_read_failed(struct rw_semaphore *sem); -extern struct rw_semaphore *rwsem_down_write_failed(struct rw_semaphore *sem); -extern struct rw_semaphore *rwsem_wake(struct rw_semaphore *); -extern struct rw_semaphore *rwsem_downgrade_wake(struct rw_semaphore *sem); - #define RWSEM_UNLOCKED_VALUE 0x0000000000000000L #define RWSEM_ACTIVE_BIAS 0x0000000000000001L #define RWSEM_ACTIVE_MASK 0x00000000ffffffffL diff --git a/arch/ia64/include/asm/rwsem.h b/arch/ia64/include/asm/rwsem.h index 148b61a96b8c..3027e7516d85 100644 --- a/arch/ia64/include/asm/rwsem.h +++ b/arch/ia64/include/asm/rwsem.h @@ -34,11 +34,6 @@ #define RWSEM_ACTIVE_READ_BIAS RWSEM_ACTIVE_BIAS #define RWSEM_ACTIVE_WRITE_BIAS (RWSEM_WAITING_BIAS + RWSEM_ACTIVE_BIAS) -extern struct rw_semaphore *rwsem_down_read_failed(struct rw_semaphore *sem); -extern struct rw_semaphore *rwsem_down_write_failed(struct rw_semaphore *sem); -extern struct rw_semaphore *rwsem_wake(struct rw_semaphore *sem); -extern struct rw_semaphore *rwsem_downgrade_wake(struct rw_semaphore *sem); - /* * lock for reading */ diff --git a/arch/powerpc/include/asm/rwsem.h b/arch/powerpc/include/asm/rwsem.h index 38e8e5c508d5..bb1e2cdeb9bf 100644 --- a/arch/powerpc/include/asm/rwsem.h +++ b/arch/powerpc/include/asm/rwsem.h @@ -28,11 +28,6 @@ #define RWSEM_ACTIVE_READ_BIAS RWSEM_ACTIVE_BIAS #define RWSEM_ACTIVE_WRITE_BIAS (RWSEM_WAITING_BIAS + RWSEM_ACTIVE_BIAS) -extern struct rw_semaphore *rwsem_down_read_failed(struct rw_semaphore *sem); -extern struct rw_semaphore *rwsem_down_write_failed(struct rw_semaphore *sem); -extern struct rw_semaphore *rwsem_wake(struct rw_semaphore *sem); -extern struct rw_semaphore *rwsem_downgrade_wake(struct rw_semaphore *sem); - /* * lock for reading */ diff --git a/arch/s390/include/asm/rwsem.h b/arch/s390/include/asm/rwsem.h index 80522dcb6cc2..d0eb4653cebd 100644 --- a/arch/s390/include/asm/rwsem.h +++ b/arch/s390/include/asm/rwsem.h @@ -43,12 +43,6 @@ #ifdef __KERNEL__ -extern struct rw_semaphore *rwsem_down_read_failed(struct rw_semaphore *); -extern struct rw_semaphore *rwsem_down_write_failed(struct rw_semaphore *); -extern struct rw_semaphore *rwsem_wake(struct rw_semaphore *); -extern struct rw_semaphore *rwsem_downgrade_wake(struct rw_semaphore *); -extern struct rw_semaphore *rwsem_downgrade_write(struct rw_semaphore *); - #ifndef __s390x__ #define RWSEM_UNLOCKED_VALUE 0x00000000 #define RWSEM_ACTIVE_BIAS 0x00000001 diff --git a/arch/sh/include/asm/rwsem.h b/arch/sh/include/asm/rwsem.h index 
2f8cf9761eb5..edab57265293 100644 --- a/arch/sh/include/asm/rwsem.h +++ b/arch/sh/include/asm/rwsem.h @@ -19,11 +19,6 @@ #define RWSEM_ACTIVE_READ_BIAS RWSEM_ACTIVE_BIAS #define RWSEM_ACTIVE_WRITE_BIAS (RWSEM_WAITING_BIAS + RWSEM_ACTIVE_BIAS) -extern struct rw_semaphore *rwsem_down_read_failed(struct rw_semaphore *sem); -extern struct rw_semaphore *rwsem_down_write_failed(struct rw_semaphore *sem); -extern struct rw_semaphore *rwsem_wake(struct rw_semaphore *sem); -extern struct rw_semaphore *rwsem_downgrade_wake(struct rw_semaphore *sem); - /* * lock for reading */ diff --git a/arch/sparc/include/asm/rwsem.h b/arch/sparc/include/asm/rwsem.h index 4f4391fd5658..069bf4d663a1 100644 --- a/arch/sparc/include/asm/rwsem.h +++ b/arch/sparc/include/asm/rwsem.h @@ -20,11 +20,6 @@ #define RWSEM_ACTIVE_READ_BIAS RWSEM_ACTIVE_BIAS #define RWSEM_ACTIVE_WRITE_BIAS (RWSEM_WAITING_BIAS + RWSEM_ACTIVE_BIAS) -extern struct rw_semaphore *rwsem_down_read_failed(struct rw_semaphore *sem); -extern struct rw_semaphore *rwsem_down_write_failed(struct rw_semaphore *sem); -extern struct rw_semaphore *rwsem_wake(struct rw_semaphore *sem); -extern struct rw_semaphore *rwsem_downgrade_wake(struct rw_semaphore *sem); - /* * lock for reading */ diff --git a/arch/x86/include/asm/rwsem.h b/arch/x86/include/asm/rwsem.h index 776d76fe2f90..df4cd32b4cc6 100644 --- a/arch/x86/include/asm/rwsem.h +++ b/arch/x86/include/asm/rwsem.h @@ -39,15 +39,6 @@ #ifdef __KERNEL__ #include -extern asmregparm struct rw_semaphore * - rwsem_down_read_failed(struct rw_semaphore *sem); -extern asmregparm struct rw_semaphore * - rwsem_down_write_failed(struct rw_semaphore *sem); -extern asmregparm struct rw_semaphore * - rwsem_wake(struct rw_semaphore *); -extern asmregparm struct rw_semaphore * - rwsem_downgrade_wake(struct rw_semaphore *sem); - /* * The bias values and the counter type limits the number of * potential readers/writers to 32767 for 32 bits and 2147483647 diff --git a/arch/xtensa/include/asm/rwsem.h b/arch/xtensa/include/asm/rwsem.h index da61633ccba8..249619e7e7f2 100644 --- a/arch/xtensa/include/asm/rwsem.h +++ b/arch/xtensa/include/asm/rwsem.h @@ -24,11 +24,6 @@ #define RWSEM_ACTIVE_READ_BIAS RWSEM_ACTIVE_BIAS #define RWSEM_ACTIVE_WRITE_BIAS (RWSEM_WAITING_BIAS + RWSEM_ACTIVE_BIAS) -extern struct rw_semaphore *rwsem_down_read_failed(struct rw_semaphore *sem); -extern struct rw_semaphore *rwsem_down_write_failed(struct rw_semaphore *sem); -extern struct rw_semaphore *rwsem_wake(struct rw_semaphore *sem); -extern struct rw_semaphore *rwsem_downgrade_wake(struct rw_semaphore *sem); - /* * lock for reading */ diff --git a/include/linux/rwsem.h b/include/linux/rwsem.h index 8b9c7e9f6910..f7c957f3165f 100644 --- a/include/linux/rwsem.h +++ b/include/linux/rwsem.h @@ -32,6 +32,11 @@ struct rw_semaphore { #endif }; +extern asmregparm struct rw_semaphore *rwsem_down_read_failed(struct rw_semaphore *sem); +extern asmregparm struct rw_semaphore *rwsem_down_write_failed(struct rw_semaphore *sem); +extern asmregparm struct rw_semaphore *rwsem_wake(struct rw_semaphore *); +extern asmregparm struct rw_semaphore *rwsem_downgrade_wake(struct rw_semaphore *sem); + /* Include the arch specific part */ #include From d123375425d7df4b6081a631fc1203fceafa59b2 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 26 Jan 2011 21:32:01 +0100 Subject: [PATCH 067/706] rwsem: Remove redundant asmregparm annotation Peter Zijlstra pointed out, that the only user of asmregparm (x86) is compiling the kernel already with -mregparm=3. 
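For context, the definitions in play look roughly like this (recalled from the linkage headers of that era, so treat the exact spelling as approximate):

/* arch/x86/include/asm/linkage.h (32-bit): pass arguments in registers */
#define asmregparm __attribute__((regparm(3)))

/* include/linux/linkage.h fallback: empty on every other architecture */
#ifndef asmregparm
# define asmregparm
#endif

With the x86-32 kernel already built with -mregparm=3 globally, the attribute changes nothing about the generated calling convention.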
So the annotation of the rwsem functions is redundant. Remove it. Signed-off-by: Thomas Gleixner Cc: Peter Zijlstra Cc: David Howells Cc: Benjamin Herrenschmidt Cc: Matt Turner Cc: Tony Luck Cc: Heiko Carstens Cc: Paul Mundt Cc: David Miller Cc: Chris Zankel LKML-Reference: Signed-off-by: Thomas Gleixner --- include/linux/rwsem.h | 8 ++++---- lib/rwsem.c | 10 ++++------ 2 files changed, 8 insertions(+), 10 deletions(-) diff --git a/include/linux/rwsem.h b/include/linux/rwsem.h index f7c957f3165f..a8afe9cd000c 100644 --- a/include/linux/rwsem.h +++ b/include/linux/rwsem.h @@ -32,10 +32,10 @@ struct rw_semaphore { #endif }; -extern asmregparm struct rw_semaphore *rwsem_down_read_failed(struct rw_semaphore *sem); -extern asmregparm struct rw_semaphore *rwsem_down_write_failed(struct rw_semaphore *sem); -extern asmregparm struct rw_semaphore *rwsem_wake(struct rw_semaphore *); -extern asmregparm struct rw_semaphore *rwsem_downgrade_wake(struct rw_semaphore *sem); +extern struct rw_semaphore *rwsem_down_read_failed(struct rw_semaphore *sem); +extern struct rw_semaphore *rwsem_down_write_failed(struct rw_semaphore *sem); +extern struct rw_semaphore *rwsem_wake(struct rw_semaphore *); +extern struct rw_semaphore *rwsem_downgrade_wake(struct rw_semaphore *sem); /* Include the arch specific part */ #include diff --git a/lib/rwsem.c b/lib/rwsem.c index f236d7cd5cf3..aa7c3052261f 100644 --- a/lib/rwsem.c +++ b/lib/rwsem.c @@ -222,8 +222,7 @@ rwsem_down_failed_common(struct rw_semaphore *sem, /* * wait for the read lock to be granted */ -asmregparm struct rw_semaphore __sched * -rwsem_down_read_failed(struct rw_semaphore *sem) +struct rw_semaphore __sched *rwsem_down_read_failed(struct rw_semaphore *sem) { return rwsem_down_failed_common(sem, RWSEM_WAITING_FOR_READ, -RWSEM_ACTIVE_READ_BIAS); @@ -232,8 +231,7 @@ rwsem_down_read_failed(struct rw_semaphore *sem) /* * wait for the write lock to be granted */ -asmregparm struct rw_semaphore __sched * -rwsem_down_write_failed(struct rw_semaphore *sem) +struct rw_semaphore __sched *rwsem_down_write_failed(struct rw_semaphore *sem) { return rwsem_down_failed_common(sem, RWSEM_WAITING_FOR_WRITE, -RWSEM_ACTIVE_WRITE_BIAS); @@ -243,7 +241,7 @@ rwsem_down_write_failed(struct rw_semaphore *sem) * handle waking up a waiter on the semaphore * - up_read/up_write has decremented the active part of count if we come here */ -asmregparm struct rw_semaphore *rwsem_wake(struct rw_semaphore *sem) +struct rw_semaphore *rwsem_wake(struct rw_semaphore *sem) { unsigned long flags; @@ -263,7 +261,7 @@ asmregparm struct rw_semaphore *rwsem_wake(struct rw_semaphore *sem) * - caller incremented waiting part of count and discovered it still negative * - just wake up any readers at the front of the queue */ -asmregparm struct rw_semaphore *rwsem_downgrade_wake(struct rw_semaphore *sem) +struct rw_semaphore *rwsem_downgrade_wake(struct rw_semaphore *sem) { unsigned long flags; From 6ea72f12069306b235151c5b05ac0cca7e1dedfa Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 26 Jan 2011 13:36:03 +0100 Subject: [PATCH 068/706] sched: Avoid expensive initial update_cfs_load(), on UP too Fix the build on UP. 
Signed-off-by: Peter Zijlstra Cc: Paul Turner LKML-Reference: <20110122044852.102126037@google.com> Signed-off-by: Ingo Molnar --- kernel/sched.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/kernel/sched.c b/kernel/sched.c index 78fa75394011..477e1bcc63f9 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -7922,7 +7922,9 @@ static void init_cfs_rq(struct cfs_rq *cfs_rq, struct rq *rq) #ifdef CONFIG_FAIR_GROUP_SCHED cfs_rq->rq = rq; /* allow initial update_cfs_load() to truncate */ +#ifdef CONFIG_SMP cfs_rq->load_stamp = 1; +#endif #endif cfs_rq->min_vruntime = (u64)(-(1LL << 20)); } From dda99116969142cc41e945a1047a419b937536af Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Fri, 21 Jan 2011 15:30:01 -0800 Subject: [PATCH 069/706] x86, perf: Change two init functions to static init_hw_perf_events() is called via early_initcall now. x86_pmu_event_init() is an x86_pmu member function. So we can change both to static. Signed-off-by: Yinghai Lu Cc: Peter Zijlstra Cc: Paul Mackerras Cc: Arnaldo Carvalho de Melo LKML-Reference: <4D3A16F9.109@kernel.org> Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index 9d977a2ea693..4d98789b0664 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -1389,7 +1389,7 @@ static void __init pmu_check_apic(void) pr_info("no hardware sampling interrupt available.\n"); } -int __init init_hw_perf_events(void) +static int __init init_hw_perf_events(void) { struct event_constraint *c; int err; @@ -1608,7 +1608,7 @@ out: return ret; } -int x86_pmu_event_init(struct perf_event *event) +static int x86_pmu_event_init(struct perf_event *event) { struct pmu *tmp; int err; From 8161239a8bcce9ad6b537c04a1fa3b5c68bae693 Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Fri, 14 Jan 2011 17:09:41 +0800 Subject: [PATCH 070/706] rtmutex: Simplify PI algorithm and make highest prio task get lock In the current rtmutex, the pending owner may be boosted by the tasks in the rtmutex's waitlist when the pending owner is deboosted or a task in the waitlist is boosted. This boosting is unrelated, because the pending owner does not really take the rtmutex yet, so it is not reasonable. Example: time1: A (high prio) owns the rtmutex; B (mid prio) and C (low prio) are in the waitlist. time2: A releases the lock and B becomes the pending owner, while A (or another high prio task) continues to run. B's prio is lower than A's, so B is just queued on the runqueue. time3: A or the other high prio task sleeps, but some time has passed, and B's and C's prios were changed in the period (time2 ~ time3) due to boosting or deboosting. Now C has a higher priority than B. Is it reasonable that C has to boost B and help B to get the rtmutex? No. It is unrelated/unneeded boosting before B really owns the rtmutex. We should give C a chance to beat B and win the rtmutex. This is the motivation of this patch. This patch *ensures* that only the top waiter or a higher priority task can take the lock. How? 1) We do not dequeue the top waiter on unlock; if the top waiter changes, the old top waiter will fail to take the lock and go to sleep again. 2) When acquiring the lock, a task gets it when the lock is not taken and: there is no waiter, OR the task has higher priority than the waiters, OR the task is the top waiter. 3) Any time the top waiter changes, the new top waiter is woken up. The algorithm is much simpler than before: no pending owner, no boosting for the pending owner.
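Condensed into C-like pseudocode, the new acquisition rule of try_to_take_rt_mutex() in the patch below reads as follows (a sketch distilled from the diff; lower ->prio means higher priority in the kernel):

/* Can @task take @lock now?  @waiter is task's queued waiter, or NULL. */
if (rt_mutex_owner(lock))
	return 0;                /* really owned by someone: no */
if (rt_mutex_has_waiters(lock) &&
    task->prio >= rt_mutex_top_waiter(lock)->list_entry.prio &&
    (!waiter || waiter != rt_mutex_top_waiter(lock)))
	return 0;                /* neither higher prio than the waiters nor top waiter */
/* otherwise take it: dequeue our waiter (if any), move the remaining
 * top waiter onto task->pi_waiters, and set task as the owner */
return 1;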
Other advantages of this patch: 1) The states of a rtmutex are reduced by half, which makes the code easier to read. 2) The code becomes shorter. 3) The top waiter is not dequeued until it really takes the lock, so waiters retain FIFO order when the lock is stolen. Neither advantage nor disadvantage: 1) Even though we may wake up multiple waiters (any time the top waiter changes), we hardly cause a "thundering herd"; the number of woken up tasks is likely one or very few. 2) Two APIs are changed: rt_mutex_owner() no longer returns the pending owner and returns NULL when the top waiter is going to take the lock; rt_mutex_next_owner() always returns the top waiter and will not return NULL if we have waiters, because the top waiter is not dequeued. I have fixed the code that uses these APIs. Still to be updated after this patch is accepted: 1) Documentation/* 2) the testcase scripts/rt-tester/t4-l2-pi-deboost.tst Signed-off-by: Lai Jiangshan LKML-Reference: <4D3012D5.4060709@cn.fujitsu.com> Reviewed-by: Steven Rostedt Signed-off-by: Steven Rostedt --- kernel/futex.c | 22 +-- kernel/rtmutex-debug.c | 1 - kernel/rtmutex.c | 318 ++++++++++++++-------------------- kernel/rtmutex_common.h | 16 +- 4 files changed, 127 insertions(+), 230 deletions(-) diff --git a/kernel/futex.c b/kernel/futex.c index b766d28accd6..64c38115c7b6 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -1556,10 +1556,10 @@ static int fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q, /* * We are here either because we stole the rtmutex from the - * pending owner or we are the pending owner which failed to - * get the rtmutex. We have to replace the pending owner TID - * in the user space variable. This must be atomic as we have - * to preserve the owner died bit here. + * previous highest priority waiter or we are the highest priority + * waiter but failed to get the rtmutex the first time. + * We have to replace the newowner TID in the user space variable. + * This must be atomic as we have to preserve the owner died bit here. * * Note: We write the user space value _before_ changing the pi_state * because we can fault here. Imagine swapped out pages or a fork @@ -1608,8 +1608,8 @@ retry: /* * To handle the page fault we need to drop the hash bucket - * lock here. That gives the other task (either the pending - * owner itself or the task which stole the rtmutex) the + * lock here. That gives the other task (either the highest priority + * waiter itself or the task which stole the rtmutex) the * chance to try the fixup of the pi_state. So once we are * back from handling the fault we need to check the pi_state * after reacquiring the hash bucket lock and before trying to @@ -1685,18 +1685,20 @@ static int fixup_owner(u32 __user *uaddr, struct futex_q *q, int locked) /* * pi_state is incorrect, some other task did a lock steal and * we returned due to timeout or signal without taking the - * rt_mutex. Too late. We can access the rt_mutex_owner without - * locking, as the other task is now blocked on the hash bucket - * lock. Fix the state up. + * rt_mutex. Too late. */ + raw_spin_lock(&q->pi_state->pi_mutex.wait_lock); owner = rt_mutex_owner(&q->pi_state->pi_mutex); + if (!owner) + owner = rt_mutex_next_owner(&q->pi_state->pi_mutex); + raw_spin_unlock(&q->pi_state->pi_mutex.wait_lock); ret = fixup_pi_state_owner(uaddr, q, owner); goto out; } /* * Paranoia check. If we did not take the lock, then we should not be - * the owner, nor the pending owner, of the rt_mutex.
*/ if (rt_mutex_owner(&q->pi_state->pi_mutex) == current) printk(KERN_ERR "fixup_owner: ret = %d pi-mutex: %p " diff --git a/kernel/rtmutex-debug.c b/kernel/rtmutex-debug.c index ddabb54bb5c8..3c7cbc2c33be 100644 --- a/kernel/rtmutex-debug.c +++ b/kernel/rtmutex-debug.c @@ -215,7 +215,6 @@ void debug_rt_mutex_free_waiter(struct rt_mutex_waiter *waiter) put_pid(waiter->deadlock_task_pid); TRACE_WARN_ON(!plist_node_empty(&waiter->list_entry)); TRACE_WARN_ON(!plist_node_empty(&waiter->pi_list_entry)); - TRACE_WARN_ON(waiter->task); memset(waiter, 0x22, sizeof(*waiter)); } diff --git a/kernel/rtmutex.c b/kernel/rtmutex.c index a9604815786a..ab449117aaf2 100644 --- a/kernel/rtmutex.c +++ b/kernel/rtmutex.c @@ -20,41 +20,34 @@ /* * lock->owner state tracking: * - * lock->owner holds the task_struct pointer of the owner. Bit 0 and 1 - * are used to keep track of the "owner is pending" and "lock has - * waiters" state. + * lock->owner holds the task_struct pointer of the owner. Bit 0 + * is used to keep track of the "lock has waiters" state. * - * owner bit1 bit0 - * NULL 0 0 lock is free (fast acquire possible) - * NULL 0 1 invalid state - * NULL 1 0 Transitional State* - * NULL 1 1 invalid state - * taskpointer 0 0 lock is held (fast release possible) - * taskpointer 0 1 task is pending owner - * taskpointer 1 0 lock is held and has waiters - * taskpointer 1 1 task is pending owner and lock has more waiters - * - * Pending ownership is assigned to the top (highest priority) - * waiter of the lock, when the lock is released. The thread is woken - * up and can now take the lock. Until the lock is taken (bit 0 - * cleared) a competing higher priority thread can steal the lock - * which puts the woken up thread back on the waiters list. + * owner bit0 + * NULL 0 lock is free (fast acquire possible) + * NULL 1 lock is free and has waiters and the top waiter + * is going to take the lock* + * taskpointer 0 lock is held (fast release possible) + * taskpointer 1 lock is held and has waiters** * * The fast atomic compare exchange based acquire and release is only - * possible when bit 0 and 1 of lock->owner are 0. + * possible when bit 0 of lock->owner is 0. * - * (*) There's a small time where the owner can be NULL and the - * "lock has waiters" bit is set. This can happen when grabbing the lock. - * To prevent a cmpxchg of the owner releasing the lock, we need to set this - * bit before looking at the lock, hence the reason this is a transitional - * state. + * (*) It also can be a transitional state when grabbing the lock + * with ->wait_lock is held. To prevent any fast path cmpxchg to the lock, + * we need to set the bit0 before looking at the lock, and the owner may be + * NULL in this small time, hence this can be a transitional state. + * + * (**) There is a small time when bit 0 is set but there are no + * waiters. This can happen when grabbing the lock in the slow path. + * To prevent a cmpxchg of the owner releasing the lock, we need to + * set this bit before looking at the lock. */ static void -rt_mutex_set_owner(struct rt_mutex *lock, struct task_struct *owner, - unsigned long mask) +rt_mutex_set_owner(struct rt_mutex *lock, struct task_struct *owner) { - unsigned long val = (unsigned long)owner | mask; + unsigned long val = (unsigned long)owner; if (rt_mutex_has_waiters(lock)) val |= RT_MUTEX_HAS_WAITERS; @@ -203,15 +196,14 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task, * reached or the state of the chain has changed while we * dropped the locks. 
*/ - if (!waiter || !waiter->task) + if (!waiter) goto out_unlock_pi; /* * Check the orig_waiter state. After we dropped the locks, - * the previous owner of the lock might have released the lock - * and made us the pending owner: + * the previous owner of the lock might have released the lock. */ - if (orig_waiter && !orig_waiter->task) + if (orig_waiter && !rt_mutex_owner(orig_lock)) goto out_unlock_pi; /* @@ -254,6 +246,17 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task, /* Release the task */ raw_spin_unlock_irqrestore(&task->pi_lock, flags); + if (!rt_mutex_owner(lock)) { + /* + * If the requeue above changed the top waiter, then we need + * to wake the new top waiter up to try to get the lock. + */ + + if (top_waiter != rt_mutex_top_waiter(lock)) + wake_up_process(rt_mutex_top_waiter(lock)->task); + raw_spin_unlock(&lock->wait_lock); + goto out_put_task; + } put_task_struct(task); /* Grab the next task */ @@ -295,79 +298,17 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task, return ret; } -/* - * Optimization: check if we can steal the lock from the - * assigned pending owner [which might not have taken the - * lock yet]: - */ -static inline int try_to_steal_lock(struct rt_mutex *lock, - struct task_struct *task) -{ - struct task_struct *pendowner = rt_mutex_owner(lock); - struct rt_mutex_waiter *next; - unsigned long flags; - - if (!rt_mutex_owner_pending(lock)) - return 0; - - if (pendowner == task) - return 1; - - raw_spin_lock_irqsave(&pendowner->pi_lock, flags); - if (task->prio >= pendowner->prio) { - raw_spin_unlock_irqrestore(&pendowner->pi_lock, flags); - return 0; - } - - /* - * Check if a waiter is enqueued on the pending owners - * pi_waiters list. Remove it and readjust pending owners - * priority. - */ - if (likely(!rt_mutex_has_waiters(lock))) { - raw_spin_unlock_irqrestore(&pendowner->pi_lock, flags); - return 1; - } - - /* No chain handling, pending owner is not blocked on anything: */ - next = rt_mutex_top_waiter(lock); - plist_del(&next->pi_list_entry, &pendowner->pi_waiters); - __rt_mutex_adjust_prio(pendowner); - raw_spin_unlock_irqrestore(&pendowner->pi_lock, flags); - - /* - * We are going to steal the lock and a waiter was - * enqueued on the pending owners pi_waiters queue. So - * we have to enqueue this waiter into - * task->pi_waiters list. This covers the case, - * where task is boosted because it holds another - * lock and gets unboosted because the booster is - * interrupted, so we would delay a waiter with higher - * priority as task->normal_prio. - * - * Note: in the rare case of a SCHED_OTHER task changing - * its priority and thus stealing the lock, next->task - * might be task: - */ - if (likely(next->task != task)) { - raw_spin_lock_irqsave(&task->pi_lock, flags); - plist_add(&next->pi_list_entry, &task->pi_waiters); - __rt_mutex_adjust_prio(task); - raw_spin_unlock_irqrestore(&task->pi_lock, flags); - } - return 1; -} - /* * Try to take an rt-mutex * - * This fails - * - when the lock has a real owner - * - when a different pending owner exists and has higher priority than current - * * Must be called with lock->wait_lock held. + * + * @lock: the lock to be acquired. + * @task: the task which wants to acquire the lock + * @waiter: the waiter that is queued to the lock's wait list. 
(could be NULL) */ -static int try_to_take_rt_mutex(struct rt_mutex *lock) +static int try_to_take_rt_mutex(struct rt_mutex *lock, struct task_struct *task, + struct rt_mutex_waiter *waiter) { /* * We have to be careful here if the atomic speedups are @@ -390,15 +331,52 @@ static int try_to_take_rt_mutex(struct rt_mutex *lock) */ mark_rt_mutex_waiters(lock); - if (rt_mutex_owner(lock) && !try_to_steal_lock(lock, current)) + if (rt_mutex_owner(lock)) return 0; + /* + * It will get the lock because of one of these conditions: + * 1) there is no waiter + * 2) higher priority than waiters + * 3) it is top waiter + */ + if (rt_mutex_has_waiters(lock)) { + if (task->prio >= rt_mutex_top_waiter(lock)->list_entry.prio) { + if (!waiter || waiter != rt_mutex_top_waiter(lock)) + return 0; + } + } + + if (waiter || rt_mutex_has_waiters(lock)) { + unsigned long flags; + struct rt_mutex_waiter *top; + + raw_spin_lock_irqsave(&task->pi_lock, flags); + + /* remove the queued waiter. */ + if (waiter) { + plist_del(&waiter->list_entry, &lock->wait_list); + task->pi_blocked_on = NULL; + } + + /* + * We have to enqueue the top waiter(if it exists) into + * task->pi_waiters list. + */ + if (rt_mutex_has_waiters(lock)) { + top = rt_mutex_top_waiter(lock); + top->pi_list_entry.prio = top->list_entry.prio; + plist_add(&top->pi_list_entry, &task->pi_waiters); + } + raw_spin_unlock_irqrestore(&task->pi_lock, flags); + } + /* We got the lock. */ debug_rt_mutex_lock(lock); - rt_mutex_set_owner(lock, current, 0); + rt_mutex_set_owner(lock, task); - rt_mutex_deadlock_account_lock(lock, current); + rt_mutex_deadlock_account_lock(lock, task); return 1; } @@ -436,6 +414,9 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock, raw_spin_unlock_irqrestore(&task->pi_lock, flags); + if (!owner) + return 0; + if (waiter == rt_mutex_top_waiter(lock)) { raw_spin_lock_irqsave(&owner->pi_lock, flags); plist_del(&top_waiter->pi_list_entry, &owner->pi_waiters); @@ -472,21 +453,18 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock, /* * Wake up the next waiter on the lock. * - * Remove the top waiter from the current tasks waiter list and from - * the lock waiter list. Set it as pending owner. Then wake it up. + * Remove the top waiter from the current tasks waiter list and wake it up. * * Called with lock->wait_lock held. */ static void wakeup_next_waiter(struct rt_mutex *lock) { struct rt_mutex_waiter *waiter; - struct task_struct *pendowner; unsigned long flags; raw_spin_lock_irqsave(¤t->pi_lock, flags); waiter = rt_mutex_top_waiter(lock); - plist_del(&waiter->list_entry, &lock->wait_list); /* * Remove it from current->pi_waiters. We do not adjust a @@ -495,43 +473,19 @@ static void wakeup_next_waiter(struct rt_mutex *lock) * lock->wait_lock. */ plist_del(&waiter->pi_list_entry, ¤t->pi_waiters); - pendowner = waiter->task; - waiter->task = NULL; - rt_mutex_set_owner(lock, pendowner, RT_MUTEX_OWNER_PENDING); + rt_mutex_set_owner(lock, NULL); raw_spin_unlock_irqrestore(¤t->pi_lock, flags); - /* - * Clear the pi_blocked_on variable and enqueue a possible - * waiter into the pi_waiters list of the pending owner. This - * prevents that in case the pending owner gets unboosted a - * waiter with higher priority than pending-owner->normal_prio - * is blocked on the unboosted (pending) owner. 
- */ - raw_spin_lock_irqsave(&pendowner->pi_lock, flags); - - WARN_ON(!pendowner->pi_blocked_on); - WARN_ON(pendowner->pi_blocked_on != waiter); - WARN_ON(pendowner->pi_blocked_on->lock != lock); - - pendowner->pi_blocked_on = NULL; - - if (rt_mutex_has_waiters(lock)) { - struct rt_mutex_waiter *next; - - next = rt_mutex_top_waiter(lock); - plist_add(&next->pi_list_entry, &pendowner->pi_waiters); - } - raw_spin_unlock_irqrestore(&pendowner->pi_lock, flags); - - wake_up_process(pendowner); + wake_up_process(waiter->task); } /* - * Remove a waiter from a lock + * Remove a waiter from a lock and give up * - * Must be called with lock->wait_lock held + * Must be called with lock->wait_lock held and + * have just failed to try_to_take_rt_mutex(). */ static void remove_waiter(struct rt_mutex *lock, struct rt_mutex_waiter *waiter) @@ -543,11 +497,13 @@ static void remove_waiter(struct rt_mutex *lock, raw_spin_lock_irqsave(¤t->pi_lock, flags); plist_del(&waiter->list_entry, &lock->wait_list); - waiter->task = NULL; current->pi_blocked_on = NULL; raw_spin_unlock_irqrestore(¤t->pi_lock, flags); - if (first && owner != current) { + if (!owner) + return; + + if (first) { raw_spin_lock_irqsave(&owner->pi_lock, flags); @@ -614,21 +570,19 @@ void rt_mutex_adjust_pi(struct task_struct *task) * or TASK_UNINTERRUPTIBLE) * @timeout: the pre-initialized and started timer, or NULL for none * @waiter: the pre-initialized rt_mutex_waiter - * @detect_deadlock: passed to task_blocks_on_rt_mutex * * lock->wait_lock must be held by the caller. */ static int __sched __rt_mutex_slowlock(struct rt_mutex *lock, int state, struct hrtimer_sleeper *timeout, - struct rt_mutex_waiter *waiter, - int detect_deadlock) + struct rt_mutex_waiter *waiter) { int ret = 0; for (;;) { /* Try to acquire the lock: */ - if (try_to_take_rt_mutex(lock)) + if (try_to_take_rt_mutex(lock, current, waiter)) break; /* @@ -645,39 +599,11 @@ __rt_mutex_slowlock(struct rt_mutex *lock, int state, break; } - /* - * waiter->task is NULL the first time we come here and - * when we have been woken up by the previous owner - * but the lock got stolen by a higher prio task. - */ - if (!waiter->task) { - ret = task_blocks_on_rt_mutex(lock, waiter, current, - detect_deadlock); - /* - * If we got woken up by the owner then start loop - * all over without going into schedule to try - * to get the lock now: - */ - if (unlikely(!waiter->task)) { - /* - * Reset the return value. We might - * have returned with -EDEADLK and the - * owner released the lock while we - * were walking the pi chain. 
- */ - ret = 0; - continue; - } - if (unlikely(ret)) - break; - } - raw_spin_unlock(&lock->wait_lock); debug_rt_mutex_print_deadlock(waiter); - if (waiter->task) - schedule_rt_mutex(lock); + schedule_rt_mutex(lock); raw_spin_lock(&lock->wait_lock); set_current_state(state); @@ -698,12 +624,11 @@ rt_mutex_slowlock(struct rt_mutex *lock, int state, int ret = 0; debug_rt_mutex_init_waiter(&waiter); - waiter.task = NULL; raw_spin_lock(&lock->wait_lock); /* Try to acquire the lock again: */ - if (try_to_take_rt_mutex(lock)) { + if (try_to_take_rt_mutex(lock, current, NULL)) { raw_spin_unlock(&lock->wait_lock); return 0; } @@ -717,12 +642,14 @@ rt_mutex_slowlock(struct rt_mutex *lock, int state, timeout->task = NULL; } - ret = __rt_mutex_slowlock(lock, state, timeout, &waiter, - detect_deadlock); + ret = task_blocks_on_rt_mutex(lock, &waiter, current, detect_deadlock); + + if (likely(!ret)) + ret = __rt_mutex_slowlock(lock, state, timeout, &waiter); set_current_state(TASK_RUNNING); - if (unlikely(waiter.task)) + if (unlikely(ret)) remove_waiter(lock, &waiter); /* @@ -737,14 +664,6 @@ rt_mutex_slowlock(struct rt_mutex *lock, int state, if (unlikely(timeout)) hrtimer_cancel(&timeout->timer); - /* - * Readjust priority, when we did not get the lock. We might - * have been the pending owner and boosted. Since we did not - * take the lock, the PI boost has to go. - */ - if (unlikely(ret)) - rt_mutex_adjust_prio(current); - debug_rt_mutex_free_waiter(&waiter); return ret; @@ -762,7 +681,7 @@ rt_mutex_slowtrylock(struct rt_mutex *lock) if (likely(rt_mutex_owner(lock) != current)) { - ret = try_to_take_rt_mutex(lock); + ret = try_to_take_rt_mutex(lock, current, NULL); /* * try_to_take_rt_mutex() sets the lock waiters * bit unconditionally. Clean this up. @@ -992,7 +911,7 @@ void rt_mutex_init_proxy_locked(struct rt_mutex *lock, { __rt_mutex_init(lock, NULL); debug_rt_mutex_proxy_lock(lock, proxy_owner); - rt_mutex_set_owner(lock, proxy_owner, 0); + rt_mutex_set_owner(lock, proxy_owner); rt_mutex_deadlock_account_lock(lock, proxy_owner); } @@ -1008,7 +927,7 @@ void rt_mutex_proxy_unlock(struct rt_mutex *lock, struct task_struct *proxy_owner) { debug_rt_mutex_proxy_unlock(lock); - rt_mutex_set_owner(lock, NULL, 0); + rt_mutex_set_owner(lock, NULL); rt_mutex_deadlock_account_unlock(proxy_owner); } @@ -1034,20 +953,14 @@ int rt_mutex_start_proxy_lock(struct rt_mutex *lock, raw_spin_lock(&lock->wait_lock); - mark_rt_mutex_waiters(lock); - - if (!rt_mutex_owner(lock) || try_to_steal_lock(lock, task)) { - /* We got the lock for task. */ - debug_rt_mutex_lock(lock); - rt_mutex_set_owner(lock, task, 0); + if (try_to_take_rt_mutex(lock, task, NULL)) { raw_spin_unlock(&lock->wait_lock); - rt_mutex_deadlock_account_lock(lock, task); return 1; } ret = task_blocks_on_rt_mutex(lock, waiter, task, detect_deadlock); - if (ret && !waiter->task) { + if (ret && !rt_mutex_owner(lock)) { /* * Reset the return value. 
We might have * returned with -EDEADLK and the owner
@@ -1056,6 +969,10 @@ int rt_mutex_start_proxy_lock(struct rt_mutex *lock, */ ret = 0; } +
+ if (unlikely(ret)) + remove_waiter(lock, waiter); +
raw_spin_unlock(&lock->wait_lock); debug_rt_mutex_print_deadlock(waiter);
@@ -1110,12 +1027,11 @@ int rt_mutex_finish_proxy_lock(struct rt_mutex *lock,
set_current_state(TASK_INTERRUPTIBLE);
- ret = __rt_mutex_slowlock(lock, TASK_INTERRUPTIBLE, to, waiter, - detect_deadlock);
+ ret = __rt_mutex_slowlock(lock, TASK_INTERRUPTIBLE, to, waiter);
set_current_state(TASK_RUNNING);
- if (unlikely(waiter->task)) + if (unlikely(ret)) remove_waiter(lock, waiter); /*
@@ -1126,13 +1042,5 @@ int rt_mutex_finish_proxy_lock(struct rt_mutex *lock,
raw_spin_unlock(&lock->wait_lock);
- /* - * Readjust priority, when we did not get the lock. We might have been
- * the pending owner and boosted. Since we did not take the lock, the
- * PI boost has to go. - */ - if (unlikely(ret)) - rt_mutex_adjust_prio(current); -
return ret; }
diff --git a/kernel/rtmutex_common.h b/kernel/rtmutex_common.h
index 97a2f81866af..53a66c85261b 100644
--- a/kernel/rtmutex_common.h +++ b/kernel/rtmutex_common.h
@@ -91,9 +91,8 @@ task_top_pi_waiter(struct task_struct *p)
/* * lock->owner state tracking: */
-#define RT_MUTEX_OWNER_PENDING 1UL -#define RT_MUTEX_HAS_WAITERS 2UL
-#define RT_MUTEX_OWNER_MASKALL 3UL +#define RT_MUTEX_HAS_WAITERS 1UL
+#define RT_MUTEX_OWNER_MASKALL 1UL
static inline struct task_struct *rt_mutex_owner(struct rt_mutex *lock) {
@@ -101,17 +100,6 @@ static inline struct task_struct *rt_mutex_owner(struct rt_mutex *lock)
((unsigned long)lock->owner & ~RT_MUTEX_OWNER_MASKALL); }
-static inline struct task_struct *rt_mutex_real_owner(struct rt_mutex *lock) -{
- return (struct task_struct *) - ((unsigned long)lock->owner & ~RT_MUTEX_HAS_WAITERS); -}
- -static inline unsigned long rt_mutex_owner_pending(struct rt_mutex *lock) -{
- return (unsigned long)lock->owner & RT_MUTEX_OWNER_PENDING; -} -
/* * PI-futex support (proxy locking functions, etc.): */
From 68baa431ec2f14ba7510d4e79bceb6ceaf0d3b74 Mon Sep 17 00:00:00 2001
From: Masami Hiramatsu
Date: Thu, 20 Jan 2011 23:15:30 +0900
Subject: [PATCH 071/706] perf tools: Add strfilter for general purpose string filter

Add strfilter for general purpose string filter.

Every filter rule is described by a glob matching pattern and an optional '!' prefix, which means logical NOT. A strfilter consists of those filter rules connected with '&' and '|'. A set of rules can be folded by using '(' and ')'. It also accepts spaces around rules and those operators.

Format:
 <rule> ::= <glob-exp> | "!" <rule> | <rule> <op> <rule> | "(" <rule> ")"
 <op>   ::= "&" | "|"

e.g.:
 "(add* | del*) & *timer" is a rule which passes strings that start with add or del and end with timer.

This will be used by perf probe --filter.

Changes in V2:
 - Fix to check the result of strdup() and strfilter__alloc().
 - Encapsulate and simplify interfaces, like regex(3).
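As a rough usage sketch of the resulting API (only strfilter__new(), strfilter__compare() and strfilter__delete() come from this patch; the surrounding helper and its error handling are hypothetical):

	#include <stdio.h>
	#include "util/strfilter.h"

	/* Hypothetical helper: print only the names accepted by a rule */
	static void show_matching(const char **names, int nr)
	{
		const char *err;
		struct strfilter *f;
		int i;

		/* On parse error, err points into the rule string;
		 * it is NULL if memory allocation failed. */
		f = strfilter__new("(add* | del*) & *timer", &err);
		if (!f)
			return;

		for (i = 0; i < nr; i++)
			if (strfilter__compare(f, names[i]))
				printf("%s\n", names[i]);

		strfilter__delete(f);
	}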
Cc: 2nddept-manager@sdl.hitachi.co.jp Cc: Franck Bui-Huu Cc: Ingo Molnar Cc: Paul Mackerras Cc: Peter Zijlstra Cc: Srikar Dronamraju Cc: Steven Rostedt LKML-Reference: <20110120141530.25915.12673.stgit@ltc236.sdl.hitachi.co.jp> Signed-off-by: Masami Hiramatsu Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/Makefile | 2 + tools/perf/util/strfilter.c | 200 ++++++++++++++++++++++++++++++++++++ tools/perf/util/strfilter.h | 48 +++++++++ 3 files changed, 250 insertions(+) create mode 100644 tools/perf/util/strfilter.c create mode 100644 tools/perf/util/strfilter.h diff --git a/tools/perf/Makefile b/tools/perf/Makefile index 638e8e146bb9..eedcf9541572 100644 --- a/tools/perf/Makefile +++ b/tools/perf/Makefile @@ -417,6 +417,7 @@ LIB_H += util/help.h LIB_H += util/session.h LIB_H += util/strbuf.h LIB_H += util/strlist.h +LIB_H += util/strfilter.h LIB_H += util/svghelper.h LIB_H += util/run-command.h LIB_H += util/sigchain.h @@ -458,6 +459,7 @@ LIB_OBJS += $(OUTPUT)util/quote.o LIB_OBJS += $(OUTPUT)util/strbuf.o LIB_OBJS += $(OUTPUT)util/string.o LIB_OBJS += $(OUTPUT)util/strlist.o +LIB_OBJS += $(OUTPUT)util/strfilter.o LIB_OBJS += $(OUTPUT)util/usage.o LIB_OBJS += $(OUTPUT)util/wrapper.o LIB_OBJS += $(OUTPUT)util/sigchain.o diff --git a/tools/perf/util/strfilter.c b/tools/perf/util/strfilter.c new file mode 100644 index 000000000000..4064b7d708b8 --- /dev/null +++ b/tools/perf/util/strfilter.c @@ -0,0 +1,200 @@ +#include +#include "util.h" +#include "string.h" +#include "strfilter.h" + +/* Operators */ +static const char *OP_and = "&"; /* Logical AND */ +static const char *OP_or = "|"; /* Logical OR */ +static const char *OP_not = "!"; /* Logical NOT */ + +#define is_operator(c) ((c) == '|' || (c) == '&' || (c) == '!') +#define is_separator(c) (is_operator(c) || (c) == '(' || (c) == ')') + +static void strfilter_node__delete(struct strfilter_node *self) +{ + if (self) { + if (self->p && !is_operator(*self->p)) + free((char *)self->p); + strfilter_node__delete(self->l); + strfilter_node__delete(self->r); + free(self); + } +} + +void strfilter__delete(struct strfilter *self) +{ + if (self) { + strfilter_node__delete(self->root); + free(self); + } +} + +static const char *get_token(const char *s, const char **e) +{ + const char *p; + + while (isspace(*s)) /* Skip spaces */ + s++; + + if (*s == '\0') { + p = s; + goto end; + } + + p = s + 1; + if (!is_separator(*s)) { + /* End search */ +retry: + while (*p && !is_separator(*p) && !isspace(*p)) + p++; + /* Escape and special case: '!' is also used in glob pattern */ + if (*(p - 1) == '\\' || (*p == '!' 
&& *(p - 1) == '[')) { + p++; + goto retry; + } + } +end: + *e = p; + return s; +} + +static struct strfilter_node *strfilter_node__alloc(const char *op, + struct strfilter_node *l, + struct strfilter_node *r) +{ + struct strfilter_node *ret = zalloc(sizeof(struct strfilter_node)); + + if (ret) { + ret->p = op; + ret->l = l; + ret->r = r; + } + + return ret; +} + +static struct strfilter_node *strfilter_node__new(const char *s, + const char **ep) +{ + struct strfilter_node root, *cur, *last_op; + const char *e; + + if (!s) + return NULL; + + memset(&root, 0, sizeof(root)); + last_op = cur = &root; + + s = get_token(s, &e); + while (*s != '\0' && *s != ')') { + switch (*s) { + case '&': /* Exchg last OP->r with AND */ + if (!cur->r || !last_op->r) + goto error; + cur = strfilter_node__alloc(OP_and, last_op->r, NULL); + if (!cur) + goto nomem; + last_op->r = cur; + last_op = cur; + break; + case '|': /* Exchg the root with OR */ + if (!cur->r || !root.r) + goto error; + cur = strfilter_node__alloc(OP_or, root.r, NULL); + if (!cur) + goto nomem; + root.r = cur; + last_op = cur; + break; + case '!': /* Add NOT as a leaf node */ + if (cur->r) + goto error; + cur->r = strfilter_node__alloc(OP_not, NULL, NULL); + if (!cur->r) + goto nomem; + cur = cur->r; + break; + case '(': /* Recursively parses inside the parenthesis */ + if (cur->r) + goto error; + cur->r = strfilter_node__new(s + 1, &s); + if (!s) + goto nomem; + if (!cur->r || *s != ')') + goto error; + e = s + 1; + break; + default: + if (cur->r) + goto error; + cur->r = strfilter_node__alloc(NULL, NULL, NULL); + if (!cur->r) + goto nomem; + cur->r->p = strndup(s, e - s); + if (!cur->r->p) + goto nomem; + } + s = get_token(e, &e); + } + if (!cur->r) + goto error; + *ep = s; + return root.r; +nomem: + s = NULL; +error: + *ep = s; + strfilter_node__delete(root.r); + return NULL; +} + +/* + * Parse filter rule and return new strfilter. + * Return NULL if fail, and *ep == NULL if memory allocation failed. 
+ */ +struct strfilter *strfilter__new(const char *rules, const char **err) +{
+ struct strfilter *ret = zalloc(sizeof(struct strfilter)); + const char *ep = NULL; +
+ if (ret) + ret->root = strfilter_node__new(rules, &ep); +
+ if (!ret || !ret->root || *ep != '\0') { + if (err) + *err = ep;
+ strfilter__delete(ret); + ret = NULL; + } +
+ return ret; +} +
+static bool strfilter_node__compare(struct strfilter_node *self, + const char *str) +{
+ if (!self || !self->p) + return false; +
+ switch (*self->p) {
+ case '|': /* OR */ + return strfilter_node__compare(self->l, str) || + strfilter_node__compare(self->r, str);
+ case '&': /* AND */ + return strfilter_node__compare(self->l, str) && + strfilter_node__compare(self->r, str);
+ case '!': /* NOT */ + return !strfilter_node__compare(self->r, str);
+ default: + return strglobmatch(str, self->p); + } +} +
+/* Return true if STR matches the filter rules */
+bool strfilter__compare(struct strfilter *self, const char *str) +{
+ if (!self) + return false; + return strfilter_node__compare(self->root, str); +}
diff --git a/tools/perf/util/strfilter.h b/tools/perf/util/strfilter.h
new file mode 100644
index 000000000000..00f58a7506de
--- /dev/null +++ b/tools/perf/util/strfilter.h
@@ -0,0 +1,48 @@
+#ifndef __PERF_STRFILTER_H +#define __PERF_STRFILTER_H
+/* General purpose glob matching filter */ +
+#include +#include +
+/* A node of string filter */
+struct strfilter_node {
+ struct strfilter_node *l; /* Tree left branch (for &,|) */
+ struct strfilter_node *r; /* Tree right branch (for !,&,|) */
+ const char *p; /* Operator or rule */ +}; +
+/* String filter */
+struct strfilter { + struct strfilter_node *root; +}; +
+/**
+ * strfilter__new - Create a new string filter
+ * @rules: Filter rule, which is a combination of glob expressions.
+ * @err: Pointer which points to an error detected on @rules
+ *
+ * Parse @rules and return a new strfilter. Return NULL if an error is
+ * detected. In that case, *@err will indicate where the error was detected,
+ * and *@err is NULL if memory allocation failed.
+ */
+struct strfilter *strfilter__new(const char *rules, const char **err); +
+/**
+ * strfilter__compare - compare given string and a string filter
+ * @self: String filter
+ * @str: target string
+ *
+ * Compare @str and @self. Return true if @str matches the rules.
+ */
+bool strfilter__compare(struct strfilter *self, const char *str); +
+/**
+ * strfilter__delete - delete a string filter
+ * @self: String filter to delete
+ *
+ * Delete @self.
+ */
+void strfilter__delete(struct strfilter *self); +
+#endif
From bd09d7b5efeb13965b6725b4a3e9944908bca9d2 Mon Sep 17 00:00:00 2001
From: Masami Hiramatsu
Date: Thu, 20 Jan 2011 23:15:39 +0900
Subject: [PATCH 072/706] perf probe: Add variable filter support

Add filter support for the available-variables list. The default filter is "!__k???tab_*&!__crc_*", for filtering out automatically generated symbols.

The format of a filter rule is "[!]GLOBPATTERN", so you can use wild cards. If the filter rule starts with '!', matched variables are filtered out.

e.g.:
 # perf probe -V schedule --externs --filter=cpu*
 Available variables at schedule
 @
 cpumask_var_t cpu_callout_mask
 cpumask_var_t cpu_core_map
 cpumask_var_t cpu_isolated_map
 cpumask_var_t cpu_sibling_map
 int cpu_number
 long unsigned int* cpu_bit_bitmap
 ...
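For leaf patterns strfilter falls back on strglobmatch(), so the default rule above amounts to the following per-name check (an illustrative expansion, not code from this patch; the "keep" variable is hypothetical):

	/* "!__k???tab_* & !__crc_*": keep names matching neither pattern */
	bool keep = !strglobmatch(name, "__k???tab_*") &&
		    !strglobmatch(name, "__crc_*");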
Cc: 2nddept-manager@sdl.hitachi.co.jp Cc: Chase Douglas Cc: Franck Bui-Huu Cc: Ingo Molnar Cc: Paul Mackerras Cc: Peter Zijlstra Cc: Srikar Dronamraju Cc: Steven Rostedt LKML-Reference: <20110120141539.25915.43401.stgit@ltc236.sdl.hitachi.co.jp> Signed-off-by: Masami Hiramatsu [ committer note: Removed the elf.h include as it was fixed up in e80711c] Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/Documentation/perf-probe.txt | 14 +++++ tools/perf/builtin-probe.c | 34 +++++++++++++ tools/perf/util/probe-event.c | 68 +++++++++++++++---------- tools/perf/util/probe-event.h | 3 +- 4 files changed, 91 insertions(+), 28 deletions(-) diff --git a/tools/perf/Documentation/perf-probe.txt b/tools/perf/Documentation/perf-probe.txt index fcc51fe0195c..32fb18f1695d 100644 --- a/tools/perf/Documentation/perf-probe.txt +++ b/tools/perf/Documentation/perf-probe.txt @@ -77,6 +77,12 @@ OPTIONS --funcs:: Show available functions in given module or kernel. +--filter=FILTER:: + (Only for --vars) Set filter for variables. FILTER is a combination of + glob pattern, see FILTER PATTERN for details. + Default FILTER is "!__k???tab_* & !__crc_*". + If several filters are specified, only the last filter is valid. + -f:: --force:: Forcibly add events with existing name. @@ -139,6 +145,14 @@ e.g. This provides some sort of flexibility and robustness to probe point definitions against minor code changes. For example, actual 10th line of schedule() can be moved easily by modifying schedule(), but the same line matching 'rq=cpu_rq*' may still exist in the function.) +FILTER PATTERN +-------------- + The filter pattern is a glob matching pattern(s) to filter variables. + In addition, you can use "!" for specifying filter-out rule. You also can give several rules combined with "&" or "|", and fold those rules as one rule by using "(" ")". + +e.g. + With --filter "foo* | bar*", perf probe -V shows variables which start with "foo" or "bar". + With --filter "!foo* & *bar", perf probe -V shows variables which don't start with "foo" and end with "bar", like "fizzbar". But "foobar" is filtered out. EXAMPLES -------- diff --git a/tools/perf/builtin-probe.c b/tools/perf/builtin-probe.c index 6cf708aba7c9..abb423e164c8 100644 --- a/tools/perf/builtin-probe.c +++ b/tools/perf/builtin-probe.c @@ -36,6 +36,7 @@ #include "builtin.h" #include "util/util.h" #include "util/strlist.h" +#include "util/strfilter.h" #include "util/symbol.h" #include "util/debug.h" #include "util/debugfs.h" @@ -43,6 +44,7 @@ #include "util/probe-finder.h" #include "util/probe-event.h" +#define DEFAULT_VAR_FILTER "!__k???tab_* & !__crc_*" #define MAX_PATH_LEN 256 /* Session management structure */ @@ -60,6 +62,7 @@ static struct { struct line_range line_range; const char *target_module; int max_probe_points; + struct strfilter *filter; } params; /* Parse an event definition. Note that any error must die. 
*/ @@ -156,6 +159,27 @@ static int opt_show_vars(const struct option *opt __used, return ret; } + +static int opt_set_filter(const struct option *opt __used, + const char *str, int unset __used) +{ + const char *err; + + if (str) { + pr_debug2("Set filter: %s\n", str); + if (params.filter) + strfilter__delete(params.filter); + params.filter = strfilter__new(str, &err); + if (!params.filter) { + pr_err("Filter parse error at %ld.\n", err - str + 1); + pr_err("Source: \"%s\"\n", str); + pr_err(" %*c\n", (int)(err - str + 1), '^'); + return -EINVAL; + } + } + + return 0; +} #endif static const char * const probe_usage[] = { @@ -212,6 +236,10 @@ static const struct option options[] = { "Show accessible variables on PROBEDEF", opt_show_vars), OPT_BOOLEAN('\0', "externs", ¶ms.show_ext_vars, "Show external variables too (with --vars only)"), + OPT_CALLBACK('\0', "filter", NULL, + "[!]FILTER", "Set a variable filter (with --vars only)\n" + "\t\t\t(default: \"" DEFAULT_VAR_FILTER "\")", + opt_set_filter), OPT_STRING('k', "vmlinux", &symbol_conf.vmlinux_name, "file", "vmlinux pathname"), OPT_STRING('s', "source", &symbol_conf.source_prefix, @@ -324,10 +352,16 @@ int cmd_probe(int argc, const char **argv, const char *prefix __used) " --add/--del.\n"); usage_with_options(probe_usage, options); } + if (!params.filter) + params.filter = strfilter__new(DEFAULT_VAR_FILTER, + NULL); + ret = show_available_vars(params.events, params.nevents, params.max_probe_points, params.target_module, + params.filter, params.show_ext_vars); + strfilter__delete(params.filter); if (ret < 0) pr_err(" Error: Failed to show vars. (%d)\n", ret); return ret; diff --git a/tools/perf/util/probe-event.c b/tools/perf/util/probe-event.c index 859d377a3df3..077e0518f0f7 100644 --- a/tools/perf/util/probe-event.c +++ b/tools/perf/util/probe-event.c @@ -451,12 +451,14 @@ end: } static int show_available_vars_at(int fd, struct perf_probe_event *pev, - int max_vls, bool externs) + int max_vls, struct strfilter *_filter, + bool externs) { char *buf; - int ret, i; + int ret, i, nvars; struct str_node *node; struct variable_list *vls = NULL, *vl; + const char *var; buf = synthesize_perf_probe_point(&pev->point); if (!buf) @@ -464,36 +466,45 @@ static int show_available_vars_at(int fd, struct perf_probe_event *pev, pr_debug("Searching variables at %s\n", buf); ret = find_available_vars_at(fd, pev, &vls, max_vls, externs); - if (ret > 0) { - /* Some variables were found */ - fprintf(stdout, "Available variables at %s\n", buf); - for (i = 0; i < ret; i++) { - vl = &vls[i]; - /* - * A probe point might be converted to - * several trace points. - */ - fprintf(stdout, "\t@<%s+%lu>\n", vl->point.symbol, - vl->point.offset); - free(vl->point.symbol); - if (vl->vars) { - strlist__for_each(node, vl->vars) - fprintf(stdout, "\t\t%s\n", node->s); - strlist__delete(vl->vars); - } else - fprintf(stdout, "(No variables)\n"); - } - free(vls); - } else + if (ret <= 0) { pr_err("Failed to find variables at %s (%d)\n", buf, ret); - + goto end; + } + /* Some variables are found */ + fprintf(stdout, "Available variables at %s\n", buf); + for (i = 0; i < ret; i++) { + vl = &vls[i]; + /* + * A probe point might be converted to + * several trace points. 
+ */ + fprintf(stdout, "\t@<%s+%lu>\n", vl->point.symbol, + vl->point.offset); + free(vl->point.symbol); + nvars = 0; + if (vl->vars) { + strlist__for_each(node, vl->vars) { + var = strchr(node->s, '\t') + 1; + if (strfilter__compare(_filter, var)) { + fprintf(stdout, "\t\t%s\n", node->s); + nvars++; + } + } + strlist__delete(vl->vars); + } + if (nvars == 0) + fprintf(stdout, "\t\t(No matched variables)\n"); + } + free(vls); +end: free(buf); return ret; } /* Show available variables on given probe point */ int show_available_vars(struct perf_probe_event *pevs, int npevs, - int max_vls, const char *module, bool externs) + int max_vls, const char *module, + struct strfilter *_filter, bool externs) { int i, fd, ret = 0; @@ -510,7 +521,8 @@ int show_available_vars(struct perf_probe_event *pevs, int npevs, setup_pager(); for (i = 0; i < npevs && ret >= 0; i++) - ret = show_available_vars_at(fd, &pevs[i], max_vls, externs); + ret = show_available_vars_at(fd, &pevs[i], max_vls, _filter, + externs); close(fd); return ret; @@ -556,7 +568,9 @@ int show_line_range(struct line_range *lr __unused, const char *module __unused) int show_available_vars(struct perf_probe_event *pevs __unused, int npevs __unused, int max_vls __unused, - const char *module __unused, bool externs __unused) + const char *module __unused, + struct strfilter *filter __unused, + bool externs __unused) { pr_warning("Debuginfo-analysis is not supported.\n"); return -ENOSYS; diff --git a/tools/perf/util/probe-event.h b/tools/perf/util/probe-event.h index 1fb4f18337d3..4e80b2bbc516 100644 --- a/tools/perf/util/probe-event.h +++ b/tools/perf/util/probe-event.h @@ -3,6 +3,7 @@ #include #include "strlist.h" +#include "strfilter.h" extern bool probe_event_dry_run; @@ -126,7 +127,7 @@ extern int show_perf_probe_events(void); extern int show_line_range(struct line_range *lr, const char *module); extern int show_available_vars(struct perf_probe_event *pevs, int npevs, int max_probe_points, const char *module, - bool externs); + struct strfilter *filter, bool externs); extern int show_available_funcs(const char *module); From 3c42258c9a4db70133fa6946a275b62a16792bb5 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Thu, 20 Jan 2011 23:15:45 +0900 Subject: [PATCH 073/706] perf probe: Add filters support for available functions Add filters support for available function list. Default filter is "!_*" for filtering out local-purpose symbols. e.g.: # perf probe --filter="add*" -F add_disk add_disk_randomness add_input_randomness add_interrupt_randomness add_memory add_page_to_unevictable_list add_page_wait_queue ... Cc: 2nddept-manager@sdl.hitachi.co.jp Cc: Chase Douglas Cc: Franck Bui-Huu Cc: Ingo Molnar Cc: Paul Mackerras Cc: Peter Zijlstra Cc: Srikar Dronamraju Cc: Steven Rostedt LKML-Reference: <20110120141545.25915.85930.stgit@ltc236.sdl.hitachi.co.jp> Signed-off-by: Masami Hiramatsu Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/Documentation/perf-probe.txt | 9 +++++---- tools/perf/builtin-probe.c | 19 +++++++++++++------ tools/perf/util/probe-event.c | 23 +++++++++++++---------- tools/perf/util/probe-event.h | 2 +- 4 files changed, 32 insertions(+), 21 deletions(-) diff --git a/tools/perf/Documentation/perf-probe.txt b/tools/perf/Documentation/perf-probe.txt index 32fb18f1695d..81c3220e04f3 100644 --- a/tools/perf/Documentation/perf-probe.txt +++ b/tools/perf/Documentation/perf-probe.txt @@ -78,10 +78,11 @@ OPTIONS Show available functions in given module or kernel. 
--filter=FILTER:: - (Only for --vars) Set filter for variables. FILTER is a combination of - glob pattern, see FILTER PATTERN for details. - Default FILTER is "!__k???tab_* & !__crc_*". - If several filters are specified, only the last filter is valid. + (Only for --vars and --funcs) Set filter. FILTER is a combination of glob + pattern, see FILTER PATTERN for detail. + Default FILTER is "!__k???tab_* & !__crc_*" for --vars, and "!_*" + for --funcs. + If several filters are specified, only the last filter is used. -f:: --force:: diff --git a/tools/perf/builtin-probe.c b/tools/perf/builtin-probe.c index abb423e164c8..fcde0031085f 100644 --- a/tools/perf/builtin-probe.c +++ b/tools/perf/builtin-probe.c @@ -45,6 +45,7 @@ #include "util/probe-event.h" #define DEFAULT_VAR_FILTER "!__k???tab_* & !__crc_*" +#define DEFAULT_FUNC_FILTER "!_*" #define MAX_PATH_LEN 256 /* Session management structure */ @@ -159,6 +160,7 @@ static int opt_show_vars(const struct option *opt __used, return ret; } +#endif static int opt_set_filter(const struct option *opt __used, const char *str, int unset __used) @@ -180,7 +182,6 @@ static int opt_set_filter(const struct option *opt __used, return 0; } -#endif static const char * const probe_usage[] = { "perf probe [] 'PROBEDEF' ['PROBEDEF' ...]", @@ -236,10 +237,6 @@ static const struct option options[] = { "Show accessible variables on PROBEDEF", opt_show_vars), OPT_BOOLEAN('\0', "externs", ¶ms.show_ext_vars, "Show external variables too (with --vars only)"), - OPT_CALLBACK('\0', "filter", NULL, - "[!]FILTER", "Set a variable filter (with --vars only)\n" - "\t\t\t(default: \"" DEFAULT_VAR_FILTER "\")", - opt_set_filter), OPT_STRING('k', "vmlinux", &symbol_conf.vmlinux_name, "file", "vmlinux pathname"), OPT_STRING('s', "source", &symbol_conf.source_prefix, @@ -252,6 +249,11 @@ static const struct option options[] = { "Set how many probe points can be found for a probe."), OPT_BOOLEAN('F', "funcs", ¶ms.show_funcs, "Show potential probe-able functions."), + OPT_CALLBACK('\0', "filter", NULL, + "[!]FILTER", "Set a filter (with --vars/funcs only)\n" + "\t\t\t(default: \"" DEFAULT_VAR_FILTER "\" for --vars,\n" + "\t\t\t \"" DEFAULT_FUNC_FILTER "\" for --funcs)", + opt_set_filter), OPT_END() }; @@ -322,7 +324,12 @@ int cmd_probe(int argc, const char **argv, const char *prefix __used) pr_err(" Error: Don't use --funcs with --vars.\n"); usage_with_options(probe_usage, options); } - ret = show_available_funcs(params.target_module); + if (!params.filter) + params.filter = strfilter__new(DEFAULT_FUNC_FILTER, + NULL); + ret = show_available_funcs(params.target_module, + params.filter); + strfilter__delete(params.filter); if (ret < 0) pr_err(" Error: Failed to show functions." " (%d)\n", ret); diff --git a/tools/perf/util/probe-event.c b/tools/perf/util/probe-event.c index 077e0518f0f7..9d237e3cff5d 100644 --- a/tools/perf/util/probe-event.c +++ b/tools/perf/util/probe-event.c @@ -1951,21 +1951,23 @@ int del_perf_probe_events(struct strlist *dellist) return ret; } +/* TODO: don't use a global variable for filter ... */ +static struct strfilter *available_func_filter; /* - * If a symbol corresponds to a function with global binding return 0. - * For all others return 1. + * If a symbol corresponds to a function with global binding and + * matches filter return 0. For all others return 1. 
*/ -static int filter_non_global_functions(struct map *map __unused, - struct symbol *sym) +static int filter_available_functions(struct map *map __unused, + struct symbol *sym) { - if (sym->binding != STB_GLOBAL) - return 1; - - return 0; + if (sym->binding == STB_GLOBAL && + strfilter__compare(available_func_filter, sym->name)) + return 0; + return 1; } -int show_available_funcs(const char *module) +int show_available_funcs(const char *module, struct strfilter *_filter) { struct map *map; int ret; @@ -1981,7 +1983,8 @@ int show_available_funcs(const char *module) pr_err("Failed to find %s map.\n", (module) ? : "kernel"); return -EINVAL; } - if (map__load(map, filter_non_global_functions)) { + available_func_filter = _filter; + if (map__load(map, filter_available_functions)) { pr_err("Failed to load map.\n"); return -EINVAL; } diff --git a/tools/perf/util/probe-event.h b/tools/perf/util/probe-event.h index 4e80b2bbc516..3434fc9d79d5 100644 --- a/tools/perf/util/probe-event.h +++ b/tools/perf/util/probe-event.h @@ -128,7 +128,7 @@ extern int show_line_range(struct line_range *lr, const char *module); extern int show_available_vars(struct perf_probe_event *pevs, int npevs, int max_probe_points, const char *module, struct strfilter *filter, bool externs); -extern int show_available_funcs(const char *module); +extern int show_available_funcs(const char *module, struct strfilter *filter); /* Maximum index number of event-name postfix */ From 54489c189b1a0c10eaf21c6d2c5916b50442c871 Mon Sep 17 00:00:00 2001 From: Han Pingtian Date: Tue, 25 Jan 2011 07:39:00 +0800 Subject: [PATCH 074/706] perf test: Fix return values checking Fixing some cut'n'paste mistakes. LKML-Reference: <20110124233900.GA3443@epc900.nay.redhat.com> Signed-off-by: Han Pingtian [ committer note: I had already removed the CPU_ALLOC calls ] Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-test.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tools/perf/builtin-test.c b/tools/perf/builtin-test.c index 231e3e21810c..738830d298e8 100644 --- a/tools/perf/builtin-test.c +++ b/tools/perf/builtin-test.c @@ -495,8 +495,8 @@ static int test__basic_mmap(void) } cpus = cpu_map__new(NULL); - if (threads == NULL) { - pr_debug("thread_map__new\n"); + if (cpus == NULL) { + pr_debug("cpu_map__new\n"); goto out_free_threads; } @@ -510,7 +510,7 @@ static int test__basic_mmap(void) } evlist = perf_evlist__new(); - if (threads == NULL) { + if (evlist == NULL) { pr_debug("perf_evlist__new\n"); goto out_free_cpus; } From bd22a2f1982fa3e90ce7d5d011c37d88aa67e73c Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Sun, 23 Jan 2011 14:37:27 +0100 Subject: [PATCH 075/706] x86: Kill unused static boot_cpu_logical_apicid in smpboot.c Signed-off-by: Tejun Heo Reviewed-by: Pekka Enberg Acked-by: Yinghai Lu Cc: eric.dumazet@gmail.com Cc: yinghai@kernel.org Cc: brgerst@gmail.com Cc: gorcunov@gmail.com Cc: shaohui.zheng@intel.com Cc: rientjes@google.com LKML-Reference: <1295789862-25482-2-git-send-email-tj@kernel.org> Signed-off-by: Ingo Molnar --- arch/x86/kernel/smpboot.c | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 0cbe8c0b35ed..ee9203a17d67 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -165,8 +165,6 @@ static void unmap_cpu_to_node(int cpu) #endif #ifdef CONFIG_X86_32 -static int boot_cpu_logical_apicid; - u8 cpu_2_logical_apicid[NR_CPUS] __read_mostly = { [0 ... 
NR_CPUS-1] = BAD_APICID }; @@ -1096,9 +1094,7 @@ void __init native_smp_prepare_cpus(unsigned int max_cpus) * Setup boot CPU information */ smp_store_cpu_info(0); /* Final full version of the data */ -#ifdef CONFIG_X86_32 - boot_cpu_logical_apicid = logical_smp_processor_id(); -#endif + current_thread_info()->cpu = 0; /* needed? */ for_each_possible_cpu(i) { zalloc_cpumask_var(&per_cpu(cpu_sibling_map, i), GFP_KERNEL); From b78aa66b1fe4179d28e3f6502dc179773519a1bb Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Sun, 23 Jan 2011 14:37:28 +0100 Subject: [PATCH 076/706] x86: Drop x86_32 MAX_APICID Commit 56d91f13 (x86, acpi: Add MAX_LOCAL_APIC for 32bit) added MAX_LOCAL_APIC for x86_32 but didn't replace MAX_APICID users with it. Convert MAX_APICID users to MAX_LOCAL_APIC and drop MAX_APICID. Signed-off-by: Tejun Heo Reviewed-by: Pekka Enberg Acked-by: Yinghai Lu Cc: eric.dumazet@gmail.com Cc: yinghai@kernel.org Cc: brgerst@gmail.com Cc: gorcunov@gmail.com Cc: shaohui.zheng@intel.com Cc: rientjes@google.com LKML-Reference: <1295789862-25482-3-git-send-email-tj@kernel.org> Signed-off-by: Ingo Molnar --- arch/x86/include/asm/mpspec.h | 2 -- arch/x86/kernel/smpboot.c | 2 +- arch/x86/mm/srat_32.c | 4 ++-- 3 files changed, 3 insertions(+), 5 deletions(-) diff --git a/arch/x86/include/asm/mpspec.h b/arch/x86/include/asm/mpspec.h index 0c90dd9f0505..edc2a455b726 100644 --- a/arch/x86/include/asm/mpspec.h +++ b/arch/x86/include/asm/mpspec.h @@ -33,8 +33,6 @@ extern int mp_bus_id_to_local[MAX_MP_BUSSES]; extern int quad_local_to_mp_bus_id [NR_CPUS/4][4]; #endif -#define MAX_APICID 256 - #else /* CONFIG_X86_64: */ #define MAX_MP_BUSSES 256 diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index ee9203a17d67..53a85baaecca 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -72,7 +72,7 @@ #include #ifdef CONFIG_X86_32 -u8 apicid_2_node[MAX_APICID]; +u8 apicid_2_node[MAX_LOCAL_APIC]; #endif /* State of each CPU */ diff --git a/arch/x86/mm/srat_32.c b/arch/x86/mm/srat_32.c index ae96e7b8051d..6027a4810003 100644 --- a/arch/x86/mm/srat_32.c +++ b/arch/x86/mm/srat_32.c @@ -57,7 +57,7 @@ struct node_memory_chunk_s { static struct node_memory_chunk_s __initdata node_memory_chunk[MAXCHUNKS]; static int __initdata num_memory_chunks; /* total number of memory chunks */ -static u8 __initdata apicid_to_pxm[MAX_APICID]; +static u8 __initdata apicid_to_pxm[MAX_LOCAL_APIC]; int acpi_numa __initdata; @@ -254,7 +254,7 @@ int __init get_memcfg_from_srat(void) printk(KERN_DEBUG "Number of memory chunks in system = %d\n", num_memory_chunks); - for (i = 0; i < MAX_APICID; i++) + for (i = 0; i < MAX_LOCAL_APIC; i++) apicid_2_node[i] = pxm_to_node(apicid_to_pxm[i]); for (j = 0; j < num_memory_chunks; j++){ From 1245e1668c6e52bee76a423f8fab3bfcdd6226ae Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Sun, 23 Jan 2011 14:37:29 +0100 Subject: [PATCH 077/706] x86: Make default_send_IPI_mask_sequence/allbutself_logical() 32bit only Both functions are used only in 32bit. Put them inside CONFIG_X86_32. This is to prepare for logical apicid handling update. - Cyrill Gorcunov spotted that I forgot to move declarations in ipi.h under CONFIG_X86_32. Fixed. 
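After the move, ipi.h schematically looks like this (abridged to the relocated declarations):

	#ifdef CONFIG_X86_32
	extern void default_send_IPI_mask_sequence_logical(const struct cpumask *mask,
							   int vector);
	extern void default_send_IPI_mask_allbutself_logical(const struct cpumask *mask,
							     int vector);
	/* ... other 32-bit only IPI helpers ... */
	#endif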
Signed-off-by: Tejun Heo Reviewed-by: Pekka Enberg Reviewed-by: Cyrill Gorcunov Acked-by: Yinghai Lu Cc: eric.dumazet@gmail.com Cc: brgerst@gmail.com Cc: shaohui.zheng@intel.com Cc: rientjes@google.com LKML-Reference: <1295789862-25482-4-git-send-email-tj@kernel.org> Signed-off-by: Ingo Molnar --- arch/x86/include/asm/ipi.h | 8 ++++---- arch/x86/kernel/apic/ipi.c | 4 ++-- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/arch/x86/include/asm/ipi.h b/arch/x86/include/asm/ipi.h index 0b7228268a63..615fa9061b57 100644 --- a/arch/x86/include/asm/ipi.h +++ b/arch/x86/include/asm/ipi.h @@ -123,10 +123,6 @@ extern void default_send_IPI_mask_sequence_phys(const struct cpumask *mask, int vector); extern void default_send_IPI_mask_allbutself_phys(const struct cpumask *mask, int vector); -extern void default_send_IPI_mask_sequence_logical(const struct cpumask *mask, - int vector); -extern void default_send_IPI_mask_allbutself_logical(const struct cpumask *mask, - int vector); /* Avoid include hell */ #define NMI_VECTOR 0x02 @@ -150,6 +146,10 @@ static inline void __default_local_send_IPI_all(int vector) } #ifdef CONFIG_X86_32 +extern void default_send_IPI_mask_sequence_logical(const struct cpumask *mask, + int vector); +extern void default_send_IPI_mask_allbutself_logical(const struct cpumask *mask, + int vector); extern void default_send_IPI_mask_logical(const struct cpumask *mask, int vector); extern void default_send_IPI_allbutself(int vector); diff --git a/arch/x86/kernel/apic/ipi.c b/arch/x86/kernel/apic/ipi.c index 08385e090a6f..5037736c460d 100644 --- a/arch/x86/kernel/apic/ipi.c +++ b/arch/x86/kernel/apic/ipi.c @@ -56,6 +56,8 @@ void default_send_IPI_mask_allbutself_phys(const struct cpumask *mask, local_irq_restore(flags); } +#ifdef CONFIG_X86_32 + void default_send_IPI_mask_sequence_logical(const struct cpumask *mask, int vector) { @@ -96,8 +98,6 @@ void default_send_IPI_mask_allbutself_logical(const struct cpumask *mask, local_irq_restore(flags); } -#ifdef CONFIG_X86_32 - /* * This is only used on smaller machines. */ From 4c321ff8a01a95badf5d5403d80ca4e0ab07fce7 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Sun, 23 Jan 2011 14:37:30 +0100 Subject: [PATCH 078/706] x86: Replace cpu_2_logical_apicid[] with early percpu variable Unlike x86_64, on x86_32, the mapping from cpu to logical apicid may vary depending on apic in use. cpu_2_logical_apicid[] array is used for this mapping. Replace it with early percpu variable x86_cpu_to_logical_apicid to make it better aligned with other mappings. 
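The conversion pattern in miniature (identifiers from the patch, surrounding code elided):

	/* before: a statically sized array indexed by cpu */
	u8 cpu_2_logical_apicid[NR_CPUS];
	apicid = cpu_2_logical_apicid[cpu];

	/* after: an early percpu variable, usable before the percpu areas
	 * exist and copied into them in setup_per_cpu_areas() */
	DEFINE_EARLY_PER_CPU(int, x86_cpu_to_logical_apicid, BAD_APICID);
	apicid = early_per_cpu(x86_cpu_to_logical_apicid, cpu);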
Signed-off-by: Tejun Heo Cc: eric.dumazet@gmail.com Cc: yinghai@kernel.org Cc: brgerst@gmail.com Cc: gorcunov@gmail.com Cc: penberg@kernel.org Cc: shaohui.zheng@intel.com Cc: rientjes@google.com LKML-Reference: <1295789862-25482-5-git-send-email-tj@kernel.org> Signed-off-by: Ingo Molnar --- arch/x86/include/asm/apic.h | 4 ---- arch/x86/include/asm/smp.h | 3 +++ arch/x86/kernel/apic/apic.c | 11 +++++++++++ arch/x86/kernel/apic/es7000_32.c | 2 +- arch/x86/kernel/apic/numaq_32.c | 2 +- arch/x86/kernel/apic/summit_32.c | 4 ++-- arch/x86/kernel/setup_percpu.c | 7 +++++++ arch/x86/kernel/smpboot.c | 7 ++----- 8 files changed, 27 insertions(+), 13 deletions(-) diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h index 5e3969c36d7f..eb139eced850 100644 --- a/arch/x86/include/asm/apic.h +++ b/arch/x86/include/asm/apic.h @@ -595,8 +595,4 @@ extern int default_check_phys_apicid_present(int phys_apicid); #endif /* CONFIG_X86_LOCAL_APIC */ -#ifdef CONFIG_X86_32 -extern u8 cpu_2_logical_apicid[NR_CPUS]; -#endif - #endif /* _ASM_X86_APIC_H */ diff --git a/arch/x86/include/asm/smp.h b/arch/x86/include/asm/smp.h index 4c2f63c7fc1b..dc7c46a89db7 100644 --- a/arch/x86/include/asm/smp.h +++ b/arch/x86/include/asm/smp.h @@ -38,6 +38,9 @@ static inline struct cpumask *cpu_core_mask(int cpu) DECLARE_EARLY_PER_CPU(u16, x86_cpu_to_apicid); DECLARE_EARLY_PER_CPU(u16, x86_bios_cpu_apicid); +#if defined(CONFIG_SMP) && defined(CONFIG_X86_32) +DECLARE_EARLY_PER_CPU(int, x86_cpu_to_logical_apicid); +#endif /* Static state in head.S used to set up a CPU */ extern struct { diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index 06c196d7e59c..126d5a3b00e9 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -78,6 +78,17 @@ EXPORT_EARLY_PER_CPU_SYMBOL(x86_cpu_to_apicid); EXPORT_EARLY_PER_CPU_SYMBOL(x86_bios_cpu_apicid); #ifdef CONFIG_X86_32 + +#ifdef CONFIG_SMP +/* + * On x86_32, the mapping between cpu and logical apicid may vary + * depending on apic in use. The following early percpu variable is + * used for the mapping. This is where the behaviors of x86_64 and 32 + * actually diverge. Let's keep it ugly for now. + */ +DEFINE_EARLY_PER_CPU(int, x86_cpu_to_logical_apicid, BAD_APICID); +#endif + /* * Knob to control our willingness to enable the local APIC. * diff --git a/arch/x86/kernel/apic/es7000_32.c b/arch/x86/kernel/apic/es7000_32.c index 8593582d8022..7cb73e12f784 100644 --- a/arch/x86/kernel/apic/es7000_32.c +++ b/arch/x86/kernel/apic/es7000_32.c @@ -534,7 +534,7 @@ static int es7000_cpu_to_logical_apicid(int cpu) #ifdef CONFIG_SMP if (cpu >= nr_cpu_ids) return BAD_APICID; - return cpu_2_logical_apicid[cpu]; + return early_per_cpu(x86_cpu_to_logical_apicid, cpu); #else return logical_smp_processor_id(); #endif diff --git a/arch/x86/kernel/apic/numaq_32.c b/arch/x86/kernel/apic/numaq_32.c index 960f26ab5c9f..4ed90c4882e9 100644 --- a/arch/x86/kernel/apic/numaq_32.c +++ b/arch/x86/kernel/apic/numaq_32.c @@ -377,7 +377,7 @@ static inline int numaq_cpu_to_logical_apicid(int cpu) { if (cpu >= nr_cpu_ids) return BAD_APICID; - return cpu_2_logical_apicid[cpu]; + return early_per_cpu(x86_cpu_to_logical_apicid, cpu); } /* diff --git a/arch/x86/kernel/apic/summit_32.c b/arch/x86/kernel/apic/summit_32.c index 9b419263d90d..82cfc3ea70d1 100644 --- a/arch/x86/kernel/apic/summit_32.c +++ b/arch/x86/kernel/apic/summit_32.c @@ -206,7 +206,7 @@ static void summit_init_apic_ldr(void) /* Create logical APIC IDs by counting CPUs already in cluster. 
*/ for (count = 0, i = nr_cpu_ids; --i >= 0; ) { - lid = cpu_2_logical_apicid[i]; + lid = early_per_cpu(x86_cpu_to_logical_apicid, i); if (lid != BAD_APICID && APIC_CLUSTER(lid) == my_cluster) ++count; } @@ -247,7 +247,7 @@ static inline int summit_cpu_to_logical_apicid(int cpu) #ifdef CONFIG_SMP if (cpu >= nr_cpu_ids) return BAD_APICID; - return cpu_2_logical_apicid[cpu]; + return early_per_cpu(x86_cpu_to_logical_apicid, cpu); #else return logical_smp_processor_id(); #endif diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c index 002b79685f73..b5147f00f0ef 100644 --- a/arch/x86/kernel/setup_percpu.c +++ b/arch/x86/kernel/setup_percpu.c @@ -225,6 +225,10 @@ void __init setup_per_cpu_areas(void) per_cpu(x86_bios_cpu_apicid, cpu) = early_per_cpu_map(x86_bios_cpu_apicid, cpu); #endif +#ifdef CONFIG_X86_32 + per_cpu(x86_cpu_to_logical_apicid, cpu) = + early_per_cpu_map(x86_cpu_to_logical_apicid, cpu); +#endif #ifdef CONFIG_X86_64 per_cpu(irq_stack_ptr, cpu) = per_cpu(irq_stack_union.irq_stack, cpu) + @@ -256,6 +260,9 @@ void __init setup_per_cpu_areas(void) early_per_cpu_ptr(x86_cpu_to_apicid) = NULL; early_per_cpu_ptr(x86_bios_cpu_apicid) = NULL; #endif +#ifdef CONFIG_X86_32 + early_per_cpu_ptr(x86_cpu_to_logical_apicid) = NULL; +#endif #if defined(CONFIG_X86_64) && defined(CONFIG_NUMA) early_per_cpu_ptr(x86_cpu_to_node_map) = NULL; #endif diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 53a85baaecca..df934e46bf53 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -165,9 +165,6 @@ static void unmap_cpu_to_node(int cpu) #endif #ifdef CONFIG_X86_32 -u8 cpu_2_logical_apicid[NR_CPUS] __read_mostly = - { [0 ... NR_CPUS-1] = BAD_APICID }; - static void map_cpu_to_logical_apicid(void) { int cpu = smp_processor_id(); @@ -177,13 +174,13 @@ static void map_cpu_to_logical_apicid(void) if (!node_online(node)) node = first_online_node; - cpu_2_logical_apicid[cpu] = apicid; + early_per_cpu(x86_cpu_to_logical_apicid, cpu) = apicid; map_cpu_to_node(cpu, node); } void numa_remove_cpu(int cpu) { - cpu_2_logical_apicid[cpu] = BAD_APICID; + early_per_cpu(x86_cpu_to_logical_apicid, cpu) = BAD_APICID; unmap_cpu_to_node(cpu); } #else From 6f802c4bfa2acf1bffa8341fe9084da0205d581d Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Sun, 23 Jan 2011 14:37:31 +0100 Subject: [PATCH 079/706] x86: Always use x86_cpu_to_logical_apicid for cpu -> logical apic id Currently, cpu -> logical apic id translation is done by apic->cpu_to_logical_apicid() callback which may or may not use x86_cpu_to_logical_apicid. This is unnecessary as it should always equal logical_smp_processor_id() which is known early during CPU bring up. Initialize x86_cpu_to_logical_apicid after apic->init_apic_ldr() in setup_local_APIC() and always use x86_cpu_to_logical_apicid for cpu -> logical apic id mapping. 
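The ordering this establishes in setup_local_APIC(), abridged from the diff below:

	apic->init_apic_ldr();
	#ifdef CONFIG_X86_32
	/* the LDR is now programmed; record this CPU's logical id */
	early_per_cpu(x86_cpu_to_logical_apicid, cpu) =
		logical_smp_processor_id();
	#endif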
Signed-off-by: Tejun Heo Cc: eric.dumazet@gmail.com Cc: yinghai@kernel.org Cc: brgerst@gmail.com Cc: gorcunov@gmail.com Cc: penberg@kernel.org Cc: shaohui.zheng@intel.com Cc: rientjes@google.com LKML-Reference: <1295789862-25482-6-git-send-email-tj@kernel.org> Signed-off-by: Ingo Molnar --- arch/x86/kernel/apic/apic.c | 8 ++++++++ arch/x86/kernel/apic/ipi.c | 8 ++++---- arch/x86/kernel/smpboot.c | 7 +++---- 3 files changed, 15 insertions(+), 8 deletions(-) diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index 126d5a3b00e9..ae08246f320c 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -1248,6 +1248,14 @@ void __cpuinit setup_local_APIC(void) */ apic->init_apic_ldr(); +#ifdef CONFIG_X86_32 + /* + * APIC LDR is initialized. Fetch and store logical_apic_id. + */ + early_per_cpu(x86_cpu_to_logical_apicid, cpu) = + logical_smp_processor_id(); +#endif + /* * Set Task Priority to 'accept all'. We never change this * later on. diff --git a/arch/x86/kernel/apic/ipi.c b/arch/x86/kernel/apic/ipi.c index 5037736c460d..cce91bf26676 100644 --- a/arch/x86/kernel/apic/ipi.c +++ b/arch/x86/kernel/apic/ipi.c @@ -73,8 +73,8 @@ void default_send_IPI_mask_sequence_logical(const struct cpumask *mask, local_irq_save(flags); for_each_cpu(query_cpu, mask) __default_send_IPI_dest_field( - apic->cpu_to_logical_apicid(query_cpu), vector, - apic->dest_logical); + early_per_cpu(x86_cpu_to_logical_apicid, query_cpu), + vector, apic->dest_logical); local_irq_restore(flags); } @@ -92,8 +92,8 @@ void default_send_IPI_mask_allbutself_logical(const struct cpumask *mask, if (query_cpu == this_cpu) continue; __default_send_IPI_dest_field( - apic->cpu_to_logical_apicid(query_cpu), vector, - apic->dest_logical); + early_per_cpu(x86_cpu_to_logical_apicid, query_cpu), + vector, apic->dest_logical); } local_irq_restore(flags); } diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index df934e46bf53..ca20f6bee3be 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -168,19 +168,18 @@ static void unmap_cpu_to_node(int cpu) static void map_cpu_to_logical_apicid(void) { int cpu = smp_processor_id(); - int apicid = logical_smp_processor_id(); - int node = apic->apicid_to_node(apicid); + int logical_apicid = early_per_cpu(x86_cpu_to_logical_apicid, cpu); + int node; + node = apic->apicid_to_node(logical_apicid); if (!node_online(node)) node = first_online_node; - early_per_cpu(x86_cpu_to_logical_apicid, cpu) = apicid; map_cpu_to_node(cpu, node); } void numa_remove_cpu(int cpu) { - early_per_cpu(x86_cpu_to_logical_apicid, cpu) = BAD_APICID; unmap_cpu_to_node(cpu); } #else From 7632611f534340182c832d2b139cb19676f24e1a Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Sun, 23 Jan 2011 14:37:32 +0100 Subject: [PATCH 080/706] x86: Kill apic->cpu_to_logical_apicid() After the previous patch, apic->cpu_to_logical_apicid() is no longer used. Kill it. For apic types with custom cpu_to_logical_apicid() which is also used for other purposes, remove the function and modify its users to do the mapping directly. #ifdef's on CONFIG_SMP in es7000_32 and summit_32 are ignored during conversion as they are not used for UP kernels. 
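The recurring substitution at the call sites (shown here for es7000; summit_32 is converted the same way):

	-	int new_apicid = es7000_cpu_to_logical_apicid(cpu);
	+	int new_apicid = early_per_cpu(x86_cpu_to_logical_apicid, cpu);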
Signed-off-by: Tejun Heo Cc: eric.dumazet@gmail.com Cc: yinghai@kernel.org Cc: brgerst@gmail.com Cc: gorcunov@gmail.com Cc: penberg@kernel.org Cc: shaohui.zheng@intel.com Cc: rientjes@google.com LKML-Reference: <1295789862-25482-7-git-send-email-tj@kernel.org> Signed-off-by: Ingo Molnar --- arch/x86/include/asm/apic.h | 7 ------- arch/x86/kernel/apic/apic_flat_64.c | 2 -- arch/x86/kernel/apic/apic_noop.c | 6 ------ arch/x86/kernel/apic/bigsmp_32.c | 19 +++++++------------ arch/x86/kernel/apic/es7000_32.c | 18 ++---------------- arch/x86/kernel/apic/numaq_32.c | 8 -------- arch/x86/kernel/apic/probe_32.c | 1 - arch/x86/kernel/apic/summit_32.c | 17 ++--------------- arch/x86/kernel/apic/x2apic_cluster.c | 1 - arch/x86/kernel/apic/x2apic_phys.c | 1 - arch/x86/kernel/apic/x2apic_uv_x.c | 1 - 11 files changed, 11 insertions(+), 70 deletions(-) diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h index eb139eced850..d1aa0c3e7a5c 100644 --- a/arch/x86/include/asm/apic.h +++ b/arch/x86/include/asm/apic.h @@ -307,7 +307,6 @@ struct apic { void (*setup_apic_routing)(void); int (*multi_timer_check)(int apic, int irq); int (*apicid_to_node)(int logical_apicid); - int (*cpu_to_logical_apicid)(int cpu); int (*cpu_present_to_apicid)(int mps_cpu); void (*apicid_to_cpu_present)(int phys_apicid, physid_mask_t *retmap); void (*setup_portio_remap)(void); @@ -557,12 +556,6 @@ static inline void default_ioapic_phys_id_map(physid_mask_t *phys_map, physid_ma *retmap = *phys_map; } -/* Mapping from cpu number to logical apicid */ -static inline int default_cpu_to_logical_apicid(int cpu) -{ - return 1 << cpu; -} - static inline int __default_cpu_present_to_apicid(int mps_cpu) { if (mps_cpu < nr_cpu_ids && cpu_present(mps_cpu)) diff --git a/arch/x86/kernel/apic/apic_flat_64.c b/arch/x86/kernel/apic/apic_flat_64.c index 09d3b17ce0c2..5a9d11a94b55 100644 --- a/arch/x86/kernel/apic/apic_flat_64.c +++ b/arch/x86/kernel/apic/apic_flat_64.c @@ -186,7 +186,6 @@ struct apic apic_flat = { .setup_apic_routing = NULL, .multi_timer_check = NULL, .apicid_to_node = NULL, - .cpu_to_logical_apicid = NULL, .cpu_present_to_apicid = default_cpu_present_to_apicid, .apicid_to_cpu_present = NULL, .setup_portio_remap = NULL, @@ -338,7 +337,6 @@ struct apic apic_physflat = { .setup_apic_routing = NULL, .multi_timer_check = NULL, .apicid_to_node = NULL, - .cpu_to_logical_apicid = NULL, .cpu_present_to_apicid = default_cpu_present_to_apicid, .apicid_to_cpu_present = NULL, .setup_portio_remap = NULL, diff --git a/arch/x86/kernel/apic/apic_noop.c b/arch/x86/kernel/apic/apic_noop.c index e31b9ffe25f5..f3d19b2426ab 100644 --- a/arch/x86/kernel/apic/apic_noop.c +++ b/arch/x86/kernel/apic/apic_noop.c @@ -54,11 +54,6 @@ static u64 noop_apic_icr_read(void) return 0; } -static int noop_cpu_to_logical_apicid(int cpu) -{ - return 0; -} - static int noop_phys_pkg_id(int cpuid_apic, int index_msb) { return 0; @@ -155,7 +150,6 @@ struct apic apic_noop = { .multi_timer_check = NULL, .apicid_to_node = noop_apicid_to_node, - .cpu_to_logical_apicid = noop_cpu_to_logical_apicid, .cpu_present_to_apicid = default_cpu_present_to_apicid, .apicid_to_cpu_present = physid_set_mask_of_physid, diff --git a/arch/x86/kernel/apic/bigsmp_32.c b/arch/x86/kernel/apic/bigsmp_32.c index cb804c5091b9..4c62592d6869 100644 --- a/arch/x86/kernel/apic/bigsmp_32.c +++ b/arch/x86/kernel/apic/bigsmp_32.c @@ -93,14 +93,6 @@ static int bigsmp_cpu_present_to_apicid(int mps_cpu) return BAD_APICID; } -/* Mapping from cpu number to logical apicid */ -static inline int 
bigsmp_cpu_to_logical_apicid(int cpu) -{ - if (cpu >= nr_cpu_ids) - return BAD_APICID; - return cpu_physical_id(cpu); -} - static void bigsmp_ioapic_phys_id_map(physid_mask_t *phys_map, physid_mask_t *retmap) { /* For clustered we don't have a good way to do this yet - hack */ @@ -115,7 +107,11 @@ static int bigsmp_check_phys_apicid_present(int phys_apicid) /* As we are using single CPU as destination, pick only one CPU here */ static unsigned int bigsmp_cpu_mask_to_apicid(const struct cpumask *cpumask) { - return bigsmp_cpu_to_logical_apicid(cpumask_first(cpumask)); + int cpu = cpumask_first(cpumask); + + if (cpu < nr_cpu_ids) + return cpu_physical_id(cpu); + return BAD_APICID; } static unsigned int bigsmp_cpu_mask_to_apicid_and(const struct cpumask *cpumask, @@ -129,9 +125,9 @@ static unsigned int bigsmp_cpu_mask_to_apicid_and(const struct cpumask *cpumask, */ for_each_cpu_and(cpu, cpumask, andmask) { if (cpumask_test_cpu(cpu, cpu_online_mask)) - break; + return cpu_physical_id(cpu); } - return bigsmp_cpu_to_logical_apicid(cpu); + return BAD_APICID; } static int bigsmp_phys_pkg_id(int cpuid_apic, int index_msb) @@ -220,7 +216,6 @@ struct apic apic_bigsmp = { .setup_apic_routing = bigsmp_setup_apic_routing, .multi_timer_check = NULL, .apicid_to_node = bigsmp_apicid_to_node, - .cpu_to_logical_apicid = bigsmp_cpu_to_logical_apicid, .cpu_present_to_apicid = bigsmp_cpu_present_to_apicid, .apicid_to_cpu_present = physid_set_mask_of_physid, .setup_portio_remap = NULL, diff --git a/arch/x86/kernel/apic/es7000_32.c b/arch/x86/kernel/apic/es7000_32.c index 7cb73e12f784..6840681a3f14 100644 --- a/arch/x86/kernel/apic/es7000_32.c +++ b/arch/x86/kernel/apic/es7000_32.c @@ -528,18 +528,6 @@ static void es7000_apicid_to_cpu_present(int phys_apicid, physid_mask_t *retmap) ++cpu_id; } -/* Mapping from cpu number to logical apicid */ -static int es7000_cpu_to_logical_apicid(int cpu) -{ -#ifdef CONFIG_SMP - if (cpu >= nr_cpu_ids) - return BAD_APICID; - return early_per_cpu(x86_cpu_to_logical_apicid, cpu); -#else - return logical_smp_processor_id(); -#endif -} - static void es7000_ioapic_phys_id_map(physid_mask_t *phys_map, physid_mask_t *retmap) { /* For clustered we don't have a good way to do this yet - hack */ @@ -561,7 +549,7 @@ static unsigned int es7000_cpu_mask_to_apicid(const struct cpumask *cpumask) * The cpus in the mask must all be on the apic cluster. 
*/ for_each_cpu(cpu, cpumask) { - int new_apicid = es7000_cpu_to_logical_apicid(cpu); + int new_apicid = early_per_cpu(x86_cpu_to_logical_apicid, cpu); if (round && APIC_CLUSTER(apicid) != APIC_CLUSTER(new_apicid)) { WARN(1, "Not a valid mask!"); @@ -578,7 +566,7 @@ static unsigned int es7000_cpu_mask_to_apicid_and(const struct cpumask *inmask, const struct cpumask *andmask) { - int apicid = es7000_cpu_to_logical_apicid(0); + int apicid = early_per_cpu(x86_cpu_to_logical_apicid, 0); cpumask_var_t cpumask; if (!alloc_cpumask_var(&cpumask, GFP_ATOMIC)) @@ -656,7 +644,6 @@ struct apic __refdata apic_es7000_cluster = { .setup_apic_routing = es7000_setup_apic_routing, .multi_timer_check = NULL, .apicid_to_node = es7000_apicid_to_node, - .cpu_to_logical_apicid = es7000_cpu_to_logical_apicid, .cpu_present_to_apicid = es7000_cpu_present_to_apicid, .apicid_to_cpu_present = es7000_apicid_to_cpu_present, .setup_portio_remap = NULL, @@ -721,7 +708,6 @@ struct apic __refdata apic_es7000 = { .setup_apic_routing = es7000_setup_apic_routing, .multi_timer_check = NULL, .apicid_to_node = es7000_apicid_to_node, - .cpu_to_logical_apicid = es7000_cpu_to_logical_apicid, .cpu_present_to_apicid = es7000_cpu_present_to_apicid, .apicid_to_cpu_present = es7000_apicid_to_cpu_present, .setup_portio_remap = NULL, diff --git a/arch/x86/kernel/apic/numaq_32.c b/arch/x86/kernel/apic/numaq_32.c index 4ed90c4882e9..2b434d579e15 100644 --- a/arch/x86/kernel/apic/numaq_32.c +++ b/arch/x86/kernel/apic/numaq_32.c @@ -373,13 +373,6 @@ static inline void numaq_ioapic_phys_id_map(physid_mask_t *phys_map, physid_mask return physids_promote(0xFUL, retmap); } -static inline int numaq_cpu_to_logical_apicid(int cpu) -{ - if (cpu >= nr_cpu_ids) - return BAD_APICID; - return early_per_cpu(x86_cpu_to_logical_apicid, cpu); -} - /* * Supporting over 60 cpus on NUMA-Q requires a locality-dependent * cpu to APIC ID relation to properly interact with the intelligent @@ -509,7 +502,6 @@ struct apic __refdata apic_numaq = { .setup_apic_routing = numaq_setup_apic_routing, .multi_timer_check = numaq_multi_timer_check, .apicid_to_node = numaq_apicid_to_node, - .cpu_to_logical_apicid = numaq_cpu_to_logical_apicid, .cpu_present_to_apicid = numaq_cpu_present_to_apicid, .apicid_to_cpu_present = numaq_apicid_to_cpu_present, .setup_portio_remap = numaq_setup_portio_remap, diff --git a/arch/x86/kernel/apic/probe_32.c b/arch/x86/kernel/apic/probe_32.c index 99d2fe016084..24a68281101b 100644 --- a/arch/x86/kernel/apic/probe_32.c +++ b/arch/x86/kernel/apic/probe_32.c @@ -131,7 +131,6 @@ struct apic apic_default = { .setup_apic_routing = setup_apic_flat_routing, .multi_timer_check = NULL, .apicid_to_node = default_apicid_to_node, - .cpu_to_logical_apicid = default_cpu_to_logical_apicid, .cpu_present_to_apicid = default_cpu_present_to_apicid, .apicid_to_cpu_present = physid_set_mask_of_physid, .setup_portio_remap = NULL, diff --git a/arch/x86/kernel/apic/summit_32.c b/arch/x86/kernel/apic/summit_32.c index 82cfc3ea70d1..1ef4c14f4d6b 100644 --- a/arch/x86/kernel/apic/summit_32.c +++ b/arch/x86/kernel/apic/summit_32.c @@ -241,18 +241,6 @@ static int summit_apicid_to_node(int logical_apicid) #endif } -/* Mapping from cpu number to logical apicid */ -static inline int summit_cpu_to_logical_apicid(int cpu) -{ -#ifdef CONFIG_SMP - if (cpu >= nr_cpu_ids) - return BAD_APICID; - return early_per_cpu(x86_cpu_to_logical_apicid, cpu); -#else - return logical_smp_processor_id(); -#endif -} - static int summit_cpu_present_to_apicid(int mps_cpu) { if (mps_cpu < nr_cpu_ids) 
@@ -286,7 +274,7 @@ static unsigned int summit_cpu_mask_to_apicid(const struct cpumask *cpumask) * The cpus in the mask must all be on the apic cluster. */ for_each_cpu(cpu, cpumask) { - int new_apicid = summit_cpu_to_logical_apicid(cpu); + int new_apicid = early_per_cpu(x86_cpu_to_logical_apicid, cpu); if (round && APIC_CLUSTER(apicid) != APIC_CLUSTER(new_apicid)) { printk("%s: Not a valid mask!\n", __func__); @@ -301,7 +289,7 @@ static unsigned int summit_cpu_mask_to_apicid(const struct cpumask *cpumask) static unsigned int summit_cpu_mask_to_apicid_and(const struct cpumask *inmask, const struct cpumask *andmask) { - int apicid = summit_cpu_to_logical_apicid(0); + int apicid = early_per_cpu(x86_cpu_to_logical_apicid, 0); cpumask_var_t cpumask; if (!alloc_cpumask_var(&cpumask, GFP_ATOMIC)) @@ -529,7 +517,6 @@ struct apic apic_summit = { .setup_apic_routing = summit_setup_apic_routing, .multi_timer_check = NULL, .apicid_to_node = summit_apicid_to_node, - .cpu_to_logical_apicid = summit_cpu_to_logical_apicid, .cpu_present_to_apicid = summit_cpu_present_to_apicid, .apicid_to_cpu_present = summit_apicid_to_cpu_present, .setup_portio_remap = NULL, diff --git a/arch/x86/kernel/apic/x2apic_cluster.c b/arch/x86/kernel/apic/x2apic_cluster.c index cf69c59f4910..badc1fdbea27 100644 --- a/arch/x86/kernel/apic/x2apic_cluster.c +++ b/arch/x86/kernel/apic/x2apic_cluster.c @@ -207,7 +207,6 @@ struct apic apic_x2apic_cluster = { .setup_apic_routing = NULL, .multi_timer_check = NULL, .apicid_to_node = NULL, - .cpu_to_logical_apicid = NULL, .cpu_present_to_apicid = default_cpu_present_to_apicid, .apicid_to_cpu_present = NULL, .setup_portio_remap = NULL, diff --git a/arch/x86/kernel/apic/x2apic_phys.c b/arch/x86/kernel/apic/x2apic_phys.c index 8972f38c5ced..f28bf4c5faf2 100644 --- a/arch/x86/kernel/apic/x2apic_phys.c +++ b/arch/x86/kernel/apic/x2apic_phys.c @@ -196,7 +196,6 @@ struct apic apic_x2apic_phys = { .setup_apic_routing = NULL, .multi_timer_check = NULL, .apicid_to_node = NULL, - .cpu_to_logical_apicid = NULL, .cpu_present_to_apicid = default_cpu_present_to_apicid, .apicid_to_cpu_present = NULL, .setup_portio_remap = NULL, diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c index bd16b58b8850..60276206b725 100644 --- a/arch/x86/kernel/apic/x2apic_uv_x.c +++ b/arch/x86/kernel/apic/x2apic_uv_x.c @@ -339,7 +339,6 @@ struct apic __refdata apic_x2apic_uv_x = { .setup_apic_routing = NULL, .multi_timer_check = NULL, .apicid_to_node = NULL, - .cpu_to_logical_apicid = NULL, .cpu_present_to_apicid = default_cpu_present_to_apicid, .apicid_to_cpu_present = NULL, .setup_portio_remap = NULL, From acb8bc09c6185e4d3d582d0076aaa6a89f19d8c5 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Sun, 23 Jan 2011 14:37:33 +0100 Subject: [PATCH 081/706] x86: Add apic->x86_32_early_logical_apicid() On x86_32, the mapping between cpu and logical apic ID differs depending on the specific apic implementation in use. The mapping is initialized while bringing up CPUs; however, this makes early inits ignore memory topology. Add an x86_32-specific apic->x86_32_early_logical_apicid() which is called early during boot to query the mapping. The mapping is later verified against the result of init_apic_ldr(). The method is allowed to return BAD_APICID if it can't be determined early. A noop variant which always returns BAD_APICID is implemented and added to all x86_32 apic implementations.
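As an illustration of the contract (a standalone userspace sketch, not part of the patch; the hook table, the LDR stand-in and the BAD_APICID value below are made up for the example), the shape is: query the mapping early, allow a "don't know" answer, and verify later against the value the LDR ends up with:

    #include <stdio.h>

    #define BAD_APICID 0xFFFF

    /* per-apic-implementation hook; may return BAD_APICID when the
     * logical apicid cannot be computed this early in boot */
    typedef int (*early_logical_apicid_fn)(int cpu);

    static int noop_early_logical_apicid(int cpu)
    {
        (void)cpu;
        return BAD_APICID;      /* "can't tell yet" */
    }

    static int flat_early_logical_apicid(int cpu)
    {
        return 1 << cpu;        /* flat routing: one bit per cpu */
    }

    /* stand-in for the value init_apic_ldr() later programs */
    static int ldr_logical_apicid(int cpu)
    {
        return 1 << cpu;
    }

    int main(void)
    {
        early_logical_apicid_fn hooks[2] = {
            noop_early_logical_apicid,
            flat_early_logical_apicid,
        };
        int h, cpu;

        for (h = 0; h < 2; h++) {
            for (cpu = 0; cpu < 4; cpu++) {
                int early = hooks[h](cpu);
                int real = ldr_logical_apicid(cpu);

                /* mirrors the later verification: an early answer
                 * is valid only if it is BAD_APICID or matches */
                if (early != BAD_APICID && early != real)
                    printf("hook %d cpu %d: mismatch\n", h, cpu);
                else
                    printf("hook %d cpu %d: early=%#x real=%#x\n",
                           h, cpu, early, real);
            }
        }
        return 0;
    }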
Signed-off-by: Tejun Heo Cc: eric.dumazet@gmail.com Cc: yinghai@kernel.org Cc: brgerst@gmail.com Cc: gorcunov@gmail.com Cc: penberg@kernel.org Cc: shaohui.zheng@intel.com Cc: rientjes@google.com LKML-Reference: <1295789862-25482-8-git-send-email-tj@kernel.org> Signed-off-by: Ingo Molnar --- arch/x86/include/asm/apic.h | 19 +++++++++++++++++++ arch/x86/kernel/apic/apic.c | 12 ++++++++++-- arch/x86/kernel/apic/apic_noop.c | 4 ++++ arch/x86/kernel/apic/bigsmp_32.c | 2 ++ arch/x86/kernel/apic/es7000_32.c | 4 ++++ arch/x86/kernel/apic/numaq_32.c | 2 ++ arch/x86/kernel/apic/probe_32.c | 2 ++ arch/x86/kernel/apic/summit_32.c | 2 ++ 8 files changed, 45 insertions(+), 2 deletions(-) diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h index d1aa0c3e7a5c..efb073b5c743 100644 --- a/arch/x86/include/asm/apic.h +++ b/arch/x86/include/asm/apic.h @@ -354,6 +354,20 @@ struct apic { void (*icr_write)(u32 low, u32 high); void (*wait_icr_idle)(void); u32 (*safe_wait_icr_idle)(void); + +#ifdef CONFIG_X86_32 + /* + * Called very early during boot from get_smp_config(). It should + * return the logical apicid. x86_[bios]_cpu_to_apicid is + * initialized before this function is called. + * + * If logical apicid can't be determined that early, the function + * may return BAD_APICID. Logical apicid will be configured after + * init_apic_ldr() while bringing up CPUs. Note that NUMA affinity + * won't be applied properly during early boot in this case. + */ + int (*x86_32_early_logical_apicid)(int cpu); +#endif }; /* @@ -501,6 +515,11 @@ extern struct apic apic_noop; extern struct apic apic_default; +static inline int noop_x86_32_early_logical_apicid(int cpu) +{ + return BAD_APICID; +} + /* * Set up the logical destination ID. * diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index ae08246f320c..3127079628e8 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -1250,8 +1250,13 @@ void __cpuinit setup_local_APIC(void) #ifdef CONFIG_X86_32 /* - * APIC LDR is initialized. Fetch and store logical_apic_id. + * APIC LDR is initialized. If logical_apicid mapping was + * initialized during get_smp_config(), make sure it matches the + * actual value. 
*/ + i = early_per_cpu(x86_cpu_to_logical_apicid, cpu); + WARN_ON(i != BAD_APICID && i != logical_smp_processor_id()); + /* always use the value from LDR */ early_per_cpu(x86_cpu_to_logical_apicid, cpu) = logical_smp_processor_id(); #endif @@ -1991,7 +1996,10 @@ void __cpuinit generic_processor_info(int apicid, int version) early_per_cpu(x86_cpu_to_apicid, cpu) = apicid; early_per_cpu(x86_bios_cpu_apicid, cpu) = apicid; #endif - +#ifdef CONFIG_X86_32 + early_per_cpu(x86_cpu_to_logical_apicid, cpu) = + apic->x86_32_early_logical_apicid(cpu); +#endif set_cpu_possible(cpu, true); set_cpu_present(cpu, true); } diff --git a/arch/x86/kernel/apic/apic_noop.c b/arch/x86/kernel/apic/apic_noop.c index f3d19b2426ab..0309c58d96bc 100644 --- a/arch/x86/kernel/apic/apic_noop.c +++ b/arch/x86/kernel/apic/apic_noop.c @@ -191,4 +191,8 @@ struct apic apic_noop = { .icr_write = noop_apic_icr_write, .wait_icr_idle = noop_apic_wait_icr_idle, .safe_wait_icr_idle = noop_safe_apic_wait_icr_idle, + +#ifdef CONFIG_X86_32 + .x86_32_early_logical_apicid = noop_x86_32_early_logical_apicid, +#endif }; diff --git a/arch/x86/kernel/apic/bigsmp_32.c b/arch/x86/kernel/apic/bigsmp_32.c index 4c62592d6869..dd32a9b78a87 100644 --- a/arch/x86/kernel/apic/bigsmp_32.c +++ b/arch/x86/kernel/apic/bigsmp_32.c @@ -251,4 +251,6 @@ struct apic apic_bigsmp = { .icr_write = native_apic_icr_write, .wait_icr_idle = native_apic_wait_icr_idle, .safe_wait_icr_idle = native_safe_apic_wait_icr_idle, + + .x86_32_early_logical_apicid = noop_x86_32_early_logical_apicid, }; diff --git a/arch/x86/kernel/apic/es7000_32.c b/arch/x86/kernel/apic/es7000_32.c index 6840681a3f14..0ffc1eca5777 100644 --- a/arch/x86/kernel/apic/es7000_32.c +++ b/arch/x86/kernel/apic/es7000_32.c @@ -682,6 +682,8 @@ struct apic __refdata apic_es7000_cluster = { .icr_write = native_apic_icr_write, .wait_icr_idle = native_apic_wait_icr_idle, .safe_wait_icr_idle = native_safe_apic_wait_icr_idle, + + .x86_32_early_logical_apicid = noop_x86_32_early_logical_apicid, }; struct apic __refdata apic_es7000 = { @@ -744,4 +746,6 @@ struct apic __refdata apic_es7000 = { .icr_write = native_apic_icr_write, .wait_icr_idle = native_apic_wait_icr_idle, .safe_wait_icr_idle = native_safe_apic_wait_icr_idle, + + .x86_32_early_logical_apicid = noop_x86_32_early_logical_apicid, }; diff --git a/arch/x86/kernel/apic/numaq_32.c b/arch/x86/kernel/apic/numaq_32.c index 2b434d579e15..f1a8b120c49d 100644 --- a/arch/x86/kernel/apic/numaq_32.c +++ b/arch/x86/kernel/apic/numaq_32.c @@ -539,4 +539,6 @@ struct apic __refdata apic_numaq = { .icr_write = native_apic_icr_write, .wait_icr_idle = native_apic_wait_icr_idle, .safe_wait_icr_idle = native_safe_apic_wait_icr_idle, + + .x86_32_early_logical_apicid = noop_x86_32_early_logical_apicid, }; diff --git a/arch/x86/kernel/apic/probe_32.c b/arch/x86/kernel/apic/probe_32.c index 24a68281101b..40be7c3cdfeb 100644 --- a/arch/x86/kernel/apic/probe_32.c +++ b/arch/x86/kernel/apic/probe_32.c @@ -166,6 +166,8 @@ struct apic apic_default = { .icr_write = native_apic_icr_write, .wait_icr_idle = native_apic_wait_icr_idle, .safe_wait_icr_idle = native_safe_apic_wait_icr_idle, + + .x86_32_early_logical_apicid = noop_x86_32_early_logical_apicid, }; extern struct apic apic_numaq; diff --git a/arch/x86/kernel/apic/summit_32.c b/arch/x86/kernel/apic/summit_32.c index 1ef4c14f4d6b..172c498e888f 100644 --- a/arch/x86/kernel/apic/summit_32.c +++ b/arch/x86/kernel/apic/summit_32.c @@ -552,4 +552,6 @@ struct apic apic_summit = { .icr_write = native_apic_icr_write, .wait_icr_idle = 
native_apic_wait_icr_idle, .safe_wait_icr_idle = native_safe_apic_wait_icr_idle, + + .x86_32_early_logical_apicid = noop_x86_32_early_logical_apicid, }; From 3f6f6798889d50ec7ca8eef1d100cda37dc658ea Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Sun, 23 Jan 2011 14:37:34 +0100 Subject: [PATCH 082/706] x86: Implement the default x86_32_early_logical_apicid() Implement x86_32_early_logical_apicid() for the default apic flat routing. Signed-off-by: Tejun Heo Cc: eric.dumazet@gmail.com Cc: yinghai@kernel.org Cc: brgerst@gmail.com Cc: gorcunov@gmail.com Cc: penberg@kernel.org Cc: shaohui.zheng@intel.com Cc: rientjes@google.com LKML-Reference: <1295789862-25482-9-git-send-email-tj@kernel.org> Signed-off-by: Ingo Molnar --- arch/x86/kernel/apic/probe_32.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/arch/x86/kernel/apic/probe_32.c b/arch/x86/kernel/apic/probe_32.c index 40be7c3cdfeb..0f9a9ab49e79 100644 --- a/arch/x86/kernel/apic/probe_32.c +++ b/arch/x86/kernel/apic/probe_32.c @@ -77,6 +77,11 @@ void __init default_setup_apic_routing(void) apic->setup_apic_routing(); } +static int default_x86_32_early_logical_apicid(int cpu) +{ + return 1 << cpu; +} + static void setup_apic_flat_routing(void) { #ifdef CONFIG_X86_IO_APIC @@ -167,7 +172,7 @@ struct apic apic_default = { .wait_icr_idle = native_apic_wait_icr_idle, .safe_wait_icr_idle = native_safe_apic_wait_icr_idle, - .x86_32_early_logical_apicid = noop_x86_32_early_logical_apicid, + .x86_32_early_logical_apicid = default_x86_32_early_logical_apicid, }; extern struct apic apic_numaq; From 12bf24a47c1a095233cc8a8b863b509a0d8e0f2c Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Sun, 23 Jan 2011 14:37:35 +0100 Subject: [PATCH 083/706] x86: Implement x86_32_early_logical_apicid() for bigsmp_32 Signed-off-by: Tejun Heo Cc: eric.dumazet@gmail.com Cc: yinghai@kernel.org Cc: brgerst@gmail.com Cc: gorcunov@gmail.com Cc: penberg@kernel.org Cc: shaohui.zheng@intel.com Cc: rientjes@google.com LKML-Reference: <1295789862-25482-10-git-send-email-tj@kernel.org> Signed-off-by: Ingo Molnar --- arch/x86/kernel/apic/bigsmp_32.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/arch/x86/kernel/apic/bigsmp_32.c b/arch/x86/kernel/apic/bigsmp_32.c index dd32a9b78a87..bc7ed040bb0e 100644 --- a/arch/x86/kernel/apic/bigsmp_32.c +++ b/arch/x86/kernel/apic/bigsmp_32.c @@ -45,6 +45,12 @@ static unsigned long bigsmp_check_apicid_present(int bit) return 1; } +static int bigsmp_early_logical_apicid(int cpu) +{ + /* on bigsmp, logical apicid is the same as physical */ + return early_per_cpu(x86_cpu_to_apicid, cpu); +} + static inline unsigned long calculate_ldr(int cpu) { unsigned long val, id; @@ -252,5 +258,5 @@ struct apic apic_bigsmp = { .wait_icr_idle = native_apic_wait_icr_idle, .safe_wait_icr_idle = native_safe_apic_wait_icr_idle, - .x86_32_early_logical_apicid = noop_x86_32_early_logical_apicid, + .x86_32_early_logical_apicid = bigsmp_early_logical_apicid, }; From 3b39d937843e071c59b3aeecbf7de4750f095b12 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Sun, 23 Jan 2011 14:37:36 +0100 Subject: [PATCH 084/706] x86: Implement x86_32_early_logical_apicid() for summit_32 Factor out logical apic id calculation from summit_init_apic_ldr() and use it for the x86_32_early_logical_apicid() callback. 
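The factored-out calculation can be sketched standalone (illustration only, not the patch itself: the 0xF0 cluster mask and the sample physical apicid table are assumptions, and counting "earlier cpus in the same cluster" approximates the kernel's scan of already-assigned logical ids):

    #include <stdio.h>

    #define XAPIC_DEST_CPUS_SHIFT   4       /* 4 wide bitmap per cluster */
    #define APIC_CLUSTER(apicid)    ((apicid) & 0xF0)

    /* stand-in for early_per_cpu(x86_cpu_to_apicid, cpu) */
    static unsigned char phys_apicid[5] = { 0x10, 0x11, 0x12, 0x20, 0x21 };

    /*
     * summit-style logical apicid: keep the cluster bits of the
     * physical id and set one bit per cpu within the cluster
     */
    static int summit_logical_apicid(int cpu)
    {
        unsigned char my_id = phys_apicid[cpu];
        unsigned char my_cluster = APIC_CLUSTER(my_id);
        int count = 0;
        int i;

        for (i = 0; i < cpu; i++)
            if (APIC_CLUSTER(phys_apicid[i]) == my_cluster)
                count++;

        if (count >= XAPIC_DEST_CPUS_SHIFT)
            return -1;      /* a fifth cpu in one cluster won't fit */
        return my_cluster | (1 << count);
    }

    int main(void)
    {
        int cpu;

        for (cpu = 0; cpu < 5; cpu++)
            printf("cpu %d: phys %#x -> logical %#x\n",
                   cpu, phys_apicid[cpu], summit_logical_apicid(cpu));
        return 0;
    }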
Signed-off-by: Tejun Heo Cc: eric.dumazet@gmail.com Cc: yinghai@kernel.org Cc: brgerst@gmail.com Cc: gorcunov@gmail.com Cc: penberg@kernel.org Cc: shaohui.zheng@intel.com Cc: rientjes@google.com LKML-Reference: <1295789862-25482-11-git-send-email-tj@kernel.org> Signed-off-by: Ingo Molnar --- arch/x86/kernel/apic/summit_32.c | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) diff --git a/arch/x86/kernel/apic/summit_32.c b/arch/x86/kernel/apic/summit_32.c index 172c498e888f..8c9147321616 100644 --- a/arch/x86/kernel/apic/summit_32.c +++ b/arch/x86/kernel/apic/summit_32.c @@ -194,11 +194,10 @@ static unsigned long summit_check_apicid_present(int bit) return 1; } -static void summit_init_apic_ldr(void) +static int summit_early_logical_apicid(int cpu) { - unsigned long val, id; int count = 0; - u8 my_id = (u8)hard_smp_processor_id(); + u8 my_id = early_per_cpu(x86_cpu_to_apicid, cpu); u8 my_cluster = APIC_CLUSTER(my_id); #ifdef CONFIG_SMP u8 lid; @@ -214,7 +213,15 @@ static void summit_init_apic_ldr(void) /* We only have a 4 wide bitmap in cluster mode. If a deranged * BIOS puts 5 CPUs in one APIC cluster, we're hosed. */ BUG_ON(count >= XAPIC_DEST_CPUS_SHIFT); - id = my_cluster | (1UL << count); + return my_cluster | (1UL << count); +} + +static void summit_init_apic_ldr(void) +{ + int cpu = smp_processor_id(); + unsigned long id = early_per_cpu(x86_cpu_to_logical_apicid, cpu); + unsigned long val; + apic_write(APIC_DFR, SUMMIT_APIC_DFR_VALUE); val = apic_read(APIC_LDR) & ~APIC_LDR_MASK; val |= SET_APIC_LOGICAL_ID(id); @@ -553,5 +560,5 @@ struct apic apic_summit = { .wait_icr_idle = native_apic_wait_icr_idle, .safe_wait_icr_idle = native_safe_apic_wait_icr_idle, - .x86_32_early_logical_apicid = noop_x86_32_early_logical_apicid, + .x86_32_early_logical_apicid = summit_early_logical_apicid, }; From df04cf011b0657ddc782b48d455f7e232b9be41c Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Sun, 23 Jan 2011 14:37:37 +0100 Subject: [PATCH 085/706] x86: Implement x86_32_early_logical_apicid() for es7000_32 Signed-off-by: Tejun Heo Cc: eric.dumazet@gmail.com Cc: yinghai@kernel.org Cc: brgerst@gmail.com Cc: gorcunov@gmail.com Cc: penberg@kernel.org Cc: shaohui.zheng@intel.com Cc: rientjes@google.com LKML-Reference: <1295789862-25482-12-git-send-email-tj@kernel.org> Signed-off-by: Ingo Molnar --- arch/x86/kernel/apic/es7000_32.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/arch/x86/kernel/apic/es7000_32.c b/arch/x86/kernel/apic/es7000_32.c index 0ffc1eca5777..5c53d053ada5 100644 --- a/arch/x86/kernel/apic/es7000_32.c +++ b/arch/x86/kernel/apic/es7000_32.c @@ -460,6 +460,12 @@ static unsigned long es7000_check_apicid_present(int bit) return physid_isset(bit, phys_cpu_present_map); } +static int es7000_early_logical_apicid(int cpu) +{ + /* on es7000, logical apicid is the same as physical */ + return early_per_cpu(x86_bios_cpu_apicid, cpu); +} + static unsigned long calculate_ldr(int cpu) { unsigned long id = per_cpu(x86_bios_cpu_apicid, cpu); @@ -683,7 +689,7 @@ struct apic __refdata apic_es7000_cluster = { .wait_icr_idle = native_apic_wait_icr_idle, .safe_wait_icr_idle = native_safe_apic_wait_icr_idle, - .x86_32_early_logical_apicid = noop_x86_32_early_logical_apicid, + .x86_32_early_logical_apicid = es7000_early_logical_apicid, }; struct apic __refdata apic_es7000 = { @@ -747,5 +753,5 @@ struct apic __refdata apic_es7000 = { .wait_icr_idle = native_apic_wait_icr_idle, .safe_wait_icr_idle = native_safe_apic_wait_icr_idle, - .x86_32_early_logical_apicid =
noop_x86_32_early_logical_apicid, + .x86_32_early_logical_apicid = es7000_early_logical_apicid, }; From 89e5dc218e084e13a3996db6693b01478912f4ee Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Sun, 23 Jan 2011 14:37:38 +0100 Subject: [PATCH 086/706] x86: Replace apic->apicid_to_node() with ->x86_32_numa_cpu_node() apic->apicid_to_node() is a 32bit-specific apic operation which determines the NUMA node for a CPU. Depending on the APIC implementation, it can be easier to determine the NUMA node from either the physical or the logical apicid. Currently, ->apicid_to_node() takes @logical_apicid and calls hard_smp_processor_id() if the physical apicid is needed. This prevents the NUMA mapping from being queried from a different CPU, which in turn makes it impossible to initialize the NUMA mapping before SMP bringup. This patch replaces apic->apicid_to_node() with ->x86_32_numa_cpu_node() which takes @cpu, from which both logical and physical apicids can easily be determined. While at it, drop duplicate implementations from bigsmp_32 and summit_32, and use the default one. Signed-off-by: Tejun Heo Reviewed-by: Pekka Enberg Cc: eric.dumazet@gmail.com Cc: yinghai@kernel.org Cc: brgerst@gmail.com Cc: gorcunov@gmail.com Cc: shaohui.zheng@intel.com Cc: rientjes@google.com LKML-Reference: <1295789862-25482-13-git-send-email-tj@kernel.org> Signed-off-by: Ingo Molnar --- arch/x86/include/asm/apic.h | 6 ++++-- arch/x86/kernel/apic/apic.c | 10 +++++++--- arch/x86/kernel/apic/apic_flat_64.c | 2 -- arch/x86/kernel/apic/apic_noop.c | 16 +++++++++------- arch/x86/kernel/apic/bigsmp_32.c | 7 +------ arch/x86/kernel/apic/es7000_32.c | 7 +++---- arch/x86/kernel/apic/numaq_32.c | 11 ++++++++++- arch/x86/kernel/apic/probe_32.c | 2 +- arch/x86/kernel/apic/summit_32.c | 11 +---------- arch/x86/kernel/apic/x2apic_cluster.c | 1 - arch/x86/kernel/apic/x2apic_phys.c | 1 - arch/x86/kernel/apic/x2apic_uv_x.c | 1 - arch/x86/kernel/smpboot.c | 3 +-- 13 files changed, 37 insertions(+), 41 deletions(-) diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h index efb073b5c743..ad30ca4b6fe9 100644 --- a/arch/x86/include/asm/apic.h +++ b/arch/x86/include/asm/apic.h @@ -306,7 +306,6 @@ struct apic { void (*setup_apic_routing)(void); int (*multi_timer_check)(int apic, int irq); - int (*apicid_to_node)(int logical_apicid); int (*cpu_present_to_apicid)(int mps_cpu); void (*apicid_to_cpu_present)(int phys_apicid, physid_mask_t *retmap); void (*setup_portio_remap)(void); @@ -367,6 +366,9 @@ struct apic { * won't be applied properly during early boot in this case.
*/ int (*x86_32_early_logical_apicid)(int cpu); + + /* determine CPU -> NUMA node mapping */ + int (*x86_32_numa_cpu_node)(int cpu); #endif }; @@ -539,7 +541,7 @@ static inline int default_phys_pkg_id(int cpuid_apic, int index_msb) return cpuid_apic >> index_msb; } -extern int default_apicid_to_node(int logical_apicid); +extern int default_x86_32_numa_cpu_node(int cpu); #endif diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index 3127079628e8..0f4f3c152311 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -2020,10 +2020,14 @@ void default_init_apic_ldr(void) } #ifdef CONFIG_X86_32 -int default_apicid_to_node(int logical_apicid) +int default_x86_32_numa_cpu_node(int cpu) { -#ifdef CONFIG_SMP - return apicid_2_node[hard_smp_processor_id()]; +#ifdef CONFIG_NUMA + int apicid = early_per_cpu(x86_cpu_to_apicid, cpu); + + if (apicid != BAD_APICID) + return apicid_2_node[apicid]; + return NUMA_NO_NODE; #else return 0; #endif diff --git a/arch/x86/kernel/apic/apic_flat_64.c b/arch/x86/kernel/apic/apic_flat_64.c index 5a9d11a94b55..5652d31fe108 100644 --- a/arch/x86/kernel/apic/apic_flat_64.c +++ b/arch/x86/kernel/apic/apic_flat_64.c @@ -185,7 +185,6 @@ struct apic apic_flat = { .ioapic_phys_id_map = NULL, .setup_apic_routing = NULL, .multi_timer_check = NULL, - .apicid_to_node = NULL, .cpu_present_to_apicid = default_cpu_present_to_apicid, .apicid_to_cpu_present = NULL, .setup_portio_remap = NULL, @@ -336,7 +335,6 @@ struct apic apic_physflat = { .ioapic_phys_id_map = NULL, .setup_apic_routing = NULL, .multi_timer_check = NULL, - .apicid_to_node = NULL, .cpu_present_to_apicid = default_cpu_present_to_apicid, .apicid_to_cpu_present = NULL, .setup_portio_remap = NULL, diff --git a/arch/x86/kernel/apic/apic_noop.c b/arch/x86/kernel/apic/apic_noop.c index 0309c58d96bc..f1baa2dc087a 100644 --- a/arch/x86/kernel/apic/apic_noop.c +++ b/arch/x86/kernel/apic/apic_noop.c @@ -108,12 +108,6 @@ static void noop_vector_allocation_domain(int cpu, struct cpumask *retmask) cpumask_set_cpu(cpu, retmask); } -int noop_apicid_to_node(int logical_apicid) -{ - /* we're always on node 0 */ - return 0; -} - static u32 noop_apic_read(u32 reg) { WARN_ON_ONCE((cpu_has_apic && !disable_apic)); @@ -125,6 +119,14 @@ static void noop_apic_write(u32 reg, u32 v) WARN_ON_ONCE(cpu_has_apic && !disable_apic); } +#ifdef CONFIG_X86_32 +static int noop_x86_32_numa_cpu_node(int cpu) +{ + /* we're always on node 0 */ + return 0; +} +#endif + struct apic apic_noop = { .name = "noop", .probe = noop_probe, @@ -148,7 +150,6 @@ struct apic apic_noop = { .ioapic_phys_id_map = default_ioapic_phys_id_map, .setup_apic_routing = NULL, .multi_timer_check = NULL, - .apicid_to_node = noop_apicid_to_node, .cpu_present_to_apicid = default_cpu_present_to_apicid, .apicid_to_cpu_present = physid_set_mask_of_physid, @@ -194,5 +195,6 @@ struct apic apic_noop = { #ifdef CONFIG_X86_32 .x86_32_early_logical_apicid = noop_x86_32_early_logical_apicid, + .x86_32_numa_cpu_node = noop_x86_32_numa_cpu_node, #endif }; diff --git a/arch/x86/kernel/apic/bigsmp_32.c b/arch/x86/kernel/apic/bigsmp_32.c index bc7ed040bb0e..541a2e431659 100644 --- a/arch/x86/kernel/apic/bigsmp_32.c +++ b/arch/x86/kernel/apic/bigsmp_32.c @@ -86,11 +86,6 @@ static void bigsmp_setup_apic_routing(void) nr_ioapics); } -static int bigsmp_apicid_to_node(int logical_apicid) -{ - return apicid_2_node[hard_smp_processor_id()]; -} - static int bigsmp_cpu_present_to_apicid(int mps_cpu) { if (mps_cpu < nr_cpu_ids) @@ -221,7 +216,6 @@ struct apic 
apic_bigsmp = { .ioapic_phys_id_map = bigsmp_ioapic_phys_id_map, .setup_apic_routing = bigsmp_setup_apic_routing, .multi_timer_check = NULL, - .apicid_to_node = bigsmp_apicid_to_node, .cpu_present_to_apicid = bigsmp_cpu_present_to_apicid, .apicid_to_cpu_present = physid_set_mask_of_physid, .setup_portio_remap = NULL, @@ -259,4 +253,5 @@ struct apic apic_bigsmp = { .safe_wait_icr_idle = native_safe_apic_wait_icr_idle, .x86_32_early_logical_apicid = bigsmp_early_logical_apicid, + .x86_32_numa_cpu_node = default_x86_32_numa_cpu_node, }; diff --git a/arch/x86/kernel/apic/es7000_32.c b/arch/x86/kernel/apic/es7000_32.c index 5c53d053ada5..3e9de4854c5b 100644 --- a/arch/x86/kernel/apic/es7000_32.c +++ b/arch/x86/kernel/apic/es7000_32.c @@ -510,12 +510,11 @@ static void es7000_setup_apic_routing(void) nr_ioapics, cpumask_bits(es7000_target_cpus())[0]); } -static int es7000_apicid_to_node(int logical_apicid) +static int es7000_numa_cpu_node(int cpu) { return 0; } - static int es7000_cpu_present_to_apicid(int mps_cpu) { if (!mps_cpu) @@ -649,7 +648,6 @@ struct apic __refdata apic_es7000_cluster = { .ioapic_phys_id_map = es7000_ioapic_phys_id_map, .setup_apic_routing = es7000_setup_apic_routing, .multi_timer_check = NULL, - .apicid_to_node = es7000_apicid_to_node, .cpu_present_to_apicid = es7000_cpu_present_to_apicid, .apicid_to_cpu_present = es7000_apicid_to_cpu_present, .setup_portio_remap = NULL, @@ -690,6 +688,7 @@ struct apic __refdata apic_es7000_cluster = { .safe_wait_icr_idle = native_safe_apic_wait_icr_idle, .x86_32_early_logical_apicid = es7000_early_logical_apicid, + .x86_32_numa_cpu_node = es7000_numa_cpu_node, }; struct apic __refdata apic_es7000 = { @@ -715,7 +714,6 @@ struct apic __refdata apic_es7000 = { .ioapic_phys_id_map = es7000_ioapic_phys_id_map, .setup_apic_routing = es7000_setup_apic_routing, .multi_timer_check = NULL, - .apicid_to_node = es7000_apicid_to_node, .cpu_present_to_apicid = es7000_cpu_present_to_apicid, .apicid_to_cpu_present = es7000_apicid_to_cpu_present, .setup_portio_remap = NULL, @@ -754,4 +752,5 @@ struct apic __refdata apic_es7000 = { .safe_wait_icr_idle = native_safe_apic_wait_icr_idle, .x86_32_early_logical_apicid = es7000_early_logical_apicid, + .x86_32_numa_cpu_node = es7000_numa_cpu_node, }; diff --git a/arch/x86/kernel/apic/numaq_32.c b/arch/x86/kernel/apic/numaq_32.c index f1a8b120c49d..6273eee5134b 100644 --- a/arch/x86/kernel/apic/numaq_32.c +++ b/arch/x86/kernel/apic/numaq_32.c @@ -391,6 +391,15 @@ static inline int numaq_apicid_to_node(int logical_apicid) return logical_apicid >> 4; } +static int numaq_numa_cpu_node(int cpu) +{ + int logical_apicid = early_per_cpu(x86_cpu_to_logical_apicid, cpu); + + if (logical_apicid != BAD_APICID) + return numaq_apicid_to_node(logical_apicid); + return NUMA_NO_NODE; +} + static void numaq_apicid_to_cpu_present(int logical_apicid, physid_mask_t *retmap) { int node = numaq_apicid_to_node(logical_apicid); @@ -501,7 +510,6 @@ struct apic __refdata apic_numaq = { .ioapic_phys_id_map = numaq_ioapic_phys_id_map, .setup_apic_routing = numaq_setup_apic_routing, .multi_timer_check = numaq_multi_timer_check, - .apicid_to_node = numaq_apicid_to_node, .cpu_present_to_apicid = numaq_cpu_present_to_apicid, .apicid_to_cpu_present = numaq_apicid_to_cpu_present, .setup_portio_remap = numaq_setup_portio_remap, @@ -541,4 +549,5 @@ struct apic __refdata apic_numaq = { .safe_wait_icr_idle = native_safe_apic_wait_icr_idle, .x86_32_early_logical_apicid = noop_x86_32_early_logical_apicid, + .x86_32_numa_cpu_node = numaq_numa_cpu_node, 
}; diff --git a/arch/x86/kernel/apic/probe_32.c b/arch/x86/kernel/apic/probe_32.c index 0f9a9ab49e79..fc84c7b61108 100644 --- a/arch/x86/kernel/apic/probe_32.c +++ b/arch/x86/kernel/apic/probe_32.c @@ -135,7 +135,6 @@ struct apic apic_default = { .ioapic_phys_id_map = default_ioapic_phys_id_map, .setup_apic_routing = setup_apic_flat_routing, .multi_timer_check = NULL, - .apicid_to_node = default_apicid_to_node, .cpu_present_to_apicid = default_cpu_present_to_apicid, .apicid_to_cpu_present = physid_set_mask_of_physid, .setup_portio_remap = NULL, @@ -173,6 +172,7 @@ struct apic apic_default = { .safe_wait_icr_idle = native_safe_apic_wait_icr_idle, .x86_32_early_logical_apicid = default_x86_32_early_logical_apicid, + .x86_32_numa_cpu_node = default_x86_32_numa_cpu_node, }; extern struct apic apic_numaq; diff --git a/arch/x86/kernel/apic/summit_32.c b/arch/x86/kernel/apic/summit_32.c index 8c9147321616..e4b8059b414a 100644 --- a/arch/x86/kernel/apic/summit_32.c +++ b/arch/x86/kernel/apic/summit_32.c @@ -239,15 +239,6 @@ static void summit_setup_apic_routing(void) nr_ioapics); } -static int summit_apicid_to_node(int logical_apicid) -{ -#ifdef CONFIG_SMP - return apicid_2_node[hard_smp_processor_id()]; -#else - return 0; -#endif -} - static int summit_cpu_present_to_apicid(int mps_cpu) { if (mps_cpu < nr_cpu_ids) @@ -523,7 +514,6 @@ struct apic apic_summit = { .ioapic_phys_id_map = summit_ioapic_phys_id_map, .setup_apic_routing = summit_setup_apic_routing, .multi_timer_check = NULL, - .apicid_to_node = summit_apicid_to_node, .cpu_present_to_apicid = summit_cpu_present_to_apicid, .apicid_to_cpu_present = summit_apicid_to_cpu_present, .setup_portio_remap = NULL, @@ -561,4 +551,5 @@ struct apic apic_summit = { .safe_wait_icr_idle = native_safe_apic_wait_icr_idle, .x86_32_early_logical_apicid = summit_early_logical_apicid, + .x86_32_numa_cpu_node = default_x86_32_numa_cpu_node, }; diff --git a/arch/x86/kernel/apic/x2apic_cluster.c b/arch/x86/kernel/apic/x2apic_cluster.c index badc1fdbea27..90949bbd566d 100644 --- a/arch/x86/kernel/apic/x2apic_cluster.c +++ b/arch/x86/kernel/apic/x2apic_cluster.c @@ -206,7 +206,6 @@ struct apic apic_x2apic_cluster = { .ioapic_phys_id_map = NULL, .setup_apic_routing = NULL, .multi_timer_check = NULL, - .apicid_to_node = NULL, .cpu_present_to_apicid = default_cpu_present_to_apicid, .apicid_to_cpu_present = NULL, .setup_portio_remap = NULL, diff --git a/arch/x86/kernel/apic/x2apic_phys.c b/arch/x86/kernel/apic/x2apic_phys.c index f28bf4c5faf2..c7e6d6645bf4 100644 --- a/arch/x86/kernel/apic/x2apic_phys.c +++ b/arch/x86/kernel/apic/x2apic_phys.c @@ -195,7 +195,6 @@ struct apic apic_x2apic_phys = { .ioapic_phys_id_map = NULL, .setup_apic_routing = NULL, .multi_timer_check = NULL, - .apicid_to_node = NULL, .cpu_present_to_apicid = default_cpu_present_to_apicid, .apicid_to_cpu_present = NULL, .setup_portio_remap = NULL, diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c index 60276206b725..3c289281394c 100644 --- a/arch/x86/kernel/apic/x2apic_uv_x.c +++ b/arch/x86/kernel/apic/x2apic_uv_x.c @@ -338,7 +338,6 @@ struct apic __refdata apic_x2apic_uv_x = { .ioapic_phys_id_map = NULL, .setup_apic_routing = NULL, .multi_timer_check = NULL, - .apicid_to_node = NULL, .cpu_present_to_apicid = default_cpu_present_to_apicid, .apicid_to_cpu_present = NULL, .setup_portio_remap = NULL, diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index ca20f6bee3be..5319cdd53765 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ 
-168,10 +168,9 @@ static void unmap_cpu_to_node(int cpu) static void map_cpu_to_logical_apicid(void) { int cpu = smp_processor_id(); - int logical_apicid = early_per_cpu(x86_cpu_to_logical_apicid, cpu); int node; - node = apic->apicid_to_node(logical_apicid); + node = apic->x86_32_numa_cpu_node(cpu); if (!node_online(node)) node = first_online_node; From bbc9e2f452d9c4b166d1f9a78d941d80173312fe Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Sun, 23 Jan 2011 14:37:39 +0100 Subject: [PATCH 087/706] x86: Unify cpu/apicid <-> NUMA node mapping between 32 and 64bit The mapping between cpu/apicid and node is done via apicid_to_node[] on 64bit and apicid_2_node[] + apic->x86_32_numa_cpu_node() on 32bit. This difference makes it difficult to further unify 32 and 64bit NUMA handling. This patch unifies it by replacing both apicid_to_node[] and apicid_2_node[] with the __apicid_to_node[] array, which is accessed by two accessors - set_apicid_to_node() and numa_cpu_node(). On 64bit, numa_cpu_node() always consults __apicid_to_node[] directly while 32bit goes through the apic->x86_32_numa_cpu_node() method to allow apic implementations to override it. srat_detect_node() for amd cpus contains a workaround for broken NUMA configurations which assumes a relationship between APIC ID, HT node ID and NUMA topology. Leave it to access __apicid_to_node[] directly, as mapping through the CPU might result in an undesirable behavior change. The comment is reformatted and updated to note the ugliness. Signed-off-by: Tejun Heo Reviewed-by: Pekka Enberg Cc: eric.dumazet@gmail.com Cc: yinghai@kernel.org Cc: brgerst@gmail.com Cc: gorcunov@gmail.com Cc: shaohui.zheng@intel.com Cc: rientjes@google.com LKML-Reference: <1295789862-25482-14-git-send-email-tj@kernel.org> Signed-off-by: Ingo Molnar Cc: David Rientjes --- arch/x86/include/asm/mpspec.h | 1 - arch/x86/include/asm/numa.h | 28 ++++++++++++++++++++ arch/x86/include/asm/numa_32.h | 6 +++++ arch/x86/include/asm/numa_64.h | 5 ++-- arch/x86/kernel/acpi/boot.c | 3 +-- arch/x86/kernel/apic/apic.c | 2 +- arch/x86/kernel/cpu/amd.c | 47 ++++++++++++++++++++++------------ arch/x86/kernel/cpu/intel.c | 3 +-- arch/x86/kernel/smpboot.c | 3 +-- arch/x86/mm/amdtopology_64.c | 4 +-- arch/x86/mm/numa.c | 6 ++++- arch/x86/mm/numa_32.c | 6 +++++ arch/x86/mm/numa_64.c | 26 ++++++++----------- arch/x86/mm/srat_32.c | 2 +- arch/x86/mm/srat_64.c | 12 ++++----- 15 files changed, 101 insertions(+), 56 deletions(-) diff --git a/arch/x86/include/asm/mpspec.h b/arch/x86/include/asm/mpspec.h index edc2a455b726..9c7d95f6174b 100644 --- a/arch/x86/include/asm/mpspec.h +++ b/arch/x86/include/asm/mpspec.h @@ -25,7 +25,6 @@ extern int pic_mode; #define MAX_IRQ_SOURCES 256 extern unsigned int def_to_bigsmp; -extern u8 apicid_2_node[]; #ifdef CONFIG_X86_NUMAQ extern int mp_bus_id_to_node[MAX_MP_BUSSES]; diff --git a/arch/x86/include/asm/numa.h b/arch/x86/include/asm/numa.h index 27da400d3138..5e01c768a575 100644 --- a/arch/x86/include/asm/numa.h +++ b/arch/x86/include/asm/numa.h @@ -1,5 +1,33 @@ +#ifndef _ASM_X86_NUMA_H +#define _ASM_X86_NUMA_H + +#include + +#ifdef CONFIG_NUMA +/* + * __apicid_to_node[] stores the raw mapping between physical apicid and + * node and is used to initialize cpu_to_node mapping. + * + * The mapping may be overridden by apic->numa_cpu_node() on 32bit and thus + * should be accessed by the accessors - set_apicid_to_node() and * numa_cpu_node().
+ */ +extern s16 __apicid_to_node[MAX_LOCAL_APIC]; + +static inline void set_apicid_to_node(int apicid, s16 node) +{ + __apicid_to_node[apicid] = node; +} +#else /* CONFIG_NUMA */ +static inline void set_apicid_to_node(int apicid, s16 node) +{ +} +#endif /* CONFIG_NUMA */ + #ifdef CONFIG_X86_32 # include "numa_32.h" #else # include "numa_64.h" #endif + +#endif /* _ASM_X86_NUMA_H */ diff --git a/arch/x86/include/asm/numa_32.h b/arch/x86/include/asm/numa_32.h index b0ef2b449a9d..cdf8043d7a1a 100644 --- a/arch/x86/include/asm/numa_32.h +++ b/arch/x86/include/asm/numa_32.h @@ -6,6 +6,12 @@ extern int numa_off; extern int pxm_to_nid(int pxm); extern void numa_remove_cpu(int cpu); +#ifdef CONFIG_NUMA +extern int __cpuinit numa_cpu_node(int apicid); +#else /* CONFIG_NUMA */ +static inline int numa_cpu_node(int cpu) { return NUMA_NO_NODE; } +#endif /* CONFIG_NUMA */ + #ifdef CONFIG_HIGHMEM extern void set_highmem_pages_init(void); #else diff --git a/arch/x86/include/asm/numa_64.h b/arch/x86/include/asm/numa_64.h index 0493be39607c..4982a9c08c2f 100644 --- a/arch/x86/include/asm/numa_64.h +++ b/arch/x86/include/asm/numa_64.h @@ -2,7 +2,6 @@ #define _ASM_X86_NUMA_64_H #include -#include struct bootnode { u64 start; @@ -17,8 +16,6 @@ extern int compute_hash_shift(struct bootnode *nodes, int numblks, extern void numa_init_array(void); extern int numa_off; -extern s16 apicid_to_node[MAX_LOCAL_APIC]; - extern unsigned long numa_free_all_bootmem(void); extern void setup_node_bootmem(int nodeid, unsigned long start, unsigned long end); @@ -32,6 +29,7 @@ extern void setup_node_bootmem(int nodeid, unsigned long start, #define NODE_MIN_SIZE (4*1024*1024) extern void __init init_cpu_to_node(void); +extern int __cpuinit numa_cpu_node(int cpu); extern void __cpuinit numa_set_node(int cpu, int node); extern void __cpuinit numa_clear_node(int cpu); extern void __cpuinit numa_add_cpu(int cpu); @@ -44,6 +42,7 @@ void numa_emu_cmdline(char *); #endif /* CONFIG_NUMA_EMU */ #else static inline void init_cpu_to_node(void) { } +static inline int numa_cpu_node(int cpu) { return NUMA_NO_NODE; } static inline void numa_set_node(int cpu, int node) { } static inline void numa_clear_node(int cpu) { } static inline void numa_add_cpu(int cpu, int node) { } diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c index b3a71137983a..a7bca59ec595 100644 --- a/arch/x86/kernel/acpi/boot.c +++ b/arch/x86/kernel/acpi/boot.c @@ -589,11 +589,10 @@ static void acpi_map_cpu2node(acpi_handle handle, int cpu, int physid) nid = acpi_get_node(handle); if (nid == -1 || !node_online(nid)) return; + set_apicid_to_node(physid, nid); #ifdef CONFIG_X86_64 - apicid_to_node[physid] = nid; numa_set_node(cpu, nid); #else /* CONFIG_X86_32 */ - apicid_2_node[physid] = nid; cpu_to_node_map[cpu] = nid; #endif diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index 0f4f3c152311..4686ea59b7a0 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -2026,7 +2026,7 @@ int default_x86_32_numa_cpu_node(int cpu) int apicid = early_per_cpu(x86_cpu_to_apicid, cpu); if (apicid != BAD_APICID) - return apicid_2_node[apicid]; + return __apicid_to_node[apicid]; return NUMA_NO_NODE; #else return 0; diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index 7c7bedb83c5a..3cce8f2bb2e1 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -234,17 +234,21 @@ static void __cpuinit init_amd_k7(struct cpuinfo_x86 *c) #endif #if defined(CONFIG_NUMA) && defined(CONFIG_X86_64) +/* + * To 
workaround broken NUMA config. Read the comment in + * srat_detect_node(). + */ static int __cpuinit nearby_node(int apicid) { int i, node; for (i = apicid - 1; i >= 0; i--) { - node = apicid_to_node[i]; + node = __apicid_to_node[i]; if (node != NUMA_NO_NODE && node_online(node)) return node; } for (i = apicid + 1; i < MAX_LOCAL_APIC; i++) { - node = apicid_to_node[i]; + node = __apicid_to_node[i]; if (node != NUMA_NO_NODE && node_online(node)) return node; } @@ -339,26 +343,35 @@ static void __cpuinit srat_detect_node(struct cpuinfo_x86 *c) int node; unsigned apicid = c->apicid; - node = per_cpu(cpu_llc_id, cpu); + node = numa_cpu_node(cpu); + if (node == NUMA_NO_NODE) + node = per_cpu(cpu_llc_id, cpu); - if (apicid_to_node[apicid] != NUMA_NO_NODE) - node = apicid_to_node[apicid]; if (!node_online(node)) { - /* Two possibilities here: - - The CPU is missing memory and no node was created. - In that case try picking one from a nearby CPU - - The APIC IDs differ from the HyperTransport node IDs - which the K8 northbridge parsing fills in. - Assume they are all increased by a constant offset, - but in the same order as the HT nodeids. - If that doesn't result in a usable node fall back to the - path for the previous case. */ - + /* + * Two possibilities here: + * + * - The CPU is missing memory and no node was created. In + * that case try picking one from a nearby CPU. + * + * - The APIC IDs differ from the HyperTransport node IDs + * which the K8 northbridge parsing fills in. Assume + * they are all increased by a constant offset, but in + * the same order as the HT nodeids. If that doesn't + * result in a usable node fall back to the path for the + * previous case. + * + * This workaround operates directly on the mapping between + * APIC ID and NUMA node, assuming certain relationship + * between APIC ID, HT node ID and NUMA topology. As going + * through CPU mapping may alter the outcome, directly + * access __apicid_to_node[]. + */ int ht_nodeid = c->initial_apicid; if (ht_nodeid >= 0 && - apicid_to_node[ht_nodeid] != NUMA_NO_NODE) - node = apicid_to_node[ht_nodeid]; + __apicid_to_node[ht_nodeid] != NUMA_NO_NODE) + node = __apicid_to_node[ht_nodeid]; /* Pick a nearby node */ if (!node_online(node)) node = nearby_node(apicid); diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c index d16c2c53d6bf..6052004bf4f4 100644 --- a/arch/x86/kernel/cpu/intel.c +++ b/arch/x86/kernel/cpu/intel.c @@ -279,11 +279,10 @@ static void __cpuinit srat_detect_node(struct cpuinfo_x86 *c) #if defined(CONFIG_NUMA) && defined(CONFIG_X86_64) unsigned node; int cpu = smp_processor_id(); - int apicid = cpu_has_apic ? hard_smp_processor_id() : c->apicid; /* Don't do the funky fallback heuristics the AMD version employs for now. 
*/ - node = apicid_to_node[apicid]; + node = numa_cpu_node(cpu); if (node == NUMA_NO_NODE || !node_online(node)) { /* reuse the value from init_cpu_to_node() */ node = cpu_to_node(cpu); diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 5319cdd53765..b7cfce535cb0 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -71,10 +71,6 @@ #include #include -#ifdef CONFIG_X86_32 -u8 apicid_2_node[MAX_LOCAL_APIC]; -#endif - /* State of each CPU */ DEFINE_PER_CPU(int, cpu_state) = { 0 }; @@ -170,7 +166,7 @@ static void map_cpu_to_logical_apicid(void) int cpu = smp_processor_id(); int node; - node = apic->x86_32_numa_cpu_node(cpu); + node = numa_cpu_node(cpu); if (!node_online(node)) node = first_online_node; diff --git a/arch/x86/mm/amdtopology_64.c b/arch/x86/mm/amdtopology_64.c index f21962c435ed..c7fae38c4080 100644 --- a/arch/x86/mm/amdtopology_64.c +++ b/arch/x86/mm/amdtopology_64.c @@ -247,7 +247,7 @@ void __init amd_fake_nodes(const struct bootnode *nodes, int nr_nodes) __acpi_map_pxm_to_node(nid, i); #endif } - memcpy(apicid_to_node, fake_apicid_to_node, sizeof(apicid_to_node)); + memcpy(__apicid_to_node, fake_apicid_to_node, sizeof(__apicid_to_node)); } #endif /* CONFIG_NUMA_EMU */ @@ -285,7 +285,7 @@ int __init amd_scan_nodes(void) nodes[i].start >> PAGE_SHIFT, nodes[i].end >> PAGE_SHIFT); for (j = apicid_base; j < cores + apicid_base; j++) - apicid_to_node[(i << bits) + j] = i; + set_apicid_to_node((i << bits) + j, i); setup_node_bootmem(i, nodes[i].start, nodes[i].end); } diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c index ebf6d7887a38..480b3571c8b1 100644 --- a/arch/x86/mm/numa.c +++ b/arch/x86/mm/numa.c @@ -26,8 +26,12 @@ static __init int numa_setup(char *opt) early_param("numa", numa_setup); /* - * Which logical CPUs are on which nodes + * apicid, cpu, node mappings */ +s16 __apicid_to_node[MAX_LOCAL_APIC] __cpuinitdata = { + [0 ... MAX_LOCAL_APIC-1] = NUMA_NO_NODE +}; + cpumask_var_t node_to_cpumask_map[MAX_NUMNODES]; EXPORT_SYMBOL(node_to_cpumask_map); diff --git a/arch/x86/mm/numa_32.c b/arch/x86/mm/numa_32.c index 84a3e4c9f277..8d91d227be09 100644 --- a/arch/x86/mm/numa_32.c +++ b/arch/x86/mm/numa_32.c @@ -110,6 +110,12 @@ void set_pmd_pfn(unsigned long vaddr, unsigned long pfn, pgprot_t flags); static unsigned long kva_start_pfn; static unsigned long kva_pages; + +int __cpuinit numa_cpu_node(int cpu) +{ + return apic->x86_32_numa_cpu_node(cpu); +} + /* * FLAT - support for basic PC memory model with discontig enabled, essentially * a single node with all available processors in it with a flat diff --git a/arch/x86/mm/numa_64.c b/arch/x86/mm/numa_64.c index 95ea1551eebc..1e1026f61a5a 100644 --- a/arch/x86/mm/numa_64.c +++ b/arch/x86/mm/numa_64.c @@ -26,10 +26,6 @@ EXPORT_SYMBOL(node_data); struct memnode memnode; -s16 apicid_to_node[MAX_LOCAL_APIC] __cpuinitdata = { - [0 ... 
MAX_LOCAL_APIC-1] = NUMA_NO_NODE -}; - static unsigned long __initdata nodemap_addr; static unsigned long __initdata nodemap_size; @@ -716,12 +712,8 @@ void __init init_cpu_to_node(void) BUG_ON(cpu_to_apicid == NULL); for_each_possible_cpu(cpu) { - int node; - u16 apicid = cpu_to_apicid[cpu]; + int node = numa_cpu_node(cpu); - if (apicid == BAD_APICID) - continue; - node = apicid_to_node[apicid]; if (node == NUMA_NO_NODE) continue; if (!node_online(node)) @@ -731,6 +723,14 @@ void __init init_cpu_to_node(void) } #endif +int __cpuinit numa_cpu_node(int cpu) +{ + int apicid = early_per_cpu(x86_cpu_to_apicid, cpu); + + if (apicid != BAD_APICID) + return __apicid_to_node[apicid]; + return NUMA_NO_NODE; +} void __cpuinit numa_set_node(int cpu, int node) { @@ -776,13 +776,9 @@ void __cpuinit numa_remove_cpu(int cpu) void __cpuinit numa_add_cpu(int cpu) { unsigned long addr; - u16 apicid; - int physnid; - int nid = NUMA_NO_NODE; + int physnid, nid; - apicid = early_per_cpu(x86_cpu_to_apicid, cpu); - if (apicid != BAD_APICID) - nid = apicid_to_node[apicid]; + nid = numa_cpu_node(cpu); if (nid == NUMA_NO_NODE) nid = early_cpu_to_node(cpu); BUG_ON(nid == NUMA_NO_NODE || !node_online(nid)); diff --git a/arch/x86/mm/srat_32.c b/arch/x86/mm/srat_32.c index 6027a4810003..48651c6f657d 100644 --- a/arch/x86/mm/srat_32.c +++ b/arch/x86/mm/srat_32.c @@ -255,7 +255,7 @@ int __init get_memcfg_from_srat(void) num_memory_chunks); for (i = 0; i < MAX_LOCAL_APIC; i++) - apicid_2_node[i] = pxm_to_node(apicid_to_pxm[i]); + set_apicid_to_node(i, pxm_to_node(apicid_to_pxm[i])); for (j = 0; j < num_memory_chunks; j++){ struct node_memory_chunk_s * chunk = &node_memory_chunk[j]; diff --git a/arch/x86/mm/srat_64.c b/arch/x86/mm/srat_64.c index 603d285d1daa..9a97261a2418 100644 --- a/arch/x86/mm/srat_64.c +++ b/arch/x86/mm/srat_64.c @@ -79,7 +79,7 @@ static __init void bad_srat(void) printk(KERN_ERR "SRAT: SRAT not used.\n"); acpi_numa = -1; for (i = 0; i < MAX_LOCAL_APIC; i++) - apicid_to_node[i] = NUMA_NO_NODE; + set_apicid_to_node(i, NUMA_NO_NODE); for (i = 0; i < MAX_NUMNODES; i++) { nodes[i].start = nodes[i].end = 0; nodes_add[i].start = nodes_add[i].end = 0; @@ -138,7 +138,7 @@ acpi_numa_x2apic_affinity_init(struct acpi_srat_x2apic_cpu_affinity *pa) printk(KERN_INFO "SRAT: PXM %u -> APIC 0x%04x -> Node %u skipped apicid that is too big\n", pxm, apic_id, node); return; } - apicid_to_node[apic_id] = node; + set_apicid_to_node(apic_id, node); node_set(node, cpu_nodes_parsed); acpi_numa = 1; printk(KERN_INFO "SRAT: PXM %u -> APIC 0x%04x -> Node %u\n", @@ -178,7 +178,7 @@ acpi_numa_processor_affinity_init(struct acpi_srat_cpu_affinity *pa) return; } - apicid_to_node[apic_id] = node; + set_apicid_to_node(apic_id, node); node_set(node, cpu_nodes_parsed); acpi_numa = 1; printk(KERN_INFO "SRAT: PXM %u -> APIC 0x%02x -> Node %u\n", @@ -521,7 +521,7 @@ void __init acpi_fake_nodes(const struct bootnode *fake_nodes, int num_nodes) * node, it must now point to the fake node ID. */ for (j = 0; j < MAX_LOCAL_APIC; j++) - if (apicid_to_node[j] == nid && + if (__apicid_to_node[j] == nid && fake_apicid_to_node[j] == NUMA_NO_NODE) fake_apicid_to_node[j] = i; } @@ -532,13 +532,13 @@ void __init acpi_fake_nodes(const struct bootnode *fake_nodes, int num_nodes) * value. 
*/ for (i = 0; i < MAX_LOCAL_APIC; i++) - if (apicid_to_node[i] != NUMA_NO_NODE && + if (__apicid_to_node[i] != NUMA_NO_NODE && fake_apicid_to_node[i] == NUMA_NO_NODE) fake_apicid_to_node[i] = 0; for (i = 0; i < num_nodes; i++) __acpi_map_pxm_to_node(fake_node_to_pxm_map[i], i); - memcpy(apicid_to_node, fake_apicid_to_node, sizeof(apicid_to_node)); + memcpy(__apicid_to_node, fake_apicid_to_node, sizeof(__apicid_to_node)); nodes_clear(nodes_parsed); for (i = 0; i < num_nodes; i++) From 645a79195f66eb68ef3ab2b21d9829ac3aa085a9 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Sun, 23 Jan 2011 14:37:40 +0100 Subject: [PATCH 088/706] x86: Unify CPU -> NUMA node mapping between 32 and 64bit Unlike 64bit, 32bit has been using its own cpu_to_node_map[] for CPU -> NUMA node mapping. Replace it with the early_percpu variable x86_cpu_to_node_map and share the mapping code with 64bit. * USE_PERCPU_NUMA_NODE_ID is now enabled for 32bit too. * x86_cpu_to_node_map and numa_set/clear_node() are moved from numa_64 to numa. For now, on 32bit, x86_cpu_to_node_map is initialized with 0 instead of NUMA_NO_NODE. This is to avoid introducing an unexpected behavior change and will be updated once the init path is unified. * srat_detect_node() is now enabled for x86_32 too. It calls numa_set_node() and initializes the mapping, making explicit cpu_to_node_map[] updates from map/unmap_cpu_to_node() unnecessary. Signed-off-by: Tejun Heo Cc: eric.dumazet@gmail.com Cc: yinghai@kernel.org Cc: brgerst@gmail.com Cc: gorcunov@gmail.com Cc: penberg@kernel.org Cc: shaohui.zheng@intel.com Cc: rientjes@google.com LKML-Reference: <1295789862-25482-15-git-send-email-tj@kernel.org> Signed-off-by: Ingo Molnar Cc: David Rientjes --- arch/x86/Kconfig | 2 +- arch/x86/include/asm/numa.h | 8 ++++ arch/x86/include/asm/numa_64.h | 4 -- arch/x86/include/asm/topology.h | 17 -------- arch/x86/kernel/acpi/boot.c | 5 --- arch/x86/kernel/cpu/amd.c | 4 +- arch/x86/kernel/cpu/intel.c | 2 +- arch/x86/kernel/setup_percpu.c | 4 +- arch/x86/kernel/smpboot.c | 6 --- arch/x86/mm/numa.c | 72 ++++++++++++++++++++++++++++++++- arch/x86/mm/numa_64.c | 65 ----------------------------- 11 files changed, 85 insertions(+), 104 deletions(-) diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index d5ed94d30aad..95c36c474766 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -1705,7 +1705,7 @@ config HAVE_ARCH_EARLY_PFN_TO_NID depends on NUMA config USE_PERCPU_NUMA_NODE_ID - def_bool X86_64 + def_bool y depends on NUMA menu "Power management and ACPI options" diff --git a/arch/x86/include/asm/numa.h b/arch/x86/include/asm/numa.h index 5e01c768a575..2b21fff9f655 100644 --- a/arch/x86/include/asm/numa.h +++ b/arch/x86/include/asm/numa.h @@ -30,4 +30,12 @@ static inline void set_apicid_to_node(int apicid, s16 node) # include "numa_64.h" #endif +#ifdef CONFIG_NUMA +extern void __cpuinit numa_set_node(int cpu, int node); +extern void __cpuinit numa_clear_node(int cpu); +#else /* CONFIG_NUMA */ +static inline void numa_set_node(int cpu, int node) { } +static inline void numa_clear_node(int cpu) { } +#endif /* CONFIG_NUMA */ + #endif /* _ASM_X86_NUMA_H */ diff --git a/arch/x86/include/asm/numa_64.h b/arch/x86/include/asm/numa_64.h index 4982a9c08c2f..6ead9f361bda 100644 --- a/arch/x86/include/asm/numa_64.h +++ b/arch/x86/include/asm/numa_64.h @@ -30,8 +30,6 @@ extern void setup_node_bootmem(int nodeid, unsigned long start, extern void __init init_cpu_to_node(void); extern int __cpuinit numa_cpu_node(int cpu); -extern void __cpuinit numa_set_node(int cpu, int node); -extern void
__cpuinit numa_clear_node(int cpu); extern void __cpuinit numa_add_cpu(int cpu); extern void __cpuinit numa_remove_cpu(int cpu); @@ -43,8 +41,6 @@ void numa_emu_cmdline(char *); #else static inline void init_cpu_to_node(void) { } static inline int numa_cpu_node(int cpu) { return NUMA_NO_NODE; } -static inline void numa_set_node(int cpu, int node) { } -static inline void numa_clear_node(int cpu) { } static inline void numa_add_cpu(int cpu, int node) { } static inline void numa_remove_cpu(int cpu) { } #endif diff --git a/arch/x86/include/asm/topology.h b/arch/x86/include/asm/topology.h index 21899cc31e52..b101c17861f5 100644 --- a/arch/x86/include/asm/topology.h +++ b/arch/x86/include/asm/topology.h @@ -47,21 +47,6 @@ #include -#ifdef CONFIG_X86_32 - -/* Mappings between logical cpu number and node number */ -extern int cpu_to_node_map[]; - -/* Returns the number of the node containing CPU 'cpu' */ -static inline int __cpu_to_node(int cpu) -{ - return cpu_to_node_map[cpu]; -} -#define early_cpu_to_node __cpu_to_node -#define cpu_to_node __cpu_to_node - -#else /* CONFIG_X86_64 */ - /* Mappings between logical cpu number and node number */ DECLARE_EARLY_PER_CPU(int, x86_cpu_to_node_map); @@ -84,8 +69,6 @@ static inline int early_cpu_to_node(int cpu) #endif /* !CONFIG_DEBUG_PER_CPU_MAPS */ -#endif /* CONFIG_X86_64 */ - /* Mappings between node number and cpus on that node. */ extern cpumask_var_t node_to_cpumask_map[MAX_NUMNODES]; diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c index a7bca59ec595..a2c512175395 100644 --- a/arch/x86/kernel/acpi/boot.c +++ b/arch/x86/kernel/acpi/boot.c @@ -590,12 +590,7 @@ static void acpi_map_cpu2node(acpi_handle handle, int cpu, int physid) if (nid == -1 || !node_online(nid)) return; set_apicid_to_node(physid, nid); -#ifdef CONFIG_X86_64 numa_set_node(cpu, nid); -#else /* CONFIG_X86_32 */ - cpu_to_node_map[cpu] = nid; -#endif - #endif } diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index 3cce8f2bb2e1..77858fd64620 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -233,7 +233,7 @@ static void __cpuinit init_amd_k7(struct cpuinfo_x86 *c) } #endif -#if defined(CONFIG_NUMA) && defined(CONFIG_X86_64) +#ifdef CONFIG_NUMA /* * To workaround broken NUMA config. Read the comment in * srat_detect_node(). 
@@ -338,7 +338,7 @@ EXPORT_SYMBOL_GPL(amd_get_nb_id); static void __cpuinit srat_detect_node(struct cpuinfo_x86 *c) { -#if defined(CONFIG_NUMA) && defined(CONFIG_X86_64) +#ifdef CONFIG_NUMA int cpu = smp_processor_id(); int node; unsigned apicid = c->apicid; diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c index 6052004bf4f4..df86bc8c859d 100644 --- a/arch/x86/kernel/cpu/intel.c +++ b/arch/x86/kernel/cpu/intel.c @@ -276,7 +276,7 @@ static void __cpuinit intel_workarounds(struct cpuinfo_x86 *c) static void __cpuinit srat_detect_node(struct cpuinfo_x86 *c) { -#if defined(CONFIG_NUMA) && defined(CONFIG_X86_64) +#ifdef CONFIG_NUMA unsigned node; int cpu = smp_processor_id(); diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c index b5147f00f0ef..71f4727da373 100644 --- a/arch/x86/kernel/setup_percpu.c +++ b/arch/x86/kernel/setup_percpu.c @@ -233,6 +233,7 @@ void __init setup_per_cpu_areas(void) per_cpu(irq_stack_ptr, cpu) = per_cpu(irq_stack_union.irq_stack, cpu) + IRQ_STACK_SIZE - 64; +#endif #ifdef CONFIG_NUMA per_cpu(x86_cpu_to_node_map, cpu) = early_per_cpu_map(x86_cpu_to_node_map, cpu); @@ -245,7 +246,6 @@ void __init setup_per_cpu_areas(void) * So set them all (boot cpu and all APs). */ set_cpu_numa_node(cpu, early_cpu_to_node(cpu)); -#endif #endif /* * Up to this point, the boot CPU has been using .init.data @@ -263,7 +263,7 @@ void __init setup_per_cpu_areas(void) #ifdef CONFIG_X86_32 early_per_cpu_ptr(x86_cpu_to_logical_apicid) = NULL; #endif -#if defined(CONFIG_X86_64) && defined(CONFIG_NUMA) +#ifdef CONFIG_NUMA early_per_cpu_ptr(x86_cpu_to_node_map) = NULL; #endif diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index b7cfce535cb0..2c203822424f 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -133,16 +133,11 @@ EXPORT_PER_CPU_SYMBOL(cpu_info); atomic_t init_deasserted; #if defined(CONFIG_NUMA) && defined(CONFIG_X86_32) -/* which node each logical CPU is on */ -int cpu_to_node_map[NR_CPUS] __read_mostly = { [0 ... NR_CPUS-1] = 0 }; -EXPORT_SYMBOL(cpu_to_node_map); - /* set up a mapping between cpu and node. */ static void map_cpu_to_node(int cpu, int node) { printk(KERN_INFO "Mapping cpu %d to node %d\n", cpu, node); cpumask_set_cpu(cpu, node_to_cpumask_map[node]); - cpu_to_node_map[cpu] = node; } /* undo a mapping between cpu and node. 
*/ @@ -153,7 +148,6 @@ static void unmap_cpu_to_node(int cpu) printk(KERN_INFO "Unmapping cpu %d from all nodes\n", cpu); for (node = 0; node < MAX_NUMNODES; node++) cpumask_clear_cpu(cpu, node_to_cpumask_map[node]); - cpu_to_node_map[cpu] = 0; } #else /* !(CONFIG_NUMA && CONFIG_X86_32) */ #define map_cpu_to_node(cpu, node) ({}) diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c index 480b3571c8b1..187810be3d6c 100644 --- a/arch/x86/mm/numa.c +++ b/arch/x86/mm/numa.c @@ -35,6 +35,44 @@ s16 __apicid_to_node[MAX_LOCAL_APIC] __cpuinitdata = { cpumask_var_t node_to_cpumask_map[MAX_NUMNODES]; EXPORT_SYMBOL(node_to_cpumask_map); +/* + * Map cpu index to node index + */ +#ifdef CONFIG_X86_32 +DEFINE_EARLY_PER_CPU(int, x86_cpu_to_node_map, 0); +#else +DEFINE_EARLY_PER_CPU(int, x86_cpu_to_node_map, NUMA_NO_NODE); +#endif +EXPORT_EARLY_PER_CPU_SYMBOL(x86_cpu_to_node_map); + +void __cpuinit numa_set_node(int cpu, int node) +{ + int *cpu_to_node_map = early_per_cpu_ptr(x86_cpu_to_node_map); + + /* early setting, no percpu area yet */ + if (cpu_to_node_map) { + cpu_to_node_map[cpu] = node; + return; + } + +#ifdef CONFIG_DEBUG_PER_CPU_MAPS + if (cpu >= nr_cpu_ids || !cpu_possible(cpu)) { + printk(KERN_ERR "numa_set_node: invalid cpu# (%d)\n", cpu); + dump_stack(); + return; + } +#endif + per_cpu(x86_cpu_to_node_map, cpu) = node; + + if (node != NUMA_NO_NODE) + set_cpu_numa_node(cpu, node); +} + +void __cpuinit numa_clear_node(int cpu) +{ + numa_set_node(cpu, NUMA_NO_NODE); +} + /* * Allocate node_to_cpumask_map based on number of available nodes * Requires node_possible_map to be valid. @@ -62,6 +100,37 @@ void __init setup_node_to_cpumask_map(void) } #ifdef CONFIG_DEBUG_PER_CPU_MAPS + +int __cpu_to_node(int cpu) +{ + if (early_per_cpu_ptr(x86_cpu_to_node_map)) { + printk(KERN_WARNING + "cpu_to_node(%d): usage too early!\n", cpu); + dump_stack(); + return early_per_cpu_ptr(x86_cpu_to_node_map)[cpu]; + } + return per_cpu(x86_cpu_to_node_map, cpu); +} +EXPORT_SYMBOL(__cpu_to_node); + +/* + * Same function as cpu_to_node() but used if called before the + * per_cpu areas are setup. + */ +int early_cpu_to_node(int cpu) +{ + if (early_per_cpu_ptr(x86_cpu_to_node_map)) + return early_per_cpu_ptr(x86_cpu_to_node_map)[cpu]; + + if (!cpu_possible(cpu)) { + printk(KERN_WARNING + "early_cpu_to_node(%d): no per_cpu area!\n", cpu); + dump_stack(); + return NUMA_NO_NODE; + } + return per_cpu(x86_cpu_to_node_map, cpu); +} + /* * Returns a pointer to the bitmask of CPUs on Node 'node'. 
*/ @@ -84,4 +153,5 @@ const struct cpumask *cpumask_of_node(int node) return node_to_cpumask_map[node]; } EXPORT_SYMBOL(cpumask_of_node); -#endif + +#endif /* CONFIG_DEBUG_PER_CPU_MAPS */ diff --git a/arch/x86/mm/numa_64.c b/arch/x86/mm/numa_64.c index 1e1026f61a5a..f5459400644d 100644 --- a/arch/x86/mm/numa_64.c +++ b/arch/x86/mm/numa_64.c @@ -29,12 +29,6 @@ struct memnode memnode; static unsigned long __initdata nodemap_addr; static unsigned long __initdata nodemap_size; -/* - * Map cpu index to node index - */ -DEFINE_EARLY_PER_CPU(int, x86_cpu_to_node_map, NUMA_NO_NODE); -EXPORT_EARLY_PER_CPU_SYMBOL(x86_cpu_to_node_map); - /* * Given a shift value, try to populate memnodemap[] * Returns : @@ -732,34 +726,6 @@ int __cpuinit numa_cpu_node(int cpu) return NUMA_NO_NODE; } -void __cpuinit numa_set_node(int cpu, int node) -{ - int *cpu_to_node_map = early_per_cpu_ptr(x86_cpu_to_node_map); - - /* early setting, no percpu area yet */ - if (cpu_to_node_map) { - cpu_to_node_map[cpu] = node; - return; - } - -#ifdef CONFIG_DEBUG_PER_CPU_MAPS - if (cpu >= nr_cpu_ids || !cpu_possible(cpu)) { - printk(KERN_ERR "numa_set_node: invalid cpu# (%d)\n", cpu); - dump_stack(); - return; - } -#endif - per_cpu(x86_cpu_to_node_map, cpu) = node; - - if (node != NUMA_NO_NODE) - set_cpu_numa_node(cpu, node); -} - -void __cpuinit numa_clear_node(int cpu) -{ - numa_set_node(cpu, NUMA_NO_NODE); -} - #ifndef CONFIG_DEBUG_PER_CPU_MAPS #ifndef CONFIG_NUMA_EMU @@ -887,37 +853,6 @@ void __cpuinit numa_remove_cpu(int cpu) { numa_set_cpumask(cpu, 0); } - -int __cpu_to_node(int cpu) -{ - if (early_per_cpu_ptr(x86_cpu_to_node_map)) { - printk(KERN_WARNING - "cpu_to_node(%d): usage too early!\n", cpu); - dump_stack(); - return early_per_cpu_ptr(x86_cpu_to_node_map)[cpu]; - } - return per_cpu(x86_cpu_to_node_map, cpu); -} -EXPORT_SYMBOL(__cpu_to_node); - -/* - * Same function as cpu_to_node() but used if called before the - * per_cpu areas are setup. - */ -int early_cpu_to_node(int cpu) -{ - if (early_per_cpu_ptr(x86_cpu_to_node_map)) - return early_per_cpu_ptr(x86_cpu_to_node_map)[cpu]; - - if (!cpu_possible(cpu)) { - printk(KERN_WARNING - "early_cpu_to_node(%d): no per_cpu area!\n", cpu); - dump_stack(); - return NUMA_NO_NODE; - } - return per_cpu(x86_cpu_to_node_map, cpu); -} - /* * --------- end of debug versions of the numa functions --------- */ From de2d9445f1627830ed2ebd00ee9d851986c940b5 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Sun, 23 Jan 2011 14:37:41 +0100 Subject: [PATCH 089/706] x86: Unify node_to_cpumask_map handling between 32 and 64bit x86_32 has been managing node_to_cpumask_map explicitly from map_cpu_to_node() and friends in a rather ugly way. With previous changes, it's now possible to share the code with 64bit. * When CONFIG_NUMA_EMU is disabled, numa_add/remove_cpu() are implemented in numa.c and shared by 32 and 64bit. CONFIG_NUMA_EMU versions still live in numa_64.c. NUMA_EMU's dependency on 64bit is planned to be removed and the above should go away together. * identify_cpu() now calls numa_add_cpu() for 32bit too. This makes the explicit mask management from map_cpu_to_node() unnecessary. * The whole x86_32 specific map_cpu_to_node() chunk is no longer necessary. Dropped. 
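The shared shape of the !CONFIG_NUMA_EMU add/remove pair can be sketched in isolation (illustration only, not the patch: a uint64_t stands in for cpumask_t and a fixed cpu-to-node table replaces early_cpu_to_node()):

    #include <stdio.h>
    #include <stdint.h>

    #define MAX_NUMNODES 4

    /* one mask of cpus per node; stand-in for node_to_cpumask_map[] */
    static uint64_t node_to_cpumask[MAX_NUMNODES];

    /* stand-in for early_cpu_to_node(cpu) */
    static int cpu_node[4] = { 0, 0, 1, 1 };

    static void numa_add_cpu(int cpu)
    {
        node_to_cpumask[cpu_node[cpu]] |= (uint64_t)1 << cpu;
    }

    static void numa_remove_cpu(int cpu)
    {
        node_to_cpumask[cpu_node[cpu]] &= ~((uint64_t)1 << cpu);
    }

    int main(void)
    {
        int cpu, node;

        for (cpu = 0; cpu < 4; cpu++)
            numa_add_cpu(cpu);      /* what identify_cpu() now triggers */
        numa_remove_cpu(1);         /* a departing cpu clears its bit */

        for (node = 0; node < 2; node++)
            printf("node %d: cpumask %#llx\n", node,
                   (unsigned long long)node_to_cpumask[node]);
        return 0;
    }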
Signed-off-by: Tejun Heo Reviewed-by: Pekka Enberg Cc: eric.dumazet@gmail.com Cc: yinghai@kernel.org Cc: brgerst@gmail.com Cc: gorcunov@gmail.com Cc: shaohui.zheng@intel.com Cc: rientjes@google.com LKML-Reference: <1295789862-25482-16-git-send-email-tj@kernel.org> Signed-off-by: Ingo Molnar Cc: David Rientjes Cc: Shaohui Zheng --- arch/x86/include/asm/numa.h | 9 ++++ arch/x86/include/asm/numa_32.h | 1 - arch/x86/include/asm/numa_64.h | 4 -- arch/x86/kernel/cpu/common.c | 2 +- arch/x86/kernel/smpboot.c | 47 --------------------- arch/x86/mm/numa.c | 64 ++++++++++++++++++++++++++++- arch/x86/mm/numa_64.c | 75 +++++++--------------------------- 7 files changed, 87 insertions(+), 115 deletions(-) diff --git a/arch/x86/include/asm/numa.h b/arch/x86/include/asm/numa.h index 2b21fff9f655..d3964b28b128 100644 --- a/arch/x86/include/asm/numa.h +++ b/arch/x86/include/asm/numa.h @@ -1,6 +1,7 @@ #ifndef _ASM_X86_NUMA_H #define _ASM_X86_NUMA_H +#include #include #ifdef CONFIG_NUMA @@ -33,9 +34,17 @@ static inline void set_apicid_to_node(int apicid, s16 node) #ifdef CONFIG_NUMA extern void __cpuinit numa_set_node(int cpu, int node); extern void __cpuinit numa_clear_node(int cpu); +extern void __cpuinit numa_add_cpu(int cpu); +extern void __cpuinit numa_remove_cpu(int cpu); #else /* CONFIG_NUMA */ static inline void numa_set_node(int cpu, int node) { } static inline void numa_clear_node(int cpu) { } +static inline void numa_add_cpu(int cpu) { } +static inline void numa_remove_cpu(int cpu) { } #endif /* CONFIG_NUMA */ +#ifdef CONFIG_DEBUG_PER_CPU_MAPS +struct cpumask __cpuinit *debug_cpumask_set_cpu(int cpu, int enable); +#endif + #endif /* _ASM_X86_NUMA_H */ diff --git a/arch/x86/include/asm/numa_32.h b/arch/x86/include/asm/numa_32.h index cdf8043d7a1a..bc66031afa1f 100644 --- a/arch/x86/include/asm/numa_32.h +++ b/arch/x86/include/asm/numa_32.h @@ -4,7 +4,6 @@ extern int numa_off; extern int pxm_to_nid(int pxm); -extern void numa_remove_cpu(int cpu); #ifdef CONFIG_NUMA extern int __cpuinit numa_cpu_node(int apicid); diff --git a/arch/x86/include/asm/numa_64.h b/arch/x86/include/asm/numa_64.h index 6ead9f361bda..123f1856101c 100644 --- a/arch/x86/include/asm/numa_64.h +++ b/arch/x86/include/asm/numa_64.h @@ -30,8 +30,6 @@ extern void setup_node_bootmem(int nodeid, unsigned long start, extern void __init init_cpu_to_node(void); extern int __cpuinit numa_cpu_node(int cpu); -extern void __cpuinit numa_add_cpu(int cpu); -extern void __cpuinit numa_remove_cpu(int cpu); #ifdef CONFIG_NUMA_EMU #define FAKE_NODE_MIN_SIZE ((u64)32 << 20) @@ -41,8 +39,6 @@ void numa_emu_cmdline(char *); #else static inline void init_cpu_to_node(void) { } static inline int numa_cpu_node(int cpu) { return NUMA_NO_NODE; } -static inline void numa_add_cpu(int cpu, int node) { } -static inline void numa_remove_cpu(int cpu) { } #endif #endif /* _ASM_X86_NUMA_64_H */ diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 1d59834396bd..a2559c3ed500 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -869,7 +869,7 @@ static void __cpuinit identify_cpu(struct cpuinfo_x86 *c) select_idle_routine(c); -#if defined(CONFIG_NUMA) && defined(CONFIG_X86_64) +#ifdef CONFIG_NUMA numa_add_cpu(smp_processor_id()); #endif } diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 2c203822424f..522b2173888a 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -132,49 +132,6 @@ EXPORT_PER_CPU_SYMBOL(cpu_info); atomic_t init_deasserted; -#if defined(CONFIG_NUMA) && 
defined(CONFIG_X86_32) -/* set up a mapping between cpu and node. */ -static void map_cpu_to_node(int cpu, int node) -{ - printk(KERN_INFO "Mapping cpu %d to node %d\n", cpu, node); - cpumask_set_cpu(cpu, node_to_cpumask_map[node]); -} - -/* undo a mapping between cpu and node. */ -static void unmap_cpu_to_node(int cpu) -{ - int node; - - printk(KERN_INFO "Unmapping cpu %d from all nodes\n", cpu); - for (node = 0; node < MAX_NUMNODES; node++) - cpumask_clear_cpu(cpu, node_to_cpumask_map[node]); -} -#else /* !(CONFIG_NUMA && CONFIG_X86_32) */ -#define map_cpu_to_node(cpu, node) ({}) -#define unmap_cpu_to_node(cpu) ({}) -#endif - -#ifdef CONFIG_X86_32 -static void map_cpu_to_logical_apicid(void) -{ - int cpu = smp_processor_id(); - int node; - - node = numa_cpu_node(cpu); - if (!node_online(node)) - node = first_online_node; - - map_cpu_to_node(cpu, node); -} - -void numa_remove_cpu(int cpu) -{ - unmap_cpu_to_node(cpu); -} -#else -#define map_cpu_to_logical_apicid() do {} while (0) -#endif - /* * Report back to the Boot Processor. * Running on AP. @@ -242,7 +199,6 @@ static void __cpuinit smp_callin(void) apic->smp_callin_clear_local_apic(); setup_local_APIC(); end_local_APIC_setup(); - map_cpu_to_logical_apicid(); /* * Need to setup vector mappings before we enable interrupts. @@ -943,7 +899,6 @@ static __init void disable_smp(void) physid_set_mask_of_physid(boot_cpu_physical_apicid, &phys_cpu_present_map); else physid_set_mask_of_physid(0, &phys_cpu_present_map); - map_cpu_to_logical_apicid(); cpumask_set_cpu(0, cpu_sibling_mask(0)); cpumask_set_cpu(0, cpu_core_mask(0)); } @@ -1120,8 +1075,6 @@ void __init native_smp_prepare_cpus(unsigned int max_cpus) end_local_APIC_setup(); - map_cpu_to_logical_apicid(); - if (apic->setup_portio_remap) apic->setup_portio_remap(); diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c index 187810be3d6c..75abecb614c9 100644 --- a/arch/x86/mm/numa.c +++ b/arch/x86/mm/numa.c @@ -99,7 +99,21 @@ void __init setup_node_to_cpumask_map(void) pr_debug("Node to cpumask map for %d nodes\n", nr_node_ids); } -#ifdef CONFIG_DEBUG_PER_CPU_MAPS +#ifndef CONFIG_DEBUG_PER_CPU_MAPS + +# ifndef CONFIG_NUMA_EMU +void __cpuinit numa_add_cpu(int cpu) +{ + cpumask_set_cpu(cpu, node_to_cpumask_map[early_cpu_to_node(cpu)]); +} + +void __cpuinit numa_remove_cpu(int cpu) +{ + cpumask_clear_cpu(cpu, node_to_cpumask_map[early_cpu_to_node(cpu)]); +} +# endif /* !CONFIG_NUMA_EMU */ + +#else /* !CONFIG_DEBUG_PER_CPU_MAPS */ int __cpu_to_node(int cpu) { @@ -131,6 +145,52 @@ int early_cpu_to_node(int cpu) return per_cpu(x86_cpu_to_node_map, cpu); } +struct cpumask __cpuinit *debug_cpumask_set_cpu(int cpu, int enable) +{ + int node = early_cpu_to_node(cpu); + struct cpumask *mask; + char buf[64]; + + mask = node_to_cpumask_map[node]; + if (!mask) { + pr_err("node_to_cpumask_map[%i] NULL\n", node); + dump_stack(); + return NULL; + } + + cpulist_scnprintf(buf, sizeof(buf), mask); + printk(KERN_DEBUG "%s cpu %d node %d: mask now %s\n", + enable ? 
"numa_add_cpu" : "numa_remove_cpu", + cpu, node, buf); + return mask; +} + +# ifndef CONFIG_NUMA_EMU +static void __cpuinit numa_set_cpumask(int cpu, int enable) +{ + struct cpumask *mask; + + mask = debug_cpumask_set_cpu(cpu, enable); + if (!mask) + return; + + if (enable) + cpumask_set_cpu(cpu, mask); + else + cpumask_clear_cpu(cpu, mask); +} + +void __cpuinit numa_add_cpu(int cpu) +{ + numa_set_cpumask(cpu, 1); +} + +void __cpuinit numa_remove_cpu(int cpu) +{ + numa_set_cpumask(cpu, 0); +} +# endif /* !CONFIG_NUMA_EMU */ + /* * Returns a pointer to the bitmask of CPUs on Node 'node'. */ @@ -154,4 +214,4 @@ const struct cpumask *cpumask_of_node(int node) } EXPORT_SYMBOL(cpumask_of_node); -#endif /* CONFIG_DEBUG_PER_CPU_MAPS */ +#endif /* !CONFIG_DEBUG_PER_CPU_MAPS */ diff --git a/arch/x86/mm/numa_64.c b/arch/x86/mm/numa_64.c index f5459400644d..14664f58a759 100644 --- a/arch/x86/mm/numa_64.c +++ b/arch/x86/mm/numa_64.c @@ -726,19 +726,18 @@ int __cpuinit numa_cpu_node(int cpu) return NUMA_NO_NODE; } -#ifndef CONFIG_DEBUG_PER_CPU_MAPS - -#ifndef CONFIG_NUMA_EMU -void __cpuinit numa_add_cpu(int cpu) -{ - cpumask_set_cpu(cpu, node_to_cpumask_map[early_cpu_to_node(cpu)]); -} - -void __cpuinit numa_remove_cpu(int cpu) -{ - cpumask_clear_cpu(cpu, node_to_cpumask_map[early_cpu_to_node(cpu)]); -} -#else +/* + * UGLINESS AHEAD: Currently, CONFIG_NUMA_EMU is 64bit only and makes use + * of 64bit specific data structures. The distinction is artificial and + * should be removed. numa_{add|remove}_cpu() are implemented in numa.c + * for both 32 and 64bit when CONFIG_NUMA_EMU is disabled but here when + * enabled. + * + * NUMA emulation is planned to be made generic and the following and other + * related code should be moved to numa.c. + */ +#ifdef CONFIG_NUMA_EMU +# ifndef CONFIG_DEBUG_PER_CPU_MAPS void __cpuinit numa_add_cpu(int cpu) { unsigned long addr; @@ -778,47 +777,7 @@ void __cpuinit numa_remove_cpu(int cpu) for_each_online_node(i) cpumask_clear_cpu(cpu, node_to_cpumask_map[i]); } -#endif /* !CONFIG_NUMA_EMU */ - -#else /* CONFIG_DEBUG_PER_CPU_MAPS */ -static struct cpumask __cpuinit *debug_cpumask_set_cpu(int cpu, int enable) -{ - int node = early_cpu_to_node(cpu); - struct cpumask *mask; - char buf[64]; - - mask = node_to_cpumask_map[node]; - if (!mask) { - pr_err("node_to_cpumask_map[%i] NULL\n", node); - dump_stack(); - return NULL; - } - - cpulist_scnprintf(buf, sizeof(buf), mask); - printk(KERN_DEBUG "%s cpu %d node %d: mask now %s\n", - enable ? 
"numa_add_cpu" : "numa_remove_cpu", - cpu, node, buf); - return mask; -} - -/* - * --------- debug versions of the numa functions --------- - */ -#ifndef CONFIG_NUMA_EMU -static void __cpuinit numa_set_cpumask(int cpu, int enable) -{ - struct cpumask *mask; - - mask = debug_cpumask_set_cpu(cpu, enable); - if (!mask) - return; - - if (enable) - cpumask_set_cpu(cpu, mask); - else - cpumask_clear_cpu(cpu, mask); -} -#else +# else /* !CONFIG_DEBUG_PER_CPU_MAPS */ static void __cpuinit numa_set_cpumask(int cpu, int enable) { int node = early_cpu_to_node(cpu); @@ -842,7 +801,6 @@ static void __cpuinit numa_set_cpumask(int cpu, int enable) cpumask_clear_cpu(cpu, mask); } } -#endif /* CONFIG_NUMA_EMU */ void __cpuinit numa_add_cpu(int cpu) { @@ -853,8 +811,5 @@ void __cpuinit numa_remove_cpu(int cpu) { numa_set_cpumask(cpu, 0); } -/* - * --------- end of debug versions of the numa functions --------- - */ - -#endif /* CONFIG_DEBUG_PER_CPU_MAPS */ +# endif /* !CONFIG_DEBUG_PER_CPU_MAPS */ +#endif /* CONFIG_NUMA_EMU */ From 8db78cc4b4048e3add40bca1bc3e55057c319256 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Sun, 23 Jan 2011 14:37:42 +0100 Subject: [PATCH 090/706] x86: Unify NUMA initialization between 32 and 64bit Now that everything else is unified, NUMA initialization can be unified too. * numa_init_array() and init_cpu_to_node() are moved from numa_64 to numa. * numa_32::initmem_init() is updated to call numa_init_array() and setup_arch() to call init_cpu_to_node() on 32bit too. * x86_cpu_to_node_map is now initialized to NUMA_NO_NODE on 32bit too. This is safe now as numa_init_array() will initialize it early during boot. This makes NUMA mapping fully initialized before setup_per_cpu_areas() on 32bit too and thus makes the first percpu chunk which contains all the static variables and some of dynamic area allocated with NUMA affinity correctly considered. 
Signed-off-by: Tejun Heo Cc: yinghai@kernel.org Cc: brgerst@gmail.com Cc: gorcunov@gmail.com Cc: shaohui.zheng@intel.com Cc: rientjes@google.com LKML-Reference: <1295789862-25482-17-git-send-email-tj@kernel.org> Signed-off-by: Ingo Molnar Reported-by: Eric Dumazet Reviewed-by: Pekka Enberg --- arch/x86/include/asm/numa.h | 4 ++ arch/x86/include/asm/numa_64.h | 3 -- arch/x86/kernel/setup.c | 2 - arch/x86/mm/numa.c | 76 ++++++++++++++++++++++++++++++++-- arch/x86/mm/numa_32.c | 1 + arch/x86/mm/numa_64.c | 75 --------------------------------- 6 files changed, 77 insertions(+), 84 deletions(-) diff --git a/arch/x86/include/asm/numa.h b/arch/x86/include/asm/numa.h index d3964b28b128..26fc6e2dd0fb 100644 --- a/arch/x86/include/asm/numa.h +++ b/arch/x86/include/asm/numa.h @@ -34,11 +34,15 @@ static inline void set_apicid_to_node(int apicid, s16 node) #ifdef CONFIG_NUMA extern void __cpuinit numa_set_node(int cpu, int node); extern void __cpuinit numa_clear_node(int cpu); +extern void __init numa_init_array(void); +extern void __init init_cpu_to_node(void); extern void __cpuinit numa_add_cpu(int cpu); extern void __cpuinit numa_remove_cpu(int cpu); #else /* CONFIG_NUMA */ static inline void numa_set_node(int cpu, int node) { } static inline void numa_clear_node(int cpu) { } +static inline void numa_init_array(void) { } +static inline void init_cpu_to_node(void) { } static inline void numa_add_cpu(int cpu) { } static inline void numa_remove_cpu(int cpu) { } #endif /* CONFIG_NUMA */ diff --git a/arch/x86/include/asm/numa_64.h b/arch/x86/include/asm/numa_64.h index 123f1856101c..2819afa33632 100644 --- a/arch/x86/include/asm/numa_64.h +++ b/arch/x86/include/asm/numa_64.h @@ -13,7 +13,6 @@ extern int compute_hash_shift(struct bootnode *nodes, int numblks, #define ZONE_ALIGN (1UL << (MAX_ORDER+PAGE_SHIFT)) -extern void numa_init_array(void); extern int numa_off; extern unsigned long numa_free_all_bootmem(void); @@ -28,7 +27,6 @@ extern void setup_node_bootmem(int nodeid, unsigned long start, */ #define NODE_MIN_SIZE (4*1024*1024) -extern void __init init_cpu_to_node(void); extern int __cpuinit numa_cpu_node(int cpu); #ifdef CONFIG_NUMA_EMU @@ -37,7 +35,6 @@ extern int __cpuinit numa_cpu_node(int cpu); void numa_emu_cmdline(char *); #endif /* CONFIG_NUMA_EMU */ #else -static inline void init_cpu_to_node(void) { } static inline int numa_cpu_node(int cpu) { return NUMA_NO_NODE; } #endif diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index d3cfe26c0252..12023412fdf7 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -1040,9 +1040,7 @@ void __init setup_arch(char **cmdline_p) prefill_possible_map(); -#ifdef CONFIG_X86_64 init_cpu_to_node(); -#endif init_apic_mappings(); ioapic_and_gsi_init(); diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c index 75abecb614c9..bf60715bd1b7 100644 --- a/arch/x86/mm/numa.c +++ b/arch/x86/mm/numa.c @@ -38,11 +38,7 @@ EXPORT_SYMBOL(node_to_cpumask_map); /* * Map cpu index to node index */ -#ifdef CONFIG_X86_32 -DEFINE_EARLY_PER_CPU(int, x86_cpu_to_node_map, 0); -#else DEFINE_EARLY_PER_CPU(int, x86_cpu_to_node_map, NUMA_NO_NODE); -#endif EXPORT_EARLY_PER_CPU_SYMBOL(x86_cpu_to_node_map); void __cpuinit numa_set_node(int cpu, int node) @@ -99,6 +95,78 @@ void __init setup_node_to_cpumask_map(void) pr_debug("Node to cpumask map for %d nodes\n", nr_node_ids); } +/* + * There are unfortunately some poorly designed mainboards around that + * only connect memory to a single CPU. This breaks the 1:1 cpu->node + * mapping. 
To avoid this fill in the mapping for all possible CPUs, + * as the number of CPUs is not known yet. We round robin the existing + * nodes. + */ +void __init numa_init_array(void) +{ + int rr, i; + + rr = first_node(node_online_map); + for (i = 0; i < nr_cpu_ids; i++) { + if (early_cpu_to_node(i) != NUMA_NO_NODE) + continue; + numa_set_node(i, rr); + rr = next_node(rr, node_online_map); + if (rr == MAX_NUMNODES) + rr = first_node(node_online_map); + } +} + +static __init int find_near_online_node(int node) +{ + int n, val; + int min_val = INT_MAX; + int best_node = -1; + + for_each_online_node(n) { + val = node_distance(node, n); + + if (val < min_val) { + min_val = val; + best_node = n; + } + } + + return best_node; +} + +/* + * Setup early cpu_to_node. + * + * Populate cpu_to_node[] only if x86_cpu_to_apicid[], + * and apicid_to_node[] tables have valid entries for a CPU. + * This means we skip cpu_to_node[] initialisation for NUMA + * emulation and faking node case (when running a kernel compiled + * for NUMA on a non NUMA box), which is OK as cpu_to_node[] + * is already initialized in a round robin manner at numa_init_array, + * prior to this call, and this initialization is good enough + * for the fake NUMA cases. + * + * Called before the per_cpu areas are setup. + */ +void __init init_cpu_to_node(void) +{ + int cpu; + u16 *cpu_to_apicid = early_per_cpu_ptr(x86_cpu_to_apicid); + + BUG_ON(cpu_to_apicid == NULL); + + for_each_possible_cpu(cpu) { + int node = numa_cpu_node(cpu); + + if (node == NUMA_NO_NODE) + continue; + if (!node_online(node)) + node = find_near_online_node(node); + numa_set_node(cpu, node); + } +} + #ifndef CONFIG_DEBUG_PER_CPU_MAPS # ifndef CONFIG_NUMA_EMU diff --git a/arch/x86/mm/numa_32.c b/arch/x86/mm/numa_32.c index 8d91d227be09..505bb04654b5 100644 --- a/arch/x86/mm/numa_32.c +++ b/arch/x86/mm/numa_32.c @@ -367,6 +367,7 @@ void __init initmem_init(unsigned long start_pfn, unsigned long end_pfn, */ get_memcfg_numa(); + numa_init_array(); kva_pages = roundup(calculate_numa_remap_pages(), PTRS_PER_PTE); diff --git a/arch/x86/mm/numa_64.c b/arch/x86/mm/numa_64.c index 14664f58a759..f548fbf75f44 100644 --- a/arch/x86/mm/numa_64.c +++ b/arch/x86/mm/numa_64.c @@ -224,28 +224,6 @@ setup_node_bootmem(int nodeid, unsigned long start, unsigned long end) node_set_online(nodeid); } -/* - * There are unfortunately some poorly designed mainboards around that - * only connect memory to a single CPU. This breaks the 1:1 cpu->node - * mapping. To avoid this fill in the mapping for all possible CPUs, - * as the number of CPUs is not known yet. We round robin the existing - * nodes. - */ -void __init numa_init_array(void) -{ - int rr, i; - - rr = first_node(node_online_map); - for (i = 0; i < nr_cpu_ids; i++) { - if (early_cpu_to_node(i) != NUMA_NO_NODE) - continue; - numa_set_node(i, rr); - rr = next_node(rr, node_online_map); - if (rr == MAX_NUMNODES) - rr = first_node(node_online_map); - } -} - #ifdef CONFIG_NUMA_EMU /* Numa emulation */ static struct bootnode nodes[MAX_NUMNODES] __initdata; @@ -664,59 +642,6 @@ unsigned long __init numa_free_all_bootmem(void) return pages; } -#ifdef CONFIG_NUMA - -static __init int find_near_online_node(int node) -{ - int n, val; - int min_val = INT_MAX; - int best_node = -1; - - for_each_online_node(n) { - val = node_distance(node, n); - - if (val < min_val) { - min_val = val; - best_node = n; - } - } - - return best_node; -} - -/* - * Setup early cpu_to_node. 
- * - * Populate cpu_to_node[] only if x86_cpu_to_apicid[], - * and apicid_to_node[] tables have valid entries for a CPU. - * This means we skip cpu_to_node[] initialisation for NUMA - * emulation and faking node case (when running a kernel compiled - * for NUMA on a non NUMA box), which is OK as cpu_to_node[] - * is already initialized in a round robin manner at numa_init_array, - * prior to this call, and this initialization is good enough - * for the fake NUMA cases. - * - * Called before the per_cpu areas are setup. - */ -void __init init_cpu_to_node(void) -{ - int cpu; - u16 *cpu_to_apicid = early_per_cpu_ptr(x86_cpu_to_apicid); - - BUG_ON(cpu_to_apicid == NULL); - - for_each_possible_cpu(cpu) { - int node = numa_cpu_node(cpu); - - if (node == NUMA_NO_NODE) - continue; - if (!node_online(node)) - node = find_near_online_node(node); - numa_set_node(cpu, node); - } -} -#endif - int __cpuinit numa_cpu_node(int cpu) { int apicid = early_per_cpu(x86_cpu_to_apicid, cpu); From 4e62445b90ac4ef708bd11c7ae052b1d5ef765b5 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 28 Jan 2011 17:22:48 +0100 Subject: [PATCH 091/706] x86: Fix build failure on X86_UP_APIC Commit 4c321ff8 (x86: Replace cpu_2_logical_apicid[] with early percpu variable) and following changes introduced and used the x86_cpu_to_logical_apicid percpu variable. It was declared and defined inside CONFIG_SMP && CONFIG_X86_32, but if CONFIG_X86_UP_APIC is set, the UP configuration makes use of it and the build fails. Fix it by declaring and defining it inside CONFIG_X86_LOCAL_APIC && CONFIG_X86_32. Signed-off-by: Tejun Heo Reported-by: Ingo Molnar Cc: eric.dumazet@gmail.com Cc: yinghai@kernel.org Cc: brgerst@gmail.com Cc: gorcunov@gmail.com Cc: penberg@kernel.org Cc: shaohui.zheng@intel.com Cc: rientjes@google.com LKML-Reference: <20110128162248.GA25746@htj.dyndns.org> Signed-off-by: Ingo Molnar --- arch/x86/include/asm/smp.h | 2 +- arch/x86/kernel/apic/apic.c | 2 -- 2 files changed, 1 insertion(+), 3 deletions(-) diff --git a/arch/x86/include/asm/smp.h b/arch/x86/include/asm/smp.h index dc7c46a89db7..75927822c5c8 100644 --- a/arch/x86/include/asm/smp.h +++ b/arch/x86/include/asm/smp.h @@ -38,7 +38,7 @@ static inline struct cpumask *cpu_core_mask(int cpu) DECLARE_EARLY_PER_CPU(u16, x86_cpu_to_apicid); DECLARE_EARLY_PER_CPU(u16, x86_bios_cpu_apicid); -#if defined(CONFIG_SMP) && defined(CONFIG_X86_32) +#if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86_32) DECLARE_EARLY_PER_CPU(int, x86_cpu_to_logical_apicid); #endif diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index 4686ea59b7a0..1390cf985afd 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -79,7 +79,6 @@ EXPORT_EARLY_PER_CPU_SYMBOL(x86_bios_cpu_apicid); #ifdef CONFIG_X86_32 -#ifdef CONFIG_SMP /* * On x86_32, the mapping between cpu and logical apicid may vary * depending on apic in use. The following early percpu variable is * used for the mapping. This is where the behaviors of 64 and 32bit * actually diverge. Let's keep it ugly for now. */ DEFINE_EARLY_PER_CPU(int, x86_cpu_to_logical_apicid, BAD_APICID); -#endif /* * Knob to control our willingness to enable the local APIC. From dc82009aac6ee6e423b48de43a251745c62ab012 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Fri, 28 Jan 2011 14:49:19 -0200 Subject: [PATCH 092/706] perf record: No need to check for overwrites Since we open the mmap with (PROT_READ | PROT_WRITE) and signal the kernel with perf_mmap__write_tail() when consuming data, the kernel will not overwrite data we have not yet read, so there is no need to check for overwrites.
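The signalling side mentioned above is perf_mmap__write_tail(); a sketch of its approximate shape at this point in the series (body assumed from context, not part of this patch):

	static inline void perf_mmap__write_tail(struct perf_mmap *md,
						 unsigned long tail)
	{
		struct perf_event_mmap_page *pc = md->base;

		/*
		 * ensure all reads are done before we write the tail out.
		 */
		pc->data_tail = tail;
	}

Updating data_tail tells the kernel how far userspace has consumed; with a PROT_READ|PROT_WRITE mapping the kernel then drops (and accounts as PERF_RECORD_LOST) anything that would overwrite unread data.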
Suggested-by: Peter Zijlstra Cc: Frederic Weisbecker Cc: Ingo Molnar Cc: Mike Galbraith Cc: Paul Mackerras Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Tom Zanussi LKML-Reference: Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-record.c | 22 +++------------------- 1 file changed, 3 insertions(+), 19 deletions(-) diff --git a/tools/perf/builtin-record.c b/tools/perf/builtin-record.c index d7886307f6f4..caf927978d92 100644 --- a/tools/perf/builtin-record.c +++ b/tools/perf/builtin-record.c @@ -115,27 +115,11 @@ static void mmap_read(struct perf_mmap *md) unsigned char *data = md->base + page_size; unsigned long size; void *buf; - int diff; - /* - * If we're further behind than half the buffer, there's a chance - * the writer will bite our tail and mess up the samples under us. - * - * If we somehow ended up ahead of the head, we got messed up. - * - * In either case, truncate and restart at head. - */ - diff = head - old; - if (diff < 0) { - fprintf(stderr, "WARNING: failed to keep up with mmap data\n"); - /* - * head points to a known good entry, start there. - */ - old = head; - } + if (old == head) + return; - if (old != head) - samples++; + samples++; size = head - old; From ef2bf6d043ac9bd4a6f38d862af407154a4754d9 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Sat, 29 Jan 2011 09:04:40 -0200 Subject: [PATCH 093/706] perf events: Account PERF_RECORD_LOST events in event__process Right now this function is only used by perf top, which mmaps with PROT_READ only, i.e. overwrite mode, so no PERF_RECORD_LOST events are generated; but don't forget those events. This routine was moved out of perf top so that it could be used by 'perf probe' in the uprobes patchset, so perhaps there they need to check for LOST events and warn the user, as will be done in the following patches that switch 'perf top' to non overwrite mode (mmap with PROT_READ|PROT_WRITE). Cc: Frederic Weisbecker Cc: Ingo Molnar Cc: Mike Galbraith Cc: Paul Mackerras Cc: Peter Zijlstra Cc: Srikar Dronamraju Cc: Stephane Eranian Cc: Tom Zanussi LKML-Reference: Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/event.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tools/perf/util/event.c b/tools/perf/util/event.c index e4db8b888546..b9d4ac87f9f7 100644 --- a/tools/perf/util/event.c +++ b/tools/perf/util/event.c @@ -646,6 +646,8 @@ int event__process(event_t *event, struct sample_data *sample, case PERF_RECORD_EXIT: event__process_task(event, sample, session); break; + case PERF_RECORD_LOST: + event__process_lost(event, sample, session); default: break; } From 7bb41152b9be7e31f10d8919bce5034135525d9d Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Sat, 29 Jan 2011 09:08:13 -0200 Subject: [PATCH 094/706] perf evlist: Support non overwrite mode in perf_evlist__read_on_cpu I.e. stash the overwrite mode in struct perf_evlist and act accordingly in perf_evlist__read_on_cpu: when not overwriting, skip the overwrite checks and touch the tail after consuming each event, as perf record does.
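A consumer under this scheme reduces to a plain loop; a sketch of the intended usage, with names as used elsewhere in this series:

	event_t *event;

	while ((event = perf_evlist__read_on_cpu(evlist, cpu)) != NULL) {
		/*
		 * Process one event. In non-overwrite mode the helper has
		 * already advanced the tail via perf_mmap__write_tail().
		 */
	}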
Cc: Frederic Weisbecker Cc: Ingo Molnar Cc: Mike Galbraith Cc: Paul Mackerras Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Tom Zanussi LKML-Reference: Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/evlist.c | 35 ++++++++++++++++++++--------------- tools/perf/util/evlist.h | 1 + tools/perf/util/evsel.c | 1 + 3 files changed, 22 insertions(+), 15 deletions(-) diff --git a/tools/perf/util/evlist.c b/tools/perf/util/evlist.c index df0610e9c61b..b498eecbe850 100644 --- a/tools/perf/util/evlist.c +++ b/tools/perf/util/evlist.c @@ -116,24 +116,25 @@ event_t *perf_evlist__read_on_cpu(struct perf_evlist *evlist, int cpu) unsigned int old = md->prev; unsigned char *data = md->base + page_size; event_t *event = NULL; - int diff; - - /* - * If we're further behind than half the buffer, there's a chance - * the writer will bite our tail and mess up the samples under us. - * - * If we somehow ended up ahead of the head, we got messed up. - * - * In either case, truncate and restart at head. - */ - diff = head - old; - if (diff > md->mask / 2 || diff < 0) { - fprintf(stderr, "WARNING: failed to keep up with mmap data.\n"); + if (evlist->overwrite) { /* - * head points to a known good entry, start there. + * If we're further behind than half the buffer, there's a chance + * the writer will bite our tail and mess up the samples under us. + * + * If we somehow ended up ahead of the head, we got messed up. + * + * In either case, truncate and restart at head. */ - old = head; + int diff = head - old; + if (diff > md->mask / 2 || diff < 0) { + fprintf(stderr, "WARNING: failed to keep up with mmap data.\n"); + + /* + * head points to a known good entry, start there. + */ + old = head; + } } if (old != head) { @@ -166,5 +167,9 @@ event_t *perf_evlist__read_on_cpu(struct perf_evlist *evlist, int cpu) } md->prev = old; + + if (!evlist->overwrite) + perf_mmap__write_tail(md, old); + return event; } diff --git a/tools/perf/util/evlist.h b/tools/perf/util/evlist.h index acbe48eac608..2706ae40c8b3 100644 --- a/tools/perf/util/evlist.h +++ b/tools/perf/util/evlist.h @@ -16,6 +16,7 @@ struct perf_evlist { int nr_entries; int nr_fds; int mmap_len; + bool overwrite; event_t event_copy; struct perf_mmap *mmap; struct pollfd *pollfd; diff --git a/tools/perf/util/evsel.c b/tools/perf/util/evsel.c index 76ab553637d6..f98b3e585039 100644 --- a/tools/perf/util/evsel.c +++ b/tools/perf/util/evsel.c @@ -327,6 +327,7 @@ int perf_evlist__mmap(struct perf_evlist *evlist, struct cpu_map *cpus, perf_evlist__alloc_pollfd(evlist, cpus->nr, threads->nr) < 0) return -ENOMEM; + evlist->overwrite = overwrite; evlist->mmap_len = (pages + 1) * page_size; first_evsel = list_entry(evlist->entries.next, struct perf_evsel, node); From 93fc64f14472ae24fd640bf3834a178f59142842 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Sat, 29 Jan 2011 12:08:00 -0200 Subject: [PATCH 095/706] perf top: Switch to non overwrite mode Just like 'perf record'. Warn the user when PERF_RECORD_LOST events happen. 
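The counter behind that warning is filled in by the LOST handler wired up two patches ago; roughly (exact body assumed, not shown in these diffs):

	int event__process_lost(event_t *self, struct sample_data *sample __used,
				struct perf_session *session)
	{
		session->hists.stats.total_lost += self->lost.lost;
		return 0;
	}

print_sym_table() then reports session->hists.stats.total_lost whenever it is non-zero, as the builtin-top.c hunk below shows.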
Cc: Frederic Weisbecker Cc: Ingo Molnar Cc: Mike Galbraith Cc: Paul Mackerras Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Tom Zanussi LKML-Reference: Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-top.c | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/tools/perf/builtin-top.c b/tools/perf/builtin-top.c index ce2e50c891c7..7f92ab7696f7 100644 --- a/tools/perf/builtin-top.c +++ b/tools/perf/builtin-top.c @@ -464,7 +464,7 @@ static void rb_insert_active_sym(struct rb_root *tree, struct sym_entry *se) rb_insert_color(&se->rb_node, tree); } -static void print_sym_table(void) +static void print_sym_table(struct perf_session *session) { int printed = 0, j; struct perf_evsel *counter; @@ -513,7 +513,6 @@ static void print_sym_table(void) puts(CONSOLE_CLEAR); - printf("%-*.*s\n", win_width, win_width, graph_dotted_line); if (!perf_guest) { printf(" PerfTop:%8.0f irqs/sec kernel:%4.1f%%" " exact: %4.1f%% [", @@ -578,6 +577,12 @@ static void print_sym_table(void) printf("%-*.*s\n", win_width, win_width, graph_dotted_line); + if (session->hists.stats.total_lost != 0) { + color_fprintf(stdout, PERF_COLOR_RED, "WARNING:"); + printf(" LOST %" PRIu64 " events, Check IO/CPU overload\n", + session->hists.stats.total_lost); + } + if (sym_filter_entry) { show_details(sym_filter_entry); return; @@ -919,7 +924,7 @@ repeat: getc(stdin); do { - print_sym_table(); + print_sym_table(session); } while (!poll(&stdin_poll, 1, delay_msecs) == 1); c = getc(stdin); @@ -1176,7 +1181,7 @@ try_again: } } - if (perf_evlist__mmap(evlist, cpus, threads, mmap_pages, true) < 0) + if (perf_evlist__mmap(evlist, cpus, threads, mmap_pages, false) < 0) die("failed to mmap with %d (%s)\n", errno, strerror(errno)); } From 8d50e5b4171a69cf48ca94a1e7c14033d0b4771d Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Sat, 29 Jan 2011 13:02:00 -0200 Subject: [PATCH 096/706] perf tools: Rename 'struct sample_data' to 'struct perf_sample' Making the namespace more uniform. 
Cc: Frederic Weisbecker Cc: Ingo Molnar Cc: Mike Galbraith Cc: Paul Mackerras Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Tom Zanussi LKML-Reference: Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-annotate.c | 2 +- tools/perf/builtin-diff.c | 2 +- tools/perf/builtin-inject.c | 8 ++++---- tools/perf/builtin-kmem.c | 2 +- tools/perf/builtin-lock.c | 2 +- tools/perf/builtin-record.c | 2 +- tools/perf/builtin-report.c | 36 +++++++++++++++++----------------- tools/perf/builtin-sched.c | 2 +- tools/perf/builtin-script.c | 2 +- tools/perf/builtin-test.c | 2 +- tools/perf/builtin-timechart.c | 8 ++++---- tools/perf/builtin-top.c | 4 ++-- tools/perf/util/build-id.c | 4 ++-- tools/perf/util/event.c | 16 +++++++-------- tools/perf/util/event.h | 18 ++++++++--------- tools/perf/util/evsel.c | 4 ++-- tools/perf/util/session.c | 26 ++++++++++++------------ tools/perf/util/session.h | 4 ++-- 18 files changed, 72 insertions(+), 72 deletions(-) diff --git a/tools/perf/builtin-annotate.c b/tools/perf/builtin-annotate.c index 8879463807e4..ef3675135414 100644 --- a/tools/perf/builtin-annotate.c +++ b/tools/perf/builtin-annotate.c @@ -58,7 +58,7 @@ static int hists__add_entry(struct hists *self, struct addr_location *al) return hist_entry__inc_addr_samples(he, al->addr); } -static int process_sample_event(event_t *event, struct sample_data *sample, +static int process_sample_event(event_t *event, struct perf_sample *sample, struct perf_session *session) { struct addr_location al; diff --git a/tools/perf/builtin-diff.c b/tools/perf/builtin-diff.c index 3153e492dbcc..0822149dc768 100644 --- a/tools/perf/builtin-diff.c +++ b/tools/perf/builtin-diff.c @@ -31,7 +31,7 @@ static int hists__add_entry(struct hists *self, } static int diff__process_sample_event(event_t *event, - struct sample_data *sample, + struct perf_sample *sample, struct perf_session *session) { struct addr_location al; diff --git a/tools/perf/builtin-inject.c b/tools/perf/builtin-inject.c index 0c78ffa7bf67..4c9388ce878c 100644 --- a/tools/perf/builtin-inject.c +++ b/tools/perf/builtin-inject.c @@ -36,13 +36,13 @@ static int event__repipe_synth(event_t *event, return 0; } -static int event__repipe(event_t *event, struct sample_data *sample __used, +static int event__repipe(event_t *event, struct perf_sample *sample __used, struct perf_session *session) { return event__repipe_synth(event, session); } -static int event__repipe_mmap(event_t *self, struct sample_data *sample, +static int event__repipe_mmap(event_t *self, struct perf_sample *sample, struct perf_session *session) { int err; @@ -53,7 +53,7 @@ static int event__repipe_mmap(event_t *self, struct sample_data *sample, return err; } -static int event__repipe_task(event_t *self, struct sample_data *sample, +static int event__repipe_task(event_t *self, struct perf_sample *sample, struct perf_session *session) { int err; @@ -119,7 +119,7 @@ static int dso__inject_build_id(struct dso *self, struct perf_session *session) return 0; } -static int event__inject_buildid(event_t *event, struct sample_data *sample, +static int event__inject_buildid(event_t *event, struct perf_sample *sample, struct perf_session *session) { struct addr_location al; diff --git a/tools/perf/builtin-kmem.c b/tools/perf/builtin-kmem.c index d97256d65980..3c1cdcf29902 100644 --- a/tools/perf/builtin-kmem.c +++ b/tools/perf/builtin-kmem.c @@ -304,7 +304,7 @@ process_raw_event(event_t *raw_event __used, void *data, } } -static int process_sample_event(event_t *event, struct sample_data *sample, +static int 
process_sample_event(event_t *event, struct perf_sample *sample, struct perf_session *session) { struct thread *thread = perf_session__findnew(session, event->ip.pid); diff --git a/tools/perf/builtin-lock.c b/tools/perf/builtin-lock.c index 2b36defc5d73..c3f512791344 100644 --- a/tools/perf/builtin-lock.c +++ b/tools/perf/builtin-lock.c @@ -834,7 +834,7 @@ static void dump_info(void) die("Unknown type of information\n"); } -static int process_sample_event(event_t *self, struct sample_data *sample, +static int process_sample_event(event_t *self, struct perf_sample *sample, struct perf_session *s) { struct thread *thread = perf_session__findnew(s, sample->tid); diff --git a/tools/perf/builtin-record.c b/tools/perf/builtin-record.c index caf927978d92..5d3e4b32072b 100644 --- a/tools/perf/builtin-record.c +++ b/tools/perf/builtin-record.c @@ -101,7 +101,7 @@ static void write_output(void *buf, size_t size) } static int process_synthesized_event(event_t *event, - struct sample_data *sample __used, + struct perf_sample *sample __used, struct perf_session *self __used) { write_output(event, event->header.size); diff --git a/tools/perf/builtin-report.c b/tools/perf/builtin-report.c index f6a43493d1d0..bbbadcc04097 100644 --- a/tools/perf/builtin-report.c +++ b/tools/perf/builtin-report.c @@ -77,9 +77,9 @@ static struct hists *perf_session__hists_findnew(struct perf_session *self, return new; } -static int perf_session__add_hist_entry(struct perf_session *self, +static int perf_session__add_hist_entry(struct perf_session *session, struct addr_location *al, - struct sample_data *data) + struct perf_sample *sample) { struct symbol *parent = NULL; int err = 0; @@ -87,28 +87,28 @@ static int perf_session__add_hist_entry(struct perf_session *self, struct hists *hists; struct perf_event_attr *attr; - if ((sort__has_parent || symbol_conf.use_callchain) && data->callchain) { - err = perf_session__resolve_callchain(self, al->thread, - data->callchain, &parent); + if ((sort__has_parent || symbol_conf.use_callchain) && sample->callchain) { + err = perf_session__resolve_callchain(session, al->thread, + sample->callchain, &parent); if (err) return err; } - attr = perf_header__find_attr(data->id, &self->header); + attr = perf_header__find_attr(sample->id, &session->header); if (attr) - hists = perf_session__hists_findnew(self, data->id, attr->type, attr->config); + hists = perf_session__hists_findnew(session, sample->id, attr->type, attr->config); else - hists = perf_session__hists_findnew(self, data->id, 0, 0); + hists = perf_session__hists_findnew(session, sample->id, 0, 0); if (hists == NULL) return -ENOMEM; - he = __hists__add_entry(hists, al, parent, data->period); + he = __hists__add_entry(hists, al, parent, sample->period); if (he == NULL) return -ENOMEM; if (symbol_conf.use_callchain) { - err = callchain_append(he->callchain, &self->callchain_cursor, - data->period); + err = callchain_append(he->callchain, &session->callchain_cursor, + sample->period); if (err) return err; } @@ -124,32 +124,32 @@ static int perf_session__add_hist_entry(struct perf_session *self, } static int add_event_total(struct perf_session *session, - struct sample_data *data, + struct perf_sample *sample, struct perf_event_attr *attr) { struct hists *hists; if (attr) - hists = perf_session__hists_findnew(session, data->id, + hists = perf_session__hists_findnew(session, sample->id, attr->type, attr->config); else - hists = perf_session__hists_findnew(session, data->id, 0, 0); + hists = perf_session__hists_findnew(session, sample->id, 
0, 0); if (!hists) return -ENOMEM; - hists->stats.total_period += data->period; + hists->stats.total_period += sample->period; /* * FIXME: add_event_total should be moved from here to * perf_session__process_event so that the proper hist is passed to * the event_op methods. */ hists__inc_nr_events(hists, PERF_RECORD_SAMPLE); - session->hists.stats.total_period += data->period; + session->hists.stats.total_period += sample->period; return 0; } -static int process_sample_event(event_t *event, struct sample_data *sample, +static int process_sample_event(event_t *event, struct perf_sample *sample, struct perf_session *session) { struct addr_location al; @@ -179,7 +179,7 @@ static int process_sample_event(event_t *event, struct sample_data *sample, return 0; } -static int process_read_event(event_t *event, struct sample_data *sample __used, +static int process_read_event(event_t *event, struct perf_sample *sample __used, struct perf_session *session __used) { struct perf_event_attr *attr; diff --git a/tools/perf/builtin-sched.c b/tools/perf/builtin-sched.c index 29acb894e035..ff993c8b175d 100644 --- a/tools/perf/builtin-sched.c +++ b/tools/perf/builtin-sched.c @@ -1607,7 +1607,7 @@ process_raw_event(event_t *raw_event __used, struct perf_session *session, process_sched_migrate_task_event(data, session, event, cpu, timestamp, thread); } -static int process_sample_event(event_t *event, struct sample_data *sample, +static int process_sample_event(event_t *event, struct perf_sample *sample, struct perf_session *session) { struct thread *thread; diff --git a/tools/perf/builtin-script.c b/tools/perf/builtin-script.c index b766c2a9ac97..5c4c809008af 100644 --- a/tools/perf/builtin-script.c +++ b/tools/perf/builtin-script.c @@ -63,7 +63,7 @@ static int cleanup_scripting(void) static char const *input_name = "perf.data"; -static int process_sample_event(event_t *event, struct sample_data *sample, +static int process_sample_event(event_t *event, struct perf_sample *sample, struct perf_session *session) { struct thread *thread = perf_session__findnew(session, event->ip.pid); diff --git a/tools/perf/builtin-test.c b/tools/perf/builtin-test.c index 738830d298e8..df62433a34a7 100644 --- a/tools/perf/builtin-test.c +++ b/tools/perf/builtin-test.c @@ -550,7 +550,7 @@ static int test__basic_mmap(void) } while ((event = perf_evlist__read_on_cpu(evlist, 0)) != NULL) { - struct sample_data sample; + struct perf_sample sample; if (event->header.type != PERF_RECORD_SAMPLE) { pr_debug("unexpected %s event\n", diff --git a/tools/perf/builtin-timechart.c b/tools/perf/builtin-timechart.c index 746cf03cb05d..01cf0c3771a6 100644 --- a/tools/perf/builtin-timechart.c +++ b/tools/perf/builtin-timechart.c @@ -276,21 +276,21 @@ static int cpus_cstate_state[MAX_CPUS]; static u64 cpus_pstate_start_times[MAX_CPUS]; static u64 cpus_pstate_state[MAX_CPUS]; -static int process_comm_event(event_t *event, struct sample_data *sample __used, +static int process_comm_event(event_t *event, struct perf_sample *sample __used, struct perf_session *session __used) { pid_set_comm(event->comm.tid, event->comm.comm); return 0; } -static int process_fork_event(event_t *event, struct sample_data *sample __used, +static int process_fork_event(event_t *event, struct perf_sample *sample __used, struct perf_session *session __used) { pid_fork(event->fork.pid, event->fork.ppid, event->fork.time); return 0; } -static int process_exit_event(event_t *event, struct sample_data *sample __used, +static int process_exit_event(event_t *event, struct 
perf_sample *sample __used, struct perf_session *session __used) { pid_exit(event->fork.pid, event->fork.time); @@ -487,7 +487,7 @@ static void sched_switch(int cpu, u64 timestamp, struct trace_entry *te) static int process_sample_event(event_t *event __used, - struct sample_data *sample, + struct perf_sample *sample, struct perf_session *session) { struct trace_entry *te; diff --git a/tools/perf/builtin-top.c b/tools/perf/builtin-top.c index 7f92ab7696f7..d923127b41b6 100644 --- a/tools/perf/builtin-top.c +++ b/tools/perf/builtin-top.c @@ -997,7 +997,7 @@ static int symbol_filter(struct map *map, struct symbol *sym) } static void event__process_sample(const event_t *self, - struct sample_data *sample, + struct perf_sample *sample, struct perf_session *session) { u64 ip = self->ip.ip; @@ -1107,7 +1107,7 @@ static void event__process_sample(const event_t *self, static void perf_session__mmap_read_cpu(struct perf_session *self, int cpu) { - struct sample_data sample; + struct perf_sample sample; event_t *event; while ((event = perf_evlist__read_on_cpu(evsel_list, cpu)) != NULL) { diff --git a/tools/perf/util/build-id.c b/tools/perf/util/build-id.c index deffb8c96071..b184a7fa0843 100644 --- a/tools/perf/util/build-id.c +++ b/tools/perf/util/build-id.c @@ -15,7 +15,7 @@ #include "debug.h" static int build_id__mark_dso_hit(event_t *event, - struct sample_data *sample __used, + struct perf_sample *sample __used, struct perf_session *session) { struct addr_location al; @@ -37,7 +37,7 @@ static int build_id__mark_dso_hit(event_t *event, return 0; } -static int event__exit_del_thread(event_t *self, struct sample_data *sample __used, +static int event__exit_del_thread(event_t *self, struct perf_sample *sample __used, struct perf_session *session) { struct thread *thread = perf_session__findnew(session, self->fork.tid); diff --git a/tools/perf/util/event.c b/tools/perf/util/event.c index b9d4ac87f9f7..5c886fbd50ce 100644 --- a/tools/perf/util/event.c +++ b/tools/perf/util/event.c @@ -34,7 +34,7 @@ const char *event__get_event_name(unsigned int id) return event__name[id]; } -static struct sample_data synth_sample = { +static struct perf_sample synth_sample = { .pid = -1, .tid = -1, .time = -1, @@ -440,7 +440,7 @@ static int thread__set_comm_adjust(struct thread *self, const char *comm, return 0; } -int event__process_comm(event_t *self, struct sample_data *sample __used, +int event__process_comm(event_t *self, struct perf_sample *sample __used, struct perf_session *session) { struct thread *thread = perf_session__findnew(session, self->comm.tid); @@ -456,7 +456,7 @@ int event__process_comm(event_t *self, struct sample_data *sample __used, return 0; } -int event__process_lost(event_t *self, struct sample_data *sample __used, +int event__process_lost(event_t *self, struct perf_sample *sample __used, struct perf_session *session) { dump_printf(": id:%" PRIu64 ": lost:%" PRIu64 "\n", @@ -567,7 +567,7 @@ out_problem: return -1; } -int event__process_mmap(event_t *self, struct sample_data *sample __used, +int event__process_mmap(event_t *self, struct perf_sample *sample __used, struct perf_session *session) { struct machine *machine; @@ -609,7 +609,7 @@ out_problem: return 0; } -int event__process_task(event_t *self, struct sample_data *sample __used, +int event__process_task(event_t *self, struct perf_sample *sample __used, struct perf_session *session) { struct thread *thread = perf_session__findnew(session, self->fork.tid); @@ -632,7 +632,7 @@ int event__process_task(event_t *self, struct sample_data 
*sample __used, return 0; } -int event__process(event_t *event, struct sample_data *sample, +int event__process(event_t *event, struct perf_sample *sample, struct perf_session *session) { switch (event->header.type) { @@ -757,7 +757,7 @@ static void dso__calc_col_width(struct dso *self, struct hists *hists) } int event__preprocess_sample(const event_t *self, struct perf_session *session, - struct addr_location *al, struct sample_data *data, + struct addr_location *al, struct perf_sample *sample, symbol_filter_t filter) { u8 cpumode = self->header.misc & PERF_RECORD_MISC_CPUMODE_MASK; @@ -788,7 +788,7 @@ int event__preprocess_sample(const event_t *self, struct perf_session *session, al->map ? al->map->dso->long_name : al->level == 'H' ? "[hypervisor]" : ""); al->sym = NULL; - al->cpu = data->cpu; + al->cpu = sample->cpu; if (al->map) { if (symbol_conf.dso_list && diff --git a/tools/perf/util/event.h b/tools/perf/util/event.h index d79e4edd82f9..84fd71ffd619 100644 --- a/tools/perf/util/event.h +++ b/tools/perf/util/event.h @@ -61,7 +61,7 @@ struct sample_event { u64 array[]; }; -struct sample_data { +struct perf_sample { u64 ip; u32 pid, tid; u64 time; @@ -138,7 +138,7 @@ struct perf_session; typedef int (*event__handler_synth_t)(event_t *event, struct perf_session *session); -typedef int (*event__handler_t)(event_t *event, struct sample_data *sample, +typedef int (*event__handler_t)(event_t *event, struct perf_sample *sample, struct perf_session *session); int event__synthesize_thread(pid_t pid, event__handler_t process, @@ -154,25 +154,25 @@ int event__synthesize_modules(event__handler_t process, struct perf_session *session, struct machine *machine); -int event__process_comm(event_t *self, struct sample_data *sample, +int event__process_comm(event_t *event, struct perf_sample *sample, struct perf_session *session); -int event__process_lost(event_t *self, struct sample_data *sample, +int event__process_lost(event_t *event, struct perf_sample *sample, struct perf_session *session); -int event__process_mmap(event_t *self, struct sample_data *sample, +int event__process_mmap(event_t *event, struct perf_sample *sample, struct perf_session *session); -int event__process_task(event_t *self, struct sample_data *sample, +int event__process_task(event_t *event, struct perf_sample *sample, struct perf_session *session); -int event__process(event_t *event, struct sample_data *sample, +int event__process(event_t *event, struct perf_sample *sample, struct perf_session *session); struct addr_location; int event__preprocess_sample(const event_t *self, struct perf_session *session, - struct addr_location *al, struct sample_data *data, + struct addr_location *al, struct perf_sample *sample, symbol_filter_t filter); const char *event__get_event_name(unsigned int id); int event__parse_sample(const event_t *event, u64 type, bool sample_id_all, - struct sample_data *sample); + struct perf_sample *sample); #endif /* __PERF_RECORD_H */ diff --git a/tools/perf/util/evsel.c b/tools/perf/util/evsel.c index f98b3e585039..a13488511880 100644 --- a/tools/perf/util/evsel.c +++ b/tools/perf/util/evsel.c @@ -368,7 +368,7 @@ out_unmap: } static int event__parse_id_sample(const event_t *event, u64 type, - struct sample_data *sample) + struct perf_sample *sample) { const u64 *array = event->sample.array; @@ -406,7 +406,7 @@ static int event__parse_id_sample(const event_t *event, u64 type, } int event__parse_sample(const event_t *event, u64 type, bool sample_id_all, - struct sample_data *data) + struct perf_sample *data) { 
const u64 *array; diff --git a/tools/perf/util/session.c b/tools/perf/util/session.c index e6a07408669e..ee0b61102571 100644 --- a/tools/perf/util/session.c +++ b/tools/perf/util/session.c @@ -67,7 +67,7 @@ out_close: static void perf_session__id_header_size(struct perf_session *session) { - struct sample_data *data; + struct perf_sample *data; u64 sample_type = session->sample_type; u16 size = 0; @@ -299,7 +299,7 @@ static int process_event_synth_stub(event_t *event __used, } static int process_event_stub(event_t *event __used, - struct sample_data *sample __used, + struct perf_sample *sample __used, struct perf_session *session __used) { dump_printf(": unhandled!\n"); @@ -475,7 +475,7 @@ static void perf_session_free_sample_buffers(struct perf_session *session) static int perf_session_deliver_event(struct perf_session *session, event_t *event, - struct sample_data *sample, + struct perf_sample *sample, struct perf_event_ops *ops, u64 file_offset); @@ -485,7 +485,7 @@ static void flush_sample_queue(struct perf_session *s, struct ordered_samples *os = &s->ordered_samples; struct list_head *head = &os->samples; struct sample_queue *tmp, *iter; - struct sample_data sample; + struct perf_sample sample; u64 limit = os->next_flush; u64 last_ts = os->last_sample ? os->last_sample->timestamp : 0ULL; @@ -610,11 +610,11 @@ static void __queue_event(struct sample_queue *new, struct perf_session *s) #define MAX_SAMPLE_BUFFER (64 * 1024 / sizeof(struct sample_queue)) static int perf_session_queue_event(struct perf_session *s, event_t *event, - struct sample_data *data, u64 file_offset) + struct perf_sample *sample, u64 file_offset) { struct ordered_samples *os = &s->ordered_samples; struct list_head *sc = &os->sample_cache; - u64 timestamp = data->time; + u64 timestamp = sample->time; struct sample_queue *new; if (!timestamp || timestamp == ~0ULL) @@ -650,7 +650,7 @@ static int perf_session_queue_event(struct perf_session *s, event_t *event, return 0; } -static void callchain__printf(struct sample_data *sample) +static void callchain__printf(struct perf_sample *sample) { unsigned int i; @@ -663,7 +663,7 @@ static void callchain__printf(struct sample_data *sample) static void perf_session__print_tstamp(struct perf_session *session, event_t *event, - struct sample_data *sample) + struct perf_sample *sample) { if (event->header.type != PERF_RECORD_SAMPLE && !session->sample_id_all) { @@ -679,7 +679,7 @@ static void perf_session__print_tstamp(struct perf_session *session, } static void dump_event(struct perf_session *session, event_t *event, - u64 file_offset, struct sample_data *sample) + u64 file_offset, struct perf_sample *sample) { if (!dump_trace) return; @@ -697,7 +697,7 @@ static void dump_event(struct perf_session *session, event_t *event, } static void dump_sample(struct perf_session *session, event_t *event, - struct sample_data *sample) + struct perf_sample *sample) { if (!dump_trace) return; @@ -712,7 +712,7 @@ static void dump_sample(struct perf_session *session, event_t *event, static int perf_session_deliver_event(struct perf_session *session, event_t *event, - struct sample_data *sample, + struct perf_sample *sample, struct perf_event_ops *ops, u64 file_offset) { @@ -745,7 +745,7 @@ static int perf_session_deliver_event(struct perf_session *session, } static int perf_session__preprocess_sample(struct perf_session *session, - event_t *event, struct sample_data *sample) + event_t *event, struct perf_sample *sample) { if (event->header.type != PERF_RECORD_SAMPLE || !(session->sample_type & 
PERF_SAMPLE_CALLCHAIN)) @@ -789,7 +789,7 @@ static int perf_session__process_event(struct perf_session *session, struct perf_event_ops *ops, u64 file_offset) { - struct sample_data sample; + struct perf_sample sample; int ret; if (session->header.needs_swap && event__swap_ops[event->header.type]) diff --git a/tools/perf/util/session.h b/tools/perf/util/session.h index 78239767011e..365bf533a396 100644 --- a/tools/perf/util/session.h +++ b/tools/perf/util/session.h @@ -57,7 +57,7 @@ struct perf_session { struct perf_event_ops; -typedef int (*event_op)(event_t *self, struct sample_data *sample, +typedef int (*event_op)(event_t *self, struct perf_sample *sample, struct perf_session *session); typedef int (*event_synth_op)(event_t *self, struct perf_session *session); typedef int (*event_op2)(event_t *self, struct perf_session *session, @@ -158,7 +158,7 @@ size_t perf_session__fprintf_nr_events(struct perf_session *self, FILE *fp) static inline int perf_session__parse_sample(struct perf_session *session, const event_t *event, - struct sample_data *sample) + struct perf_sample *sample) { return event__parse_sample(event, session->sample_type, session->sample_id_all, sample); From 8115d60c323dd9931b95221c0a392aeddc1d6ef3 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Sat, 29 Jan 2011 14:01:45 -0200 Subject: [PATCH 097/706] perf tools: Kill event_t typedef, use 'union perf_event' instead And move the event_t methods to the perf_event__ too. No code changes, just namespace consistency. Cc: Frederic Weisbecker Cc: Ingo Molnar Cc: Mike Galbraith Cc: Paul Mackerras Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Tom Zanussi LKML-Reference: Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-annotate.c | 11 +- tools/perf/builtin-diff.c | 14 +-- tools/perf/builtin-inject.c | 82 ++++++++------- tools/perf/builtin-kmem.c | 10 +- tools/perf/builtin-lock.c | 6 +- tools/perf/builtin-record.c | 56 +++++----- tools/perf/builtin-report.c | 30 +++--- tools/perf/builtin-sched.c | 15 +-- tools/perf/builtin-script.c | 17 +-- tools/perf/builtin-test.c | 6 +- tools/perf/builtin-timechart.c | 11 +- tools/perf/builtin-top.c | 33 +++--- tools/perf/util/build-id.c | 19 ++-- tools/perf/util/callchain.c | 3 +- tools/perf/util/callchain.h | 4 +- tools/perf/util/debug.c | 2 +- tools/perf/util/debug.h | 2 +- tools/perf/util/event.c | 187 +++++++++++++++++---------------- tools/perf/util/event.h | 69 ++++++------ tools/perf/util/evlist.c | 6 +- tools/perf/util/evlist.h | 4 +- tools/perf/util/evsel.c | 10 +- tools/perf/util/header.c | 83 ++++++++------- tools/perf/util/header.h | 50 ++++----- tools/perf/util/hist.c | 2 +- tools/perf/util/session.c | 165 ++++++++++++++--------------- tools/perf/util/session.h | 13 +-- 27 files changed, 473 insertions(+), 437 deletions(-) diff --git a/tools/perf/builtin-annotate.c b/tools/perf/builtin-annotate.c index ef3675135414..70067862e07f 100644 --- a/tools/perf/builtin-annotate.c +++ b/tools/perf/builtin-annotate.c @@ -58,12 +58,13 @@ static int hists__add_entry(struct hists *self, struct addr_location *al) return hist_entry__inc_addr_samples(he, al->addr); } -static int process_sample_event(event_t *event, struct perf_sample *sample, +static int process_sample_event(union perf_event *event, + struct perf_sample *sample, struct perf_session *session) { struct addr_location al; - if (event__preprocess_sample(event, session, &al, sample, NULL) < 0) { + if (perf_event__preprocess_sample(event, session, &al, sample, NULL) < 0) { pr_warning("problem processing %d event, 
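For orientation, the typedef being killed wraps every record layout in one union; before this patch event.h carried roughly (member list abridged and assumed):

	typedef union event_union {
		struct perf_event_header	header;
		struct ip_event			ip;
		struct mmap_event		mmap;
		struct comm_event		comm;
		struct fork_event		fork;
		struct lost_event		lost;
		struct read_event		read;
		struct sample_event		sample;
	} event_t;

and afterwards the same members live in a plain 'union perf_event', so call sites spell the type out instead of hiding it behind a typedef.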
skipping it.\n", event->header.type); return -1; @@ -372,9 +373,9 @@ find_next: static struct perf_event_ops event_ops = { .sample = process_sample_event, - .mmap = event__process_mmap, - .comm = event__process_comm, - .fork = event__process_task, + .mmap = perf_event__process_mmap, + .comm = perf_event__process_comm, + .fork = perf_event__process_task, .ordered_samples = true, .ordering_requires_timestamps = true, }; diff --git a/tools/perf/builtin-diff.c b/tools/perf/builtin-diff.c index 0822149dc768..6b7d91160ecb 100644 --- a/tools/perf/builtin-diff.c +++ b/tools/perf/builtin-diff.c @@ -30,13 +30,13 @@ static int hists__add_entry(struct hists *self, return -ENOMEM; } -static int diff__process_sample_event(event_t *event, +static int diff__process_sample_event(union perf_event *event, struct perf_sample *sample, struct perf_session *session) { struct addr_location al; - if (event__preprocess_sample(event, session, &al, sample, NULL) < 0) { + if (perf_event__preprocess_sample(event, session, &al, sample, NULL) < 0) { pr_warning("problem processing %d event, skipping it.\n", event->header.type); return -1; @@ -56,11 +56,11 @@ static int diff__process_sample_event(event_t *event, static struct perf_event_ops event_ops = { .sample = diff__process_sample_event, - .mmap = event__process_mmap, - .comm = event__process_comm, - .exit = event__process_task, - .fork = event__process_task, - .lost = event__process_lost, + .mmap = perf_event__process_mmap, + .comm = perf_event__process_comm, + .exit = perf_event__process_task, + .fork = perf_event__process_task, + .lost = perf_event__process_lost, .ordered_samples = true, .ordering_requires_timestamps = true, }; diff --git a/tools/perf/builtin-inject.c b/tools/perf/builtin-inject.c index 4c9388ce878c..e29f04ed3396 100644 --- a/tools/perf/builtin-inject.c +++ b/tools/perf/builtin-inject.c @@ -16,8 +16,8 @@ static char const *input_name = "-"; static bool inject_build_ids; -static int event__repipe_synth(event_t *event, - struct perf_session *session __used) +static int perf_event__repipe_synth(union perf_event *event, + struct perf_session *session __used) { uint32_t size; void *buf = event; @@ -36,41 +36,44 @@ static int event__repipe_synth(event_t *event, return 0; } -static int event__repipe(event_t *event, struct perf_sample *sample __used, - struct perf_session *session) +static int perf_event__repipe(union perf_event *event, + struct perf_sample *sample __used, + struct perf_session *session) { - return event__repipe_synth(event, session); + return perf_event__repipe_synth(event, session); } -static int event__repipe_mmap(event_t *self, struct perf_sample *sample, - struct perf_session *session) +static int perf_event__repipe_mmap(union perf_event *event, + struct perf_sample *sample, + struct perf_session *session) { int err; - err = event__process_mmap(self, sample, session); - event__repipe(self, sample, session); + err = perf_event__process_mmap(event, sample, session); + perf_event__repipe(event, sample, session); return err; } -static int event__repipe_task(event_t *self, struct perf_sample *sample, - struct perf_session *session) +static int perf_event__repipe_task(union perf_event *event, + struct perf_sample *sample, + struct perf_session *session) { int err; - err = event__process_task(self, sample, session); - event__repipe(self, sample, session); + err = perf_event__process_task(event, sample, session); + perf_event__repipe(event, sample, session); return err; } -static int event__repipe_tracing_data(event_t *self, - struct 
perf_session *session) +static int perf_event__repipe_tracing_data(union perf_event *event, + struct perf_session *session) { int err; - event__repipe_synth(self, session); - err = event__process_tracing_data(self, session); + perf_event__repipe_synth(event, session); + err = perf_event__process_tracing_data(event, session); return err; } @@ -109,8 +112,8 @@ static int dso__inject_build_id(struct dso *self, struct perf_session *session) if (self->kernel) misc = PERF_RECORD_MISC_KERNEL; - err = event__synthesize_build_id(self, misc, event__repipe, - machine, session); + err = perf_event__synthesize_build_id(self, misc, perf_event__repipe, + machine, session); if (err) { pr_err("Can't synthesize build_id event for %s\n", self->long_name); return -1; @@ -119,8 +122,9 @@ static int dso__inject_build_id(struct dso *self, struct perf_session *session) return 0; } -static int event__inject_buildid(event_t *event, struct perf_sample *sample, - struct perf_session *session) +static int perf_event__inject_buildid(union perf_event *event, + struct perf_sample *sample, + struct perf_session *session) { struct addr_location al; struct thread *thread; @@ -155,24 +159,24 @@ static int event__inject_buildid(event_t *event, struct perf_sample *sample, } repipe: - event__repipe(event, sample, session); + perf_event__repipe(event, sample, session); return 0; } struct perf_event_ops inject_ops = { - .sample = event__repipe, - .mmap = event__repipe, - .comm = event__repipe, - .fork = event__repipe, - .exit = event__repipe, - .lost = event__repipe, - .read = event__repipe, - .throttle = event__repipe, - .unthrottle = event__repipe, - .attr = event__repipe_synth, - .event_type = event__repipe_synth, - .tracing_data = event__repipe_synth, - .build_id = event__repipe_synth, + .sample = perf_event__repipe, + .mmap = perf_event__repipe, + .comm = perf_event__repipe, + .fork = perf_event__repipe, + .exit = perf_event__repipe, + .lost = perf_event__repipe, + .read = perf_event__repipe, + .throttle = perf_event__repipe, + .unthrottle = perf_event__repipe, + .attr = perf_event__repipe_synth, + .event_type = perf_event__repipe_synth, + .tracing_data = perf_event__repipe_synth, + .build_id = perf_event__repipe_synth, }; extern volatile int session_done; @@ -190,10 +194,10 @@ static int __cmd_inject(void) signal(SIGINT, sig_handler); if (inject_build_ids) { - inject_ops.sample = event__inject_buildid; - inject_ops.mmap = event__repipe_mmap; - inject_ops.fork = event__repipe_task; - inject_ops.tracing_data = event__repipe_tracing_data; + inject_ops.sample = perf_event__inject_buildid; + inject_ops.mmap = perf_event__repipe_mmap; + inject_ops.fork = perf_event__repipe_task; + inject_ops.tracing_data = perf_event__repipe_tracing_data; } session = perf_session__new(input_name, O_RDONLY, false, true, &inject_ops); diff --git a/tools/perf/builtin-kmem.c b/tools/perf/builtin-kmem.c index 3c1cdcf29902..7f618f4e7b79 100644 --- a/tools/perf/builtin-kmem.c +++ b/tools/perf/builtin-kmem.c @@ -275,9 +275,8 @@ static void process_free_event(void *data, s_alloc->alloc_cpu = -1; } -static void -process_raw_event(event_t *raw_event __used, void *data, - int cpu, u64 timestamp, struct thread *thread) +static void process_raw_event(union perf_event *raw_event __used, void *data, + int cpu, u64 timestamp, struct thread *thread) { struct event *event; int type; @@ -304,7 +303,8 @@ process_raw_event(event_t *raw_event __used, void *data, } } -static int process_sample_event(event_t *event, struct perf_sample *sample, +static int 
process_sample_event(union perf_event *event, + struct perf_sample *sample, struct perf_session *session) { struct thread *thread = perf_session__findnew(session, event->ip.pid); @@ -325,7 +325,7 @@ static int process_sample_event(event_t *event, struct perf_sample *sample, static struct perf_event_ops event_ops = { .sample = process_sample_event, - .comm = event__process_comm, + .comm = perf_event__process_comm, .ordered_samples = true, }; diff --git a/tools/perf/builtin-lock.c b/tools/perf/builtin-lock.c index c3f512791344..e00d93847c44 100644 --- a/tools/perf/builtin-lock.c +++ b/tools/perf/builtin-lock.c @@ -834,14 +834,14 @@ static void dump_info(void) die("Unknown type of information\n"); } -static int process_sample_event(event_t *self, struct perf_sample *sample, +static int process_sample_event(union perf_event *event, struct perf_sample *sample, struct perf_session *s) { struct thread *thread = perf_session__findnew(s, sample->tid); if (thread == NULL) { pr_debug("problem processing %d event, skipping it.\n", - self->header.type); + event->header.type); return -1; } @@ -852,7 +852,7 @@ static int process_sample_event(event_t *self, struct perf_sample *sample, static struct perf_event_ops eops = { .sample = process_sample_event, - .comm = event__process_comm, + .comm = perf_event__process_comm, .ordered_samples = true, }; diff --git a/tools/perf/builtin-record.c b/tools/perf/builtin-record.c index 5d3e4b32072b..edc3555098c8 100644 --- a/tools/perf/builtin-record.c +++ b/tools/perf/builtin-record.c @@ -100,7 +100,7 @@ static void write_output(void *buf, size_t size) } } -static int process_synthesized_event(event_t *event, +static int process_synthesized_event(union perf_event *event, struct perf_sample *sample __used, struct perf_session *self __used) { @@ -404,7 +404,7 @@ static void atexit_header(void) } } -static void event__synthesize_guest_os(struct machine *machine, void *data) +static void perf_event__synthesize_guest_os(struct machine *machine, void *data) { int err; struct perf_session *psession = data; @@ -420,8 +420,8 @@ static void event__synthesize_guest_os(struct machine *machine, void *data) *method is used to avoid symbol missing when the first addr is *in module instead of in guest kernel. */ - err = event__synthesize_modules(process_synthesized_event, - psession, machine); + err = perf_event__synthesize_modules(process_synthesized_event, + psession, machine); if (err < 0) pr_err("Couldn't record guest kernel [%d]'s reference" " relocation symbol.\n", machine->pid); @@ -430,11 +430,12 @@ static void event__synthesize_guest_os(struct machine *machine, void *data) * We use _stext for guest kernel because guest kernel's /proc/kallsyms * have no _text sometimes. 
*/ - err = event__synthesize_kernel_mmap(process_synthesized_event, - psession, machine, "_text"); + err = perf_event__synthesize_kernel_mmap(process_synthesized_event, + psession, machine, "_text"); if (err < 0) - err = event__synthesize_kernel_mmap(process_synthesized_event, - psession, machine, "_stext"); + err = perf_event__synthesize_kernel_mmap(process_synthesized_event, + psession, machine, + "_stext"); if (err < 0) pr_err("Couldn't record guest kernel [%d]'s reference" " relocation symbol.\n", machine->pid); @@ -617,16 +618,16 @@ static int __cmd_record(int argc, const char **argv) perf_session__set_sample_id_all(session, sample_id_all_avail); if (pipe_output) { - err = event__synthesize_attrs(&session->header, - process_synthesized_event, - session); + err = perf_event__synthesize_attrs(&session->header, + process_synthesized_event, + session); if (err < 0) { pr_err("Couldn't synthesize attrs.\n"); return err; } - err = event__synthesize_event_types(process_synthesized_event, - session); + err = perf_event__synthesize_event_types(process_synthesized_event, + session); if (err < 0) { pr_err("Couldn't synthesize event_types.\n"); return err; @@ -641,9 +642,9 @@ static int __cmd_record(int argc, const char **argv) * return this more properly and also * propagate errors that now are calling die() */ - err = event__synthesize_tracing_data(output, evsel_list, - process_synthesized_event, - session); + err = perf_event__synthesize_tracing_data(output, evsel_list, + process_synthesized_event, + session); if (err <= 0) { pr_err("Couldn't record tracing data.\n"); return err; @@ -658,31 +659,34 @@ static int __cmd_record(int argc, const char **argv) return -1; } - err = event__synthesize_kernel_mmap(process_synthesized_event, - session, machine, "_text"); + err = perf_event__synthesize_kernel_mmap(process_synthesized_event, + session, machine, "_text"); if (err < 0) - err = event__synthesize_kernel_mmap(process_synthesized_event, - session, machine, "_stext"); + err = perf_event__synthesize_kernel_mmap(process_synthesized_event, + session, machine, "_stext"); if (err < 0) pr_err("Couldn't record kernel reference relocation symbol\n" "Symbol resolution may be skewed if relocation was used (e.g. kexec).\n" "Check /proc/kallsyms permission or run as root.\n"); - err = event__synthesize_modules(process_synthesized_event, - session, machine); + err = perf_event__synthesize_modules(process_synthesized_event, + session, machine); if (err < 0) pr_err("Couldn't record kernel module information.\n" "Symbol resolution may be skewed if relocation was used (e.g. 
kexec).\n" "Check /proc/modules permission or run as root.\n"); if (perf_guest) - perf_session__process_machines(session, event__synthesize_guest_os); + perf_session__process_machines(session, + perf_event__synthesize_guest_os); if (!system_wide) - event__synthesize_thread(target_tid, process_synthesized_event, - session); + perf_event__synthesize_thread(target_tid, + process_synthesized_event, + session); else - event__synthesize_threads(process_synthesized_event, session); + perf_event__synthesize_threads(process_synthesized_event, + session); if (realtime_prio) { struct sched_param param; diff --git a/tools/perf/builtin-report.c b/tools/perf/builtin-report.c index bbbadcc04097..a6a4e5457b6f 100644 --- a/tools/perf/builtin-report.c +++ b/tools/perf/builtin-report.c @@ -149,13 +149,14 @@ static int add_event_total(struct perf_session *session, return 0; } -static int process_sample_event(event_t *event, struct perf_sample *sample, +static int process_sample_event(union perf_event *event, + struct perf_sample *sample, struct perf_session *session) { struct addr_location al; struct perf_event_attr *attr; - if (event__preprocess_sample(event, session, &al, sample, NULL) < 0) { + if (perf_event__preprocess_sample(event, session, &al, sample, NULL) < 0) { fprintf(stderr, "problem processing %d event, skipping it.\n", event->header.type); return -1; @@ -179,7 +180,8 @@ static int process_sample_event(event_t *event, struct perf_sample *sample, return 0; } -static int process_read_event(event_t *event, struct perf_sample *sample __used, +static int process_read_event(union perf_event *event, + struct perf_sample *sample __used, struct perf_session *session __used) { struct perf_event_attr *attr; @@ -232,17 +234,17 @@ static int perf_session__setup_sample_type(struct perf_session *self) } static struct perf_event_ops event_ops = { - .sample = process_sample_event, - .mmap = event__process_mmap, - .comm = event__process_comm, - .exit = event__process_task, - .fork = event__process_task, - .lost = event__process_lost, - .read = process_read_event, - .attr = event__process_attr, - .event_type = event__process_event_type, - .tracing_data = event__process_tracing_data, - .build_id = event__process_build_id, + .sample = process_sample_event, + .mmap = perf_event__process_mmap, + .comm = perf_event__process_comm, + .exit = perf_event__process_task, + .fork = perf_event__process_task, + .lost = perf_event__process_lost, + .read = process_read_event, + .attr = perf_event__process_attr, + .event_type = perf_event__process_event_type, + .tracing_data = perf_event__process_tracing_data, + .build_id = perf_event__process_build_id, .ordered_samples = true, .ordering_requires_timestamps = true, }; diff --git a/tools/perf/builtin-sched.c b/tools/perf/builtin-sched.c index ff993c8b175d..ae2621182927 100644 --- a/tools/perf/builtin-sched.c +++ b/tools/perf/builtin-sched.c @@ -1580,9 +1580,9 @@ process_sched_migrate_task_event(void *data, struct perf_session *session, event, cpu, timestamp, thread); } -static void -process_raw_event(event_t *raw_event __used, struct perf_session *session, - void *data, int cpu, u64 timestamp, struct thread *thread) +static void process_raw_event(union perf_event *raw_event __used, + struct perf_session *session, void *data, int cpu, + u64 timestamp, struct thread *thread) { struct event *event; int type; @@ -1607,7 +1607,8 @@ process_raw_event(event_t *raw_event __used, struct perf_session *session, process_sched_migrate_task_event(data, session, event, cpu, timestamp, thread); } 
-static int process_sample_event(event_t *event, struct perf_sample *sample, +static int process_sample_event(union perf_event *event, + struct perf_sample *sample, struct perf_session *session) { struct thread *thread; @@ -1635,9 +1636,9 @@ static int process_sample_event(event_t *event, struct perf_sample *sample, static struct perf_event_ops event_ops = { .sample = process_sample_event, - .comm = event__process_comm, - .lost = event__process_lost, - .fork = event__process_task, + .comm = perf_event__process_comm, + .lost = perf_event__process_lost, + .fork = perf_event__process_task, .ordered_samples = true, }; diff --git a/tools/perf/builtin-script.c b/tools/perf/builtin-script.c index 5c4c809008af..5f40df635dcb 100644 --- a/tools/perf/builtin-script.c +++ b/tools/perf/builtin-script.c @@ -63,7 +63,8 @@ static int cleanup_scripting(void) static char const *input_name = "perf.data"; -static int process_sample_event(event_t *event, struct perf_sample *sample, +static int process_sample_event(union perf_event *event, + struct perf_sample *sample, struct perf_session *session) { struct thread *thread = perf_session__findnew(session, event->ip.pid); @@ -100,14 +101,14 @@ static int process_sample_event(event_t *event, struct perf_sample *sample, } static struct perf_event_ops event_ops = { - .sample = process_sample_event, - .comm = event__process_comm, - .attr = event__process_attr, - .event_type = event__process_event_type, - .tracing_data = event__process_tracing_data, - .build_id = event__process_build_id, - .ordering_requires_timestamps = true, + .sample = process_sample_event, + .comm = perf_event__process_comm, + .attr = perf_event__process_attr, + .event_type = perf_event__process_event_type, + .tracing_data = perf_event__process_tracing_data, + .build_id = perf_event__process_build_id, .ordered_samples = true, + .ordering_requires_timestamps = true, }; extern volatile int session_done; diff --git a/tools/perf/builtin-test.c b/tools/perf/builtin-test.c index df62433a34a7..845b9bd54ed4 100644 --- a/tools/perf/builtin-test.c +++ b/tools/perf/builtin-test.c @@ -454,7 +454,7 @@ out_thread_map_delete: static int test__basic_mmap(void) { int err = -1; - event_t *event; + union perf_event *event; struct thread_map *threads; struct cpu_map *cpus; struct perf_evlist *evlist; @@ -554,11 +554,11 @@ static int test__basic_mmap(void) if (event->header.type != PERF_RECORD_SAMPLE) { pr_debug("unexpected %s event\n", - event__get_event_name(event->header.type)); + perf_event__name(event->header.type)); goto out_munmap; } - event__parse_sample(event, attr.sample_type, false, &sample); + perf_event__parse_sample(event, attr.sample_type, false, &sample); evsel = perf_evlist__id2evsel(evlist, sample.id); if (evsel == NULL) { pr_debug("event with id %" PRIu64 diff --git a/tools/perf/builtin-timechart.c b/tools/perf/builtin-timechart.c index 01cf0c3771a6..0801275c500e 100644 --- a/tools/perf/builtin-timechart.c +++ b/tools/perf/builtin-timechart.c @@ -276,21 +276,24 @@ static int cpus_cstate_state[MAX_CPUS]; static u64 cpus_pstate_start_times[MAX_CPUS]; static u64 cpus_pstate_state[MAX_CPUS]; -static int process_comm_event(event_t *event, struct perf_sample *sample __used, +static int process_comm_event(union perf_event *event, + struct perf_sample *sample __used, struct perf_session *session __used) { pid_set_comm(event->comm.tid, event->comm.comm); return 0; } -static int process_fork_event(event_t *event, struct perf_sample *sample __used, +static int process_fork_event(union perf_event *event, + 
struct perf_sample *sample __used, struct perf_session *session __used) { pid_fork(event->fork.pid, event->fork.ppid, event->fork.time); return 0; } -static int process_exit_event(event_t *event, struct perf_sample *sample __used, +static int process_exit_event(union perf_event *event, + struct perf_sample *sample __used, struct perf_session *session __used) { pid_exit(event->fork.pid, event->fork.time); @@ -486,7 +489,7 @@ static void sched_switch(int cpu, u64 timestamp, struct trace_entry *te) } -static int process_sample_event(event_t *event __used, +static int process_sample_event(union perf_event *event __used, struct perf_sample *sample, struct perf_session *session) { diff --git a/tools/perf/builtin-top.c b/tools/perf/builtin-top.c index d923127b41b6..2f4d1f244be1 100644 --- a/tools/perf/builtin-top.c +++ b/tools/perf/builtin-top.c @@ -401,7 +401,7 @@ static void show_details(struct sym_entry *syme) } /* - * Symbols will be added here in event__process_sample and will get out + * Symbols will be added here in perf_event__process_sample and will get out * after decayed. */ static LIST_HEAD(active_symbols); @@ -996,15 +996,15 @@ static int symbol_filter(struct map *map, struct symbol *sym) return 0; } -static void event__process_sample(const event_t *self, - struct perf_sample *sample, - struct perf_session *session) +static void perf_event__process_sample(const union perf_event *event, + struct perf_sample *sample, + struct perf_session *session) { - u64 ip = self->ip.ip; + u64 ip = event->ip.ip; struct sym_entry *syme; struct addr_location al; struct machine *machine; - u8 origin = self->header.misc & PERF_RECORD_MISC_CPUMODE_MASK; + u8 origin = event->header.misc & PERF_RECORD_MISC_CPUMODE_MASK; ++samples; @@ -1023,7 +1023,7 @@ static void event__process_sample(const event_t *self, break; case PERF_RECORD_MISC_GUEST_KERNEL: ++guest_kernel_samples; - machine = perf_session__find_machine(session, self->ip.pid); + machine = perf_session__find_machine(session, event->ip.pid); break; case PERF_RECORD_MISC_GUEST_USER: ++guest_us_samples; @@ -1038,15 +1038,15 @@ static void event__process_sample(const event_t *self, if (!machine && perf_guest) { pr_err("Can't find guest [%d]'s kernel information\n", - self->ip.pid); + event->ip.pid); return; } - if (self->header.misc & PERF_RECORD_MISC_EXACT_IP) + if (event->header.misc & PERF_RECORD_MISC_EXACT_IP) exact_samples++; - if (event__preprocess_sample(self, session, &al, sample, - symbol_filter) < 0 || + if (perf_event__preprocess_sample(event, session, &al, sample, + symbol_filter) < 0 || al.filtered) return; @@ -1108,15 +1108,15 @@ static void event__process_sample(const event_t *self, static void perf_session__mmap_read_cpu(struct perf_session *self, int cpu) { struct perf_sample sample; - event_t *event; + union perf_event *event; while ((event = perf_evlist__read_on_cpu(evsel_list, cpu)) != NULL) { perf_session__parse_sample(self, event, &sample); if (event->header.type == PERF_RECORD_SAMPLE) - event__process_sample(event, &sample, self); + perf_event__process_sample(event, &sample, self); else - event__process(event, &sample, self); + perf_event__process(event, &sample, self); } } @@ -1199,9 +1199,10 @@ static int __cmd_top(void) return -ENOMEM; if (target_tid != -1) - event__synthesize_thread(target_tid, event__process, session); + perf_event__synthesize_thread(target_tid, perf_event__process, + session); else - event__synthesize_threads(event__process, session); + perf_event__synthesize_threads(perf_event__process, session); 
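/*
 * Editor's note, not part of the original patch: a perf_event_ops table
 * wired to the renamed helpers.  'example_sample_handler' is the
 * hypothetical callback sketched earlier; the perf_event__process_*
 * names are the ones this patch introduces.
 */
static struct perf_event_ops example_ops = {
	.sample		 = example_sample_handler,
	.comm		 = perf_event__process_comm,
	.fork		 = perf_event__process_task,
	.exit		 = perf_event__process_task,
	.lost		 = perf_event__process_lost,
	.ordered_samples = true,
};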
start_counters(evsel_list); first = list_entry(evsel_list->entries.next, struct perf_evsel, node); diff --git a/tools/perf/util/build-id.c b/tools/perf/util/build-id.c index b184a7fa0843..31f934af9861 100644 --- a/tools/perf/util/build-id.c +++ b/tools/perf/util/build-id.c @@ -14,7 +14,7 @@ #include #include "debug.h" -static int build_id__mark_dso_hit(event_t *event, +static int build_id__mark_dso_hit(union perf_event *event, struct perf_sample *sample __used, struct perf_session *session) { @@ -37,13 +37,14 @@ static int build_id__mark_dso_hit(event_t *event, return 0; } -static int event__exit_del_thread(event_t *self, struct perf_sample *sample __used, - struct perf_session *session) +static int perf_event__exit_del_thread(union perf_event *event, + struct perf_sample *sample __used, + struct perf_session *session) { - struct thread *thread = perf_session__findnew(session, self->fork.tid); + struct thread *thread = perf_session__findnew(session, event->fork.tid); - dump_printf("(%d:%d):(%d:%d)\n", self->fork.pid, self->fork.tid, - self->fork.ppid, self->fork.ptid); + dump_printf("(%d:%d):(%d:%d)\n", event->fork.pid, event->fork.tid, + event->fork.ppid, event->fork.ptid); if (thread) { rb_erase(&thread->rb_node, &session->threads); @@ -56,9 +57,9 @@ static int event__exit_del_thread(event_t *self, struct perf_sample *sample __us struct perf_event_ops build_id__mark_dso_hit_ops = { .sample = build_id__mark_dso_hit, - .mmap = event__process_mmap, - .fork = event__process_task, - .exit = event__exit_del_thread, + .mmap = perf_event__process_mmap, + .fork = perf_event__process_task, + .exit = perf_event__exit_del_thread, }; char *dso__build_id_filename(struct dso *self, char *bf, size_t size) diff --git a/tools/perf/util/callchain.c b/tools/perf/util/callchain.c index f8c66d1435e0..9f7106a8d9a4 100644 --- a/tools/perf/util/callchain.c +++ b/tools/perf/util/callchain.c @@ -18,7 +18,8 @@ #include "util.h" #include "callchain.h" -bool ip_callchain__valid(struct ip_callchain *chain, const event_t *event) +bool ip_callchain__valid(struct ip_callchain *chain, + const union perf_event *event) { unsigned int chain_size = event->header.size; chain_size -= (unsigned long)&event->ip.__more_data - (unsigned long)event; diff --git a/tools/perf/util/callchain.h b/tools/perf/util/callchain.h index 67137256a1cd..1a79df9f739f 100644 --- a/tools/perf/util/callchain.h +++ b/tools/perf/util/callchain.h @@ -95,8 +95,8 @@ int callchain_append(struct callchain_root *root, int callchain_merge(struct callchain_cursor *cursor, struct callchain_root *dst, struct callchain_root *src); -bool ip_callchain__valid(struct ip_callchain *chain, const event_t *event); - +bool ip_callchain__valid(struct ip_callchain *chain, + const union perf_event *event); /* * Initialize a cursor before adding entries inside, but keep * the previously allocated entries as a cache. diff --git a/tools/perf/util/debug.c b/tools/perf/util/debug.c index 01bbe8ecec3f..d4536a9e0d8c 100644 --- a/tools/perf/util/debug.c +++ b/tools/perf/util/debug.c @@ -57,7 +57,7 @@ void ui__warning(const char *format, ...) } #endif -void trace_event(event_t *event) +void trace_event(union perf_event *event) { unsigned char *raw_event = (void *)event; const char *color = PERF_COLOR_BLUE; diff --git a/tools/perf/util/debug.h b/tools/perf/util/debug.h index ca35fd66b5df..93516cf4682c 100644 --- a/tools/perf/util/debug.h +++ b/tools/perf/util/debug.h @@ -9,7 +9,7 @@ extern int verbose; extern bool quiet, dump_trace; int dump_printf(const char *fmt, ...) 
__attribute__((format(printf, 1, 2))); -void trace_event(event_t *event); +void trace_event(union perf_event *event); struct ui_progress; diff --git a/tools/perf/util/event.c b/tools/perf/util/event.c index 5c886fbd50ce..731265f4ad19 100644 --- a/tools/perf/util/event.c +++ b/tools/perf/util/event.c @@ -7,7 +7,7 @@ #include "strlist.h" #include "thread.h" -static const char *event__name[] = { +static const char *perf_event__names[] = { [0] = "TOTAL", [PERF_RECORD_MMAP] = "MMAP", [PERF_RECORD_LOST] = "LOST", @@ -25,13 +25,13 @@ static const char *event__name[] = { [PERF_RECORD_FINISHED_ROUND] = "FINISHED_ROUND", }; -const char *event__get_event_name(unsigned int id) +const char *perf_event__name(unsigned int id) { - if (id >= ARRAY_SIZE(event__name)) + if (id >= ARRAY_SIZE(perf_event__names)) return "INVALID"; - if (!event__name[id]) + if (!perf_event__names[id]) return "UNKNOWN"; - return event__name[id]; + return perf_event__names[id]; } static struct perf_sample synth_sample = { @@ -43,9 +43,9 @@ static struct perf_sample synth_sample = { .period = 1, }; -static pid_t event__synthesize_comm(event_t *event, pid_t pid, int full, - event__handler_t process, - struct perf_session *session) +static pid_t perf_event__synthesize_comm(union perf_event *event, pid_t pid, + int full, perf_event__handler_t process, + struct perf_session *session) { char filename[PATH_MAX]; char bf[BUFSIZ]; @@ -126,9 +126,10 @@ out: return tgid; } -static int event__synthesize_mmap_events(event_t *event, pid_t pid, pid_t tgid, - event__handler_t process, - struct perf_session *session) +static int perf_event__synthesize_mmap_events(union perf_event *event, + pid_t pid, pid_t tgid, + perf_event__handler_t process, + struct perf_session *session) { char filename[PATH_MAX]; FILE *fp; @@ -199,14 +200,14 @@ static int event__synthesize_mmap_events(event_t *event, pid_t pid, pid_t tgid, return 0; } -int event__synthesize_modules(event__handler_t process, - struct perf_session *session, - struct machine *machine) +int perf_event__synthesize_modules(perf_event__handler_t process, + struct perf_session *session, + struct machine *machine) { struct rb_node *nd; struct map_groups *kmaps = &machine->kmaps; - event_t *event = zalloc(sizeof(event->mmap) + session->id_hdr_size); - + union perf_event *event = zalloc((sizeof(event->mmap) + + session->id_hdr_size)); if (event == NULL) { pr_debug("Not enough memory synthesizing mmap event " "for kernel modules\n"); @@ -251,22 +252,23 @@ int event__synthesize_modules(event__handler_t process, return 0; } -static int __event__synthesize_thread(event_t *comm_event, event_t *mmap_event, - pid_t pid, event__handler_t process, +static int __event__synthesize_thread(union perf_event *comm_event, + union perf_event *mmap_event, + pid_t pid, perf_event__handler_t process, struct perf_session *session) { - pid_t tgid = event__synthesize_comm(comm_event, pid, 1, process, + pid_t tgid = perf_event__synthesize_comm(comm_event, pid, 1, process, session); if (tgid == -1) return -1; - return event__synthesize_mmap_events(mmap_event, pid, tgid, + return perf_event__synthesize_mmap_events(mmap_event, pid, tgid, process, session); } -int event__synthesize_thread(pid_t pid, event__handler_t process, - struct perf_session *session) +int perf_event__synthesize_thread(pid_t pid, perf_event__handler_t process, + struct perf_session *session) { - event_t *comm_event, *mmap_event; + union perf_event *comm_event, *mmap_event; int err = -1; comm_event = malloc(sizeof(comm_event->comm) + session->id_hdr_size); @@ 
-286,12 +288,12 @@ out: return err; } -int event__synthesize_threads(event__handler_t process, - struct perf_session *session) +int perf_event__synthesize_threads(perf_event__handler_t process, + struct perf_session *session) { DIR *proc; struct dirent dirent, *next; - event_t *comm_event, *mmap_event; + union perf_event *comm_event, *mmap_event; int err = -1; comm_event = malloc(sizeof(comm_event->comm) + session->id_hdr_size); @@ -349,10 +351,10 @@ static int find_symbol_cb(void *arg, const char *name, char type, return 1; } -int event__synthesize_kernel_mmap(event__handler_t process, - struct perf_session *session, - struct machine *machine, - const char *symbol_name) +int perf_event__synthesize_kernel_mmap(perf_event__handler_t process, + struct perf_session *session, + struct machine *machine, + const char *symbol_name) { size_t size; const char *filename, *mmap_name; @@ -366,8 +368,8 @@ int event__synthesize_kernel_mmap(event__handler_t process, * kernels. */ struct process_symbol_args args = { .name = symbol_name, }; - event_t *event = zalloc(sizeof(event->mmap) + session->id_hdr_size); - + union perf_event *event = zalloc((sizeof(event->mmap) + + session->id_hdr_size)); if (event == NULL) { pr_debug("Not enough memory synthesizing mmap event " "for kernel modules\n"); @@ -440,14 +442,15 @@ static int thread__set_comm_adjust(struct thread *self, const char *comm, return 0; } -int event__process_comm(event_t *self, struct perf_sample *sample __used, - struct perf_session *session) +int perf_event__process_comm(union perf_event *event, + struct perf_sample *sample __used, + struct perf_session *session) { - struct thread *thread = perf_session__findnew(session, self->comm.tid); + struct thread *thread = perf_session__findnew(session, event->comm.tid); - dump_printf(": %s:%d\n", self->comm.comm, self->comm.tid); + dump_printf(": %s:%d\n", event->comm.comm, event->comm.tid); - if (thread == NULL || thread__set_comm_adjust(thread, self->comm.comm, + if (thread == NULL || thread__set_comm_adjust(thread, event->comm.comm, &session->hists)) { dump_printf("problem processing PERF_RECORD_COMM, skipping event.\n"); return -1; @@ -456,19 +459,21 @@ int event__process_comm(event_t *self, struct perf_sample *sample __used, return 0; } -int event__process_lost(event_t *self, struct perf_sample *sample __used, - struct perf_session *session) +int perf_event__process_lost(union perf_event *event, + struct perf_sample *sample __used, + struct perf_session *session) { dump_printf(": id:%" PRIu64 ": lost:%" PRIu64 "\n", - self->lost.id, self->lost.lost); - session->hists.stats.total_lost += self->lost.lost; + event->lost.id, event->lost.lost); + session->hists.stats.total_lost += event->lost.lost; return 0; } -static void event_set_kernel_mmap_len(struct map **maps, event_t *self) +static void perf_event__set_kernel_mmap_len(union perf_event *event, + struct map **maps) { - maps[MAP__FUNCTION]->start = self->mmap.start; - maps[MAP__FUNCTION]->end = self->mmap.start + self->mmap.len; + maps[MAP__FUNCTION]->start = event->mmap.start; + maps[MAP__FUNCTION]->end = event->mmap.start + event->mmap.len; /* * Be a bit paranoid here, some perf.data file came with * a zero sized synthesized MMAP event for the kernel. 
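/*
 * Editor's note, not part of the original patch: perf_event__name()
 * keeps the old event__get_event_name() contract -- "INVALID" for ids
 * past the table, "UNKNOWN" for holes in it -- so callers can still
 * filter on the returned string, as hist.c does below.  A hypothetical
 * counter relying on that contract:
 */
static size_t example_count_named_events(void)
{
	size_t i, named = 0;

	for (i = 0; i < PERF_RECORD_HEADER_MAX; ++i)
		if (strcmp(perf_event__name(i), "UNKNOWN"))
			++named;

	return named;
}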
@@ -477,8 +482,8 @@ static void event_set_kernel_mmap_len(struct map **maps, event_t *self) maps[MAP__FUNCTION]->end = ~0ULL; } -static int event__process_kernel_mmap(event_t *self, - struct perf_session *session) +static int perf_event__process_kernel_mmap(union perf_event *event, + struct perf_session *session) { struct map *map; char kmmap_prefix[PATH_MAX]; @@ -486,9 +491,9 @@ static int event__process_kernel_mmap(event_t *self, enum dso_kernel_type kernel_type; bool is_kernel_mmap; - machine = perf_session__findnew_machine(session, self->mmap.pid); + machine = perf_session__findnew_machine(session, event->mmap.pid); if (!machine) { - pr_err("Can't find id %d's machine\n", self->mmap.pid); + pr_err("Can't find id %d's machine\n", event->mmap.pid); goto out_problem; } @@ -498,17 +503,17 @@ static int event__process_kernel_mmap(event_t *self, else kernel_type = DSO_TYPE_GUEST_KERNEL; - is_kernel_mmap = memcmp(self->mmap.filename, + is_kernel_mmap = memcmp(event->mmap.filename, kmmap_prefix, strlen(kmmap_prefix)) == 0; - if (self->mmap.filename[0] == '/' || - (!is_kernel_mmap && self->mmap.filename[0] == '[')) { + if (event->mmap.filename[0] == '/' || + (!is_kernel_mmap && event->mmap.filename[0] == '[')) { char short_module_name[1024]; char *name, *dot; - if (self->mmap.filename[0] == '/') { - name = strrchr(self->mmap.filename, '/'); + if (event->mmap.filename[0] == '/') { + name = strrchr(event->mmap.filename, '/'); if (name == NULL) goto out_problem; @@ -520,10 +525,10 @@ static int event__process_kernel_mmap(event_t *self, "[%.*s]", (int)(dot - name), name); strxfrchar(short_module_name, '-', '_'); } else - strcpy(short_module_name, self->mmap.filename); + strcpy(short_module_name, event->mmap.filename); - map = machine__new_module(machine, self->mmap.start, - self->mmap.filename); + map = machine__new_module(machine, event->mmap.start, + event->mmap.filename); if (map == NULL) goto out_problem; @@ -533,9 +538,9 @@ static int event__process_kernel_mmap(event_t *self, map->dso->short_name = name; map->dso->sname_alloc = 1; - map->end = map->start + self->mmap.len; + map->end = map->start + event->mmap.len; } else if (is_kernel_mmap) { - const char *symbol_name = (self->mmap.filename + + const char *symbol_name = (event->mmap.filename + strlen(kmmap_prefix)); /* * Should be there already, from the build-id table in @@ -550,10 +555,10 @@ static int event__process_kernel_mmap(event_t *self, if (__machine__create_kernel_maps(machine, kernel) < 0) goto out_problem; - event_set_kernel_mmap_len(machine->vmlinux_maps, self); + perf_event__set_kernel_mmap_len(event, machine->vmlinux_maps); perf_session__set_kallsyms_ref_reloc_sym(machine->vmlinux_maps, symbol_name, - self->mmap.pgoff); + event->mmap.pgoff); if (machine__is_default_guest(machine)) { /* * preload dso of guest kernel and modules @@ -567,22 +572,23 @@ out_problem: return -1; } -int event__process_mmap(event_t *self, struct perf_sample *sample __used, - struct perf_session *session) +int perf_event__process_mmap(union perf_event *event, + struct perf_sample *sample __used, + struct perf_session *session) { struct machine *machine; struct thread *thread; struct map *map; - u8 cpumode = self->header.misc & PERF_RECORD_MISC_CPUMODE_MASK; + u8 cpumode = event->header.misc & PERF_RECORD_MISC_CPUMODE_MASK; int ret = 0; dump_printf(" %d/%d: [%#" PRIx64 "(%#" PRIx64 ") @ %#" PRIx64 "]: %s\n", - self->mmap.pid, self->mmap.tid, self->mmap.start, - self->mmap.len, self->mmap.pgoff, self->mmap.filename); + event->mmap.pid, event->mmap.tid, 
event->mmap.start, + event->mmap.len, event->mmap.pgoff, event->mmap.filename); if (cpumode == PERF_RECORD_MISC_GUEST_KERNEL || cpumode == PERF_RECORD_MISC_KERNEL) { - ret = event__process_kernel_mmap(self, session); + ret = perf_event__process_kernel_mmap(event, session); if (ret < 0) goto out_problem; return 0; @@ -591,12 +597,12 @@ int event__process_mmap(event_t *self, struct perf_sample *sample __used, machine = perf_session__find_host_machine(session); if (machine == NULL) goto out_problem; - thread = perf_session__findnew(session, self->mmap.pid); + thread = perf_session__findnew(session, event->mmap.pid); if (thread == NULL) goto out_problem; - map = map__new(&machine->user_dsos, self->mmap.start, - self->mmap.len, self->mmap.pgoff, - self->mmap.pid, self->mmap.filename, + map = map__new(&machine->user_dsos, event->mmap.start, + event->mmap.len, event->mmap.pgoff, + event->mmap.pid, event->mmap.filename, MAP__FUNCTION); if (map == NULL) goto out_problem; @@ -609,16 +615,17 @@ out_problem: return 0; } -int event__process_task(event_t *self, struct perf_sample *sample __used, - struct perf_session *session) +int perf_event__process_task(union perf_event *event, + struct perf_sample *sample __used, + struct perf_session *session) { - struct thread *thread = perf_session__findnew(session, self->fork.tid); - struct thread *parent = perf_session__findnew(session, self->fork.ptid); + struct thread *thread = perf_session__findnew(session, event->fork.tid); + struct thread *parent = perf_session__findnew(session, event->fork.ptid); - dump_printf("(%d:%d):(%d:%d)\n", self->fork.pid, self->fork.tid, - self->fork.ppid, self->fork.ptid); + dump_printf("(%d:%d):(%d:%d)\n", event->fork.pid, event->fork.tid, + event->fork.ppid, event->fork.ptid); - if (self->header.type == PERF_RECORD_EXIT) { + if (event->header.type == PERF_RECORD_EXIT) { perf_session__remove_thread(session, thread); return 0; } @@ -632,22 +639,22 @@ int event__process_task(event_t *self, struct perf_sample *sample __used, return 0; } -int event__process(event_t *event, struct perf_sample *sample, - struct perf_session *session) +int perf_event__process(union perf_event *event, struct perf_sample *sample, + struct perf_session *session) { switch (event->header.type) { case PERF_RECORD_COMM: - event__process_comm(event, sample, session); + perf_event__process_comm(event, sample, session); break; case PERF_RECORD_MMAP: - event__process_mmap(event, sample, session); + perf_event__process_mmap(event, sample, session); break; case PERF_RECORD_FORK: case PERF_RECORD_EXIT: - event__process_task(event, sample, session); + perf_event__process_task(event, sample, session); break; case PERF_RECORD_LOST: - event__process_lost(event, sample, session); + perf_event__process_lost(event, sample, session); default: break; } @@ -756,12 +763,14 @@ static void dso__calc_col_width(struct dso *self, struct hists *hists) self->slen_calculated = 1; } -int event__preprocess_sample(const event_t *self, struct perf_session *session, - struct addr_location *al, struct perf_sample *sample, - symbol_filter_t filter) +int perf_event__preprocess_sample(const union perf_event *event, + struct perf_session *session, + struct addr_location *al, + struct perf_sample *sample, + symbol_filter_t filter) { - u8 cpumode = self->header.misc & PERF_RECORD_MISC_CPUMODE_MASK; - struct thread *thread = perf_session__findnew(session, self->ip.pid); + u8 cpumode = event->header.misc & PERF_RECORD_MISC_CPUMODE_MASK; + struct thread *thread = perf_session__findnew(session, 
event->ip.pid); if (thread == NULL) return -1; @@ -783,7 +792,7 @@ int event__preprocess_sample(const event_t *self, struct perf_session *session, machine__create_kernel_maps(&session->host_machine); thread__find_addr_map(thread, session, cpumode, MAP__FUNCTION, - self->ip.pid, self->ip.ip, al); + event->ip.pid, event->ip.ip, al); dump_printf(" ...... dso: %s\n", al->map ? al->map->dso->long_name : al->level == 'H' ? "[hypervisor]" : ""); diff --git a/tools/perf/util/event.h b/tools/perf/util/event.h index 84fd71ffd619..eecb42273d59 100644 --- a/tools/perf/util/event.h +++ b/tools/perf/util/event.h @@ -117,7 +117,7 @@ struct tracing_data_event { u32 size; }; -typedef union event_union { +union perf_event { struct perf_event_header header; struct ip_event ip; struct mmap_event mmap; @@ -130,49 +130,52 @@ typedef union event_union { struct event_type_event event_type; struct tracing_data_event tracing_data; struct build_id_event build_id; -} event_t; +}; -void event__print_totals(void); +void perf_event__print_totals(void); struct perf_session; -typedef int (*event__handler_synth_t)(event_t *event, +typedef int (*perf_event__handler_synth_t)(union perf_event *event, + struct perf_session *session); +typedef int (*perf_event__handler_t)(union perf_event *event, + struct perf_sample *sample, struct perf_session *session); -typedef int (*event__handler_t)(event_t *event, struct perf_sample *sample, - struct perf_session *session); -int event__synthesize_thread(pid_t pid, event__handler_t process, +int perf_event__synthesize_thread(pid_t pid, perf_event__handler_t process, + struct perf_session *session); +int perf_event__synthesize_threads(perf_event__handler_t process, + struct perf_session *session); +int perf_event__synthesize_kernel_mmap(perf_event__handler_t process, + struct perf_session *session, + struct machine *machine, + const char *symbol_name); + +int perf_event__synthesize_modules(perf_event__handler_t process, + struct perf_session *session, + struct machine *machine); + +int perf_event__process_comm(union perf_event *event, struct perf_sample *sample, struct perf_session *session); -int event__synthesize_threads(event__handler_t process, - struct perf_session *session); -int event__synthesize_kernel_mmap(event__handler_t process, - struct perf_session *session, - struct machine *machine, - const char *symbol_name); - -int event__synthesize_modules(event__handler_t process, - struct perf_session *session, - struct machine *machine); - -int event__process_comm(event_t *event, struct perf_sample *sample, +int perf_event__process_lost(union perf_event *event, struct perf_sample *sample, + struct perf_session *session); +int perf_event__process_mmap(union perf_event *event, struct perf_sample *sample, + struct perf_session *session); +int perf_event__process_task(union perf_event *event, struct perf_sample *sample, + struct perf_session *session); +int perf_event__process(union perf_event *event, struct perf_sample *sample, struct perf_session *session); -int event__process_lost(event_t *event, struct perf_sample *sample, - struct perf_session *session); -int event__process_mmap(event_t *event, struct perf_sample *sample, - struct perf_session *session); -int event__process_task(event_t *event, struct perf_sample *sample, - struct perf_session *session); -int event__process(event_t *event, struct perf_sample *sample, - struct perf_session *session); struct addr_location; -int event__preprocess_sample(const event_t *self, struct perf_session *session, - struct addr_location *al, 
struct perf_sample *sample, - symbol_filter_t filter); +int perf_event__preprocess_sample(const union perf_event *self, + struct perf_session *session, + struct addr_location *al, + struct perf_sample *sample, + symbol_filter_t filter); -const char *event__get_event_name(unsigned int id); +const char *perf_event__name(unsigned int id); -int event__parse_sample(const event_t *event, u64 type, bool sample_id_all, - struct perf_sample *sample); +int perf_event__parse_sample(const union perf_event *event, u64 type, + bool sample_id_all, struct perf_sample *sample); #endif /* __PERF_RECORD_H */ diff --git a/tools/perf/util/evlist.c b/tools/perf/util/evlist.c index b498eecbe850..917fc18d0bed 100644 --- a/tools/perf/util/evlist.c +++ b/tools/perf/util/evlist.c @@ -107,7 +107,7 @@ struct perf_evsel *perf_evlist__id2evsel(struct perf_evlist *evlist, u64 id) return NULL; } -event_t *perf_evlist__read_on_cpu(struct perf_evlist *evlist, int cpu) +union perf_event *perf_evlist__read_on_cpu(struct perf_evlist *evlist, int cpu) { /* XXX Move this to perf.c, making it generally available */ unsigned int page_size = sysconf(_SC_PAGE_SIZE); @@ -115,7 +115,7 @@ event_t *perf_evlist__read_on_cpu(struct perf_evlist *evlist, int cpu) unsigned int head = perf_mmap__read_head(md); unsigned int old = md->prev; unsigned char *data = md->base + page_size; - event_t *event = NULL; + union perf_event *event = NULL; if (evlist->overwrite) { /* @@ -140,7 +140,7 @@ event_t *perf_evlist__read_on_cpu(struct perf_evlist *evlist, int cpu) if (old != head) { size_t size; - event = (event_t *)&data[old & md->mask]; + event = (union perf_event *)&data[old & md->mask]; size = event->header.size; /* diff --git a/tools/perf/util/evlist.h b/tools/perf/util/evlist.h index 2706ae40c8b3..022ae404b908 100644 --- a/tools/perf/util/evlist.h +++ b/tools/perf/util/evlist.h @@ -17,7 +17,7 @@ struct perf_evlist { int nr_fds; int mmap_len; bool overwrite; - event_t event_copy; + union perf_event event_copy; struct perf_mmap *mmap; struct pollfd *pollfd; }; @@ -37,6 +37,6 @@ void perf_evlist__add_pollfd(struct perf_evlist *evlist, int fd); struct perf_evsel *perf_evlist__id2evsel(struct perf_evlist *evlist, u64 id); -event_t *perf_evlist__read_on_cpu(struct perf_evlist *self, int cpu); +union perf_event *perf_evlist__read_on_cpu(struct perf_evlist *self, int cpu); #endif /* __PERF_EVLIST_H */ diff --git a/tools/perf/util/evsel.c b/tools/perf/util/evsel.c index a13488511880..fddeb08f48a7 100644 --- a/tools/perf/util/evsel.c +++ b/tools/perf/util/evsel.c @@ -367,8 +367,8 @@ out_unmap: return -1; } -static int event__parse_id_sample(const event_t *event, u64 type, - struct perf_sample *sample) +static int perf_event__parse_id_sample(const union perf_event *event, u64 type, + struct perf_sample *sample) { const u64 *array = event->sample.array; @@ -405,8 +405,8 @@ static int event__parse_id_sample(const event_t *event, u64 type, return 0; } -int event__parse_sample(const event_t *event, u64 type, bool sample_id_all, - struct perf_sample *data) +int perf_event__parse_sample(const union perf_event *event, u64 type, + bool sample_id_all, struct perf_sample *data) { const u64 *array; @@ -416,7 +416,7 @@ int event__parse_sample(const event_t *event, u64 type, bool sample_id_all, if (event->header.type != PERF_RECORD_SAMPLE) { if (!sample_id_all) return 0; - return event__parse_id_sample(event, type, data); + return perf_event__parse_id_sample(event, type, data); } array = event->sample.array; diff --git a/tools/perf/util/header.c 
b/tools/perf/util/header.c index f0138d472339..c0de5ec44145 100644 --- a/tools/perf/util/header.c +++ b/tools/perf/util/header.c @@ -1000,11 +1000,11 @@ perf_header__find_attr(u64 id, struct perf_header *header) return NULL; } -int event__synthesize_attr(struct perf_event_attr *attr, u16 ids, u64 *id, - event__handler_t process, - struct perf_session *session) +int perf_event__synthesize_attr(struct perf_event_attr *attr, u16 ids, u64 *id, + perf_event__handler_t process, + struct perf_session *session) { - event_t *ev; + union perf_event *ev; size_t size; int err; @@ -1031,8 +1031,9 @@ int event__synthesize_attr(struct perf_event_attr *attr, u16 ids, u64 *id, return err; } -int event__synthesize_attrs(struct perf_header *self, event__handler_t process, - struct perf_session *session) +int perf_event__synthesize_attrs(struct perf_header *self, + perf_event__handler_t process, + struct perf_session *session) { struct perf_header_attr *attr; int i, err = 0; @@ -1040,8 +1041,8 @@ int event__synthesize_attrs(struct perf_header *self, event__handler_t process, for (i = 0; i < self->attrs; i++) { attr = self->attr[i]; - err = event__synthesize_attr(&attr->attr, attr->ids, attr->id, - process, session); + err = perf_event__synthesize_attr(&attr->attr, attr->ids, + attr->id, process, session); if (err) { pr_debug("failed to create perf header attribute\n"); return err; @@ -1051,21 +1052,22 @@ int event__synthesize_attrs(struct perf_header *self, event__handler_t process, return err; } -int event__process_attr(event_t *self, struct perf_session *session) +int perf_event__process_attr(union perf_event *event, + struct perf_session *session) { struct perf_header_attr *attr; unsigned int i, ids, n_ids; - attr = perf_header_attr__new(&self->attr.attr); + attr = perf_header_attr__new(&event->attr.attr); if (attr == NULL) return -ENOMEM; - ids = self->header.size; - ids -= (void *)&self->attr.id - (void *)self; + ids = event->header.size; + ids -= (void *)&event->attr.id - (void *)event; n_ids = ids / sizeof(u64); for (i = 0; i < n_ids; i++) { - if (perf_header_attr__add_id(attr, self->attr.id[i]) < 0) { + if (perf_header_attr__add_id(attr, event->attr.id[i]) < 0) { perf_header_attr__delete(attr); return -ENOMEM; } @@ -1081,11 +1083,11 @@ int event__process_attr(event_t *self, struct perf_session *session) return 0; } -int event__synthesize_event_type(u64 event_id, char *name, - event__handler_t process, - struct perf_session *session) +int perf_event__synthesize_event_type(u64 event_id, char *name, + perf_event__handler_t process, + struct perf_session *session) { - event_t ev; + union perf_event ev; size_t size = 0; int err = 0; @@ -1106,8 +1108,8 @@ int event__synthesize_event_type(u64 event_id, char *name, return err; } -int event__synthesize_event_types(event__handler_t process, - struct perf_session *session) +int perf_event__synthesize_event_types(perf_event__handler_t process, + struct perf_session *session) { struct perf_trace_event_type *type; int i, err = 0; @@ -1115,8 +1117,9 @@ int event__synthesize_event_types(event__handler_t process, for (i = 0; i < event_count; i++) { type = &events[i]; - err = event__synthesize_event_type(type->event_id, type->name, - process, session); + err = perf_event__synthesize_event_type(type->event_id, + type->name, process, + session); if (err) { pr_debug("failed to create perf header event type\n"); return err; @@ -1126,21 +1129,21 @@ int event__synthesize_event_types(event__handler_t process, return err; } -int event__process_event_type(event_t *self, - 
struct perf_session *session __unused) +int perf_event__process_event_type(union perf_event *event, + struct perf_session *session __unused) { - if (perf_header__push_event(self->event_type.event_type.event_id, - self->event_type.event_type.name) < 0) + if (perf_header__push_event(event->event_type.event_type.event_id, + event->event_type.event_type.name) < 0) return -ENOMEM; return 0; } -int event__synthesize_tracing_data(int fd, struct perf_evlist *evlist, - event__handler_t process, +int perf_event__synthesize_tracing_data(int fd, struct perf_evlist *evlist, + perf_event__handler_t process, struct perf_session *session __unused) { - event_t ev; + union perf_event ev; ssize_t size = 0, aligned_size = 0, padding; int err = 0; @@ -1163,10 +1166,10 @@ int event__synthesize_tracing_data(int fd, struct perf_evlist *evlist, return aligned_size; } -int event__process_tracing_data(event_t *self, - struct perf_session *session) +int perf_event__process_tracing_data(union perf_event *event, + struct perf_session *session) { - ssize_t size_read, padding, size = self->tracing_data.size; + ssize_t size_read, padding, size = event->tracing_data.size; off_t offset = lseek(session->fd, 0, SEEK_CUR); char buf[BUFSIZ]; @@ -1192,12 +1195,12 @@ int event__process_tracing_data(event_t *self, return size_read + padding; } -int event__synthesize_build_id(struct dso *pos, u16 misc, - event__handler_t process, - struct machine *machine, - struct perf_session *session) +int perf_event__synthesize_build_id(struct dso *pos, u16 misc, + perf_event__handler_t process, + struct machine *machine, + struct perf_session *session) { - event_t ev; + union perf_event ev; size_t len; int err = 0; @@ -1220,11 +1223,11 @@ int event__synthesize_build_id(struct dso *pos, u16 misc, return err; } -int event__process_build_id(event_t *self, - struct perf_session *session) +int perf_event__process_build_id(union perf_event *event, + struct perf_session *session) { - __event_process_build_id(&self->build_id, - self->build_id.filename, + __event_process_build_id(&event->build_id, + event->build_id.filename, session); return 0; } diff --git a/tools/perf/util/header.h b/tools/perf/util/header.h index 65afd7f74e0d..f042cebcec1e 100644 --- a/tools/perf/util/header.h +++ b/tools/perf/util/header.h @@ -100,32 +100,32 @@ int build_id_cache__add_s(const char *sbuild_id, const char *debugdir, const char *name, bool is_kallsyms); int build_id_cache__remove_s(const char *sbuild_id, const char *debugdir); -int event__synthesize_attr(struct perf_event_attr *attr, u16 ids, u64 *id, - event__handler_t process, - struct perf_session *session); -int event__synthesize_attrs(struct perf_header *self, - event__handler_t process, - struct perf_session *session); -int event__process_attr(event_t *self, struct perf_session *session); - -int event__synthesize_event_type(u64 event_id, char *name, - event__handler_t process, - struct perf_session *session); -int event__synthesize_event_types(event__handler_t process, - struct perf_session *session); -int event__process_event_type(event_t *self, - struct perf_session *session); - -int event__synthesize_tracing_data(int fd, struct perf_evlist *evlist, - event__handler_t process, - struct perf_session *session); -int event__process_tracing_data(event_t *self, +int perf_event__synthesize_attr(struct perf_event_attr *attr, u16 ids, u64 *id, + perf_event__handler_t process, struct perf_session *session); +int perf_event__synthesize_attrs(struct perf_header *self, + perf_event__handler_t process, + struct 
perf_session *session); +int perf_event__process_attr(union perf_event *event, struct perf_session *session); -int event__synthesize_build_id(struct dso *pos, u16 misc, - event__handler_t process, - struct machine *machine, - struct perf_session *session); -int event__process_build_id(event_t *self, struct perf_session *session); +int perf_event__synthesize_event_type(u64 event_id, char *name, + perf_event__handler_t process, + struct perf_session *session); +int perf_event__synthesize_event_types(perf_event__handler_t process, + struct perf_session *session); +int perf_event__process_event_type(union perf_event *event, + struct perf_session *session); +int perf_event__synthesize_tracing_data(int fd, struct perf_evlist *evlist, + perf_event__handler_t process, + struct perf_session *session); +int perf_event__process_tracing_data(union perf_event *event, + struct perf_session *session); + +int perf_event__synthesize_build_id(struct dso *pos, u16 misc, + perf_event__handler_t process, + struct machine *machine, + struct perf_session *session); +int perf_event__process_build_id(union perf_event *event, + struct perf_session *session); #endif /* __PERF_HEADER_H */ diff --git a/tools/perf/util/hist.c b/tools/perf/util/hist.c index 02ed318d7312..95887804dc8e 100644 --- a/tools/perf/util/hist.c +++ b/tools/perf/util/hist.c @@ -1182,7 +1182,7 @@ size_t hists__fprintf_nr_events(struct hists *self, FILE *fp) size_t ret = 0; for (i = 0; i < PERF_RECORD_HEADER_MAX; ++i) { - const char *name = event__get_event_name(i); + const char *name = perf_event__name(i); if (!strcmp(name, "UNKNOWN")) continue; diff --git a/tools/perf/util/session.c b/tools/perf/util/session.c index ee0b61102571..a3a871f7bda3 100644 --- a/tools/perf/util/session.c +++ b/tools/perf/util/session.c @@ -165,7 +165,7 @@ struct perf_session *perf_session__new(const char *filename, int mode, } else if (mode == O_WRONLY) { /* * In O_RDONLY mode this will be performed when reading the - * kernel MMAP event, in event__process_mmap(). + * kernel MMAP event, in perf_event__process_mmap(). 
*/ if (perf_session__create_kernel_maps(self) < 0) goto out_delete; @@ -291,14 +291,14 @@ int perf_session__resolve_callchain(struct perf_session *self, return 0; } -static int process_event_synth_stub(event_t *event __used, +static int process_event_synth_stub(union perf_event *event __used, struct perf_session *session __used) { dump_printf(": unhandled!\n"); return 0; } -static int process_event_stub(event_t *event __used, +static int process_event_stub(union perf_event *event __used, struct perf_sample *sample __used, struct perf_session *session __used) { @@ -306,7 +306,7 @@ static int process_event_stub(event_t *event __used, return 0; } -static int process_finished_round_stub(event_t *event __used, +static int process_finished_round_stub(union perf_event *event __used, struct perf_session *session __used, struct perf_event_ops *ops __used) { @@ -314,7 +314,7 @@ static int process_finished_round_stub(event_t *event __used, return 0; } -static int process_finished_round(event_t *event, +static int process_finished_round(union perf_event *event, struct perf_session *session, struct perf_event_ops *ops); @@ -331,7 +331,7 @@ static void perf_event_ops__fill_defaults(struct perf_event_ops *handler) if (handler->exit == NULL) handler->exit = process_event_stub; if (handler->lost == NULL) - handler->lost = event__process_lost; + handler->lost = perf_event__process_lost; if (handler->read == NULL) handler->read = process_event_stub; if (handler->throttle == NULL) @@ -365,98 +365,98 @@ void mem_bswap_64(void *src, int byte_size) } } -static void event__all64_swap(event_t *self) +static void perf_event__all64_swap(union perf_event *event) { - struct perf_event_header *hdr = &self->header; - mem_bswap_64(hdr + 1, self->header.size - sizeof(*hdr)); + struct perf_event_header *hdr = &event->header; + mem_bswap_64(hdr + 1, event->header.size - sizeof(*hdr)); } -static void event__comm_swap(event_t *self) +static void perf_event__comm_swap(union perf_event *event) { - self->comm.pid = bswap_32(self->comm.pid); - self->comm.tid = bswap_32(self->comm.tid); + event->comm.pid = bswap_32(event->comm.pid); + event->comm.tid = bswap_32(event->comm.tid); } -static void event__mmap_swap(event_t *self) +static void perf_event__mmap_swap(union perf_event *event) { - self->mmap.pid = bswap_32(self->mmap.pid); - self->mmap.tid = bswap_32(self->mmap.tid); - self->mmap.start = bswap_64(self->mmap.start); - self->mmap.len = bswap_64(self->mmap.len); - self->mmap.pgoff = bswap_64(self->mmap.pgoff); + event->mmap.pid = bswap_32(event->mmap.pid); + event->mmap.tid = bswap_32(event->mmap.tid); + event->mmap.start = bswap_64(event->mmap.start); + event->mmap.len = bswap_64(event->mmap.len); + event->mmap.pgoff = bswap_64(event->mmap.pgoff); } -static void event__task_swap(event_t *self) +static void perf_event__task_swap(union perf_event *event) { - self->fork.pid = bswap_32(self->fork.pid); - self->fork.tid = bswap_32(self->fork.tid); - self->fork.ppid = bswap_32(self->fork.ppid); - self->fork.ptid = bswap_32(self->fork.ptid); - self->fork.time = bswap_64(self->fork.time); + event->fork.pid = bswap_32(event->fork.pid); + event->fork.tid = bswap_32(event->fork.tid); + event->fork.ppid = bswap_32(event->fork.ppid); + event->fork.ptid = bswap_32(event->fork.ptid); + event->fork.time = bswap_64(event->fork.time); } -static void event__read_swap(event_t *self) +static void perf_event__read_swap(union perf_event *event) { - self->read.pid = bswap_32(self->read.pid); - self->read.tid = bswap_32(self->read.tid); - 
self->read.value = bswap_64(self->read.value); - self->read.time_enabled = bswap_64(self->read.time_enabled); - self->read.time_running = bswap_64(self->read.time_running); - self->read.id = bswap_64(self->read.id); + event->read.pid = bswap_32(event->read.pid); + event->read.tid = bswap_32(event->read.tid); + event->read.value = bswap_64(event->read.value); + event->read.time_enabled = bswap_64(event->read.time_enabled); + event->read.time_running = bswap_64(event->read.time_running); + event->read.id = bswap_64(event->read.id); } -static void event__attr_swap(event_t *self) +static void perf_event__attr_swap(union perf_event *event) { size_t size; - self->attr.attr.type = bswap_32(self->attr.attr.type); - self->attr.attr.size = bswap_32(self->attr.attr.size); - self->attr.attr.config = bswap_64(self->attr.attr.config); - self->attr.attr.sample_period = bswap_64(self->attr.attr.sample_period); - self->attr.attr.sample_type = bswap_64(self->attr.attr.sample_type); - self->attr.attr.read_format = bswap_64(self->attr.attr.read_format); - self->attr.attr.wakeup_events = bswap_32(self->attr.attr.wakeup_events); - self->attr.attr.bp_type = bswap_32(self->attr.attr.bp_type); - self->attr.attr.bp_addr = bswap_64(self->attr.attr.bp_addr); - self->attr.attr.bp_len = bswap_64(self->attr.attr.bp_len); + event->attr.attr.type = bswap_32(event->attr.attr.type); + event->attr.attr.size = bswap_32(event->attr.attr.size); + event->attr.attr.config = bswap_64(event->attr.attr.config); + event->attr.attr.sample_period = bswap_64(event->attr.attr.sample_period); + event->attr.attr.sample_type = bswap_64(event->attr.attr.sample_type); + event->attr.attr.read_format = bswap_64(event->attr.attr.read_format); + event->attr.attr.wakeup_events = bswap_32(event->attr.attr.wakeup_events); + event->attr.attr.bp_type = bswap_32(event->attr.attr.bp_type); + event->attr.attr.bp_addr = bswap_64(event->attr.attr.bp_addr); + event->attr.attr.bp_len = bswap_64(event->attr.attr.bp_len); - size = self->header.size; - size -= (void *)&self->attr.id - (void *)self; - mem_bswap_64(self->attr.id, size); + size = event->header.size; + size -= (void *)&event->attr.id - (void *)event; + mem_bswap_64(event->attr.id, size); } -static void event__event_type_swap(event_t *self) +static void perf_event__event_type_swap(union perf_event *event) { - self->event_type.event_type.event_id = - bswap_64(self->event_type.event_type.event_id); + event->event_type.event_type.event_id = + bswap_64(event->event_type.event_type.event_id); } -static void event__tracing_data_swap(event_t *self) +static void perf_event__tracing_data_swap(union perf_event *event) { - self->tracing_data.size = bswap_32(self->tracing_data.size); + event->tracing_data.size = bswap_32(event->tracing_data.size); } -typedef void (*event__swap_op)(event_t *self); +typedef void (*perf_event__swap_op)(union perf_event *event); -static event__swap_op event__swap_ops[] = { - [PERF_RECORD_MMAP] = event__mmap_swap, - [PERF_RECORD_COMM] = event__comm_swap, - [PERF_RECORD_FORK] = event__task_swap, - [PERF_RECORD_EXIT] = event__task_swap, - [PERF_RECORD_LOST] = event__all64_swap, - [PERF_RECORD_READ] = event__read_swap, - [PERF_RECORD_SAMPLE] = event__all64_swap, - [PERF_RECORD_HEADER_ATTR] = event__attr_swap, - [PERF_RECORD_HEADER_EVENT_TYPE] = event__event_type_swap, - [PERF_RECORD_HEADER_TRACING_DATA] = event__tracing_data_swap, - [PERF_RECORD_HEADER_BUILD_ID] = NULL, - [PERF_RECORD_HEADER_MAX] = NULL, +static perf_event__swap_op perf_event__swap_ops[] = { + [PERF_RECORD_MMAP] = 
perf_event__mmap_swap, + [PERF_RECORD_COMM] = perf_event__comm_swap, + [PERF_RECORD_FORK] = perf_event__task_swap, + [PERF_RECORD_EXIT] = perf_event__task_swap, + [PERF_RECORD_LOST] = perf_event__all64_swap, + [PERF_RECORD_READ] = perf_event__read_swap, + [PERF_RECORD_SAMPLE] = perf_event__all64_swap, + [PERF_RECORD_HEADER_ATTR] = perf_event__attr_swap, + [PERF_RECORD_HEADER_EVENT_TYPE] = perf_event__event_type_swap, + [PERF_RECORD_HEADER_TRACING_DATA] = perf_event__tracing_data_swap, + [PERF_RECORD_HEADER_BUILD_ID] = NULL, + [PERF_RECORD_HEADER_MAX] = NULL, }; struct sample_queue { u64 timestamp; u64 file_offset; - event_t *event; + union perf_event *event; struct list_head list; }; @@ -474,7 +474,7 @@ static void perf_session_free_sample_buffers(struct perf_session *session) } static int perf_session_deliver_event(struct perf_session *session, - event_t *event, + union perf_event *event, struct perf_sample *sample, struct perf_event_ops *ops, u64 file_offset); @@ -552,7 +552,7 @@ static void flush_sample_queue(struct perf_session *s, * Flush every events below timestamp 7 * etc... */ -static int process_finished_round(event_t *event __used, +static int process_finished_round(union perf_event *event __used, struct perf_session *session, struct perf_event_ops *ops) { @@ -609,7 +609,7 @@ static void __queue_event(struct sample_queue *new, struct perf_session *s) #define MAX_SAMPLE_BUFFER (64 * 1024 / sizeof(struct sample_queue)) -static int perf_session_queue_event(struct perf_session *s, event_t *event, +static int perf_session_queue_event(struct perf_session *s, union perf_event *event, struct perf_sample *sample, u64 file_offset) { struct ordered_samples *os = &s->ordered_samples; @@ -662,7 +662,7 @@ static void callchain__printf(struct perf_sample *sample) } static void perf_session__print_tstamp(struct perf_session *session, - event_t *event, + union perf_event *event, struct perf_sample *sample) { if (event->header.type != PERF_RECORD_SAMPLE && @@ -678,7 +678,7 @@ static void perf_session__print_tstamp(struct perf_session *session, printf("%" PRIu64 " ", sample->time); } -static void dump_event(struct perf_session *session, event_t *event, +static void dump_event(struct perf_session *session, union perf_event *event, u64 file_offset, struct perf_sample *sample) { if (!dump_trace) @@ -693,10 +693,10 @@ static void dump_event(struct perf_session *session, event_t *event, perf_session__print_tstamp(session, event, sample); printf("%#" PRIx64 " [%#x]: PERF_RECORD_%s", file_offset, - event->header.size, event__get_event_name(event->header.type)); + event->header.size, perf_event__name(event->header.type)); } -static void dump_sample(struct perf_session *session, event_t *event, +static void dump_sample(struct perf_session *session, union perf_event *event, struct perf_sample *sample) { if (!dump_trace) @@ -711,7 +711,7 @@ static void dump_sample(struct perf_session *session, event_t *event, } static int perf_session_deliver_event(struct perf_session *session, - event_t *event, + union perf_event *event, struct perf_sample *sample, struct perf_event_ops *ops, u64 file_offset) @@ -745,7 +745,7 @@ static int perf_session_deliver_event(struct perf_session *session, } static int perf_session__preprocess_sample(struct perf_session *session, - event_t *event, struct perf_sample *sample) + union perf_event *event, struct perf_sample *sample) { if (event->header.type != PERF_RECORD_SAMPLE || !(session->sample_type & PERF_SAMPLE_CALLCHAIN)) @@ -760,7 +760,7 @@ static int 
perf_session__preprocess_sample(struct perf_session *session, return 0; } -static int perf_session__process_user_event(struct perf_session *session, event_t *event, +static int perf_session__process_user_event(struct perf_session *session, union perf_event *event, struct perf_event_ops *ops, u64 file_offset) { dump_event(session, event, file_offset, NULL); @@ -785,15 +785,16 @@ static int perf_session__process_user_event(struct perf_session *session, event_ } static int perf_session__process_event(struct perf_session *session, - event_t *event, + union perf_event *event, struct perf_event_ops *ops, u64 file_offset) { struct perf_sample sample; int ret; - if (session->header.needs_swap && event__swap_ops[event->header.type]) - event__swap_ops[event->header.type](event); + if (session->header.needs_swap && + perf_event__swap_ops[event->header.type]) + perf_event__swap_ops[event->header.type](event); if (event->header.type >= PERF_RECORD_HEADER_MAX) return -EINVAL; @@ -845,7 +846,7 @@ static struct thread *perf_session__register_idle_thread(struct perf_session *se static void perf_session__warn_about_errors(const struct perf_session *session, const struct perf_event_ops *ops) { - if (ops->lost == event__process_lost && + if (ops->lost == perf_event__process_lost && session->hists.stats.total_lost != 0) { ui__warning("Processed %" PRIu64 " events and LOST %" PRIu64 "!\n\nCheck IO/CPU overload!\n\n", @@ -877,7 +878,7 @@ volatile int session_done; static int __perf_session__process_pipe_events(struct perf_session *self, struct perf_event_ops *ops) { - event_t event; + union perf_event event; uint32_t size; int skip = 0; u64 head; @@ -958,7 +959,7 @@ int __perf_session__process_events(struct perf_session *session, struct ui_progress *progress; size_t page_size, mmap_size; char *buf, *mmaps[8]; - event_t *event; + union perf_event *event; uint32_t size; perf_event_ops__fill_defaults(ops); @@ -1003,7 +1004,7 @@ remap: file_pos = file_offset + head; more: - event = (event_t *)(buf + head); + event = (union perf_event *)(buf + head); if (session->header.needs_swap) perf_event_header__bswap(&event->header); diff --git a/tools/perf/util/session.h b/tools/perf/util/session.h index 365bf533a396..977b3a1b14aa 100644 --- a/tools/perf/util/session.h +++ b/tools/perf/util/session.h @@ -57,10 +57,11 @@ struct perf_session { struct perf_event_ops; -typedef int (*event_op)(event_t *self, struct perf_sample *sample, +typedef int (*event_op)(union perf_event *self, struct perf_sample *sample, struct perf_session *session); -typedef int (*event_synth_op)(event_t *self, struct perf_session *session); -typedef int (*event_op2)(event_t *self, struct perf_session *session, +typedef int (*event_synth_op)(union perf_event *self, + struct perf_session *session); +typedef int (*event_op2)(union perf_event *self, struct perf_session *session, struct perf_event_ops *ops); struct perf_event_ops { @@ -157,11 +158,11 @@ size_t perf_session__fprintf_nr_events(struct perf_session *self, FILE *fp) } static inline int perf_session__parse_sample(struct perf_session *session, - const event_t *event, + const union perf_event *event, struct perf_sample *sample) { - return event__parse_sample(event, session->sample_type, - session->sample_id_all, sample); + return perf_event__parse_sample(event, session->sample_type, + session->sample_id_all, sample); } #endif /* __PERF_SESSION_H */ From 877108e42b1b9ba64857c4030cf356ecc120fd18 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Sat, 29 Jan 2011 15:44:29 -0200 Subject: [PATCH 
098/706] perf tools: Initial python binding First, to clarify: this kind of binding is not a replacement for, or an equivalent of, the 'perf script' way of using python with perf. The 'perf script' way processes events by looking in a given script for a python function that matches each event and passing the event to it for processing. This, instead, is a python module: everything is driven from the python script, which merely uses "import perf" or "from perf import". perf script is focused on tracepoints; this binding is focused on profiling as an initial target. More work is needed to make tracepoint specific variables accessible as event variables via this binding. There is one example of this usage model, in tools/perf/python/twatch.py, a tool to watch "cycles" events together with task (fork, exit) and comm perf events. For now, since I have not managed to grok how python distutils cope with building C extensions outside the sources dir, the install target just builds the module, and I'm using it as: [root@emilia linux]# export PYTHONPATH=~acme/git/build/perf/lib.linux-x86_64-2.6/ [root@emilia linux]# tools/perf/python/twatch.py cpu: 4, pid: 30126, tid: 30126 { type: mmap, pid: 30126, tid: 30126, start: 0x4, length: 0x82e9ca03, offset: 0, filename: } cpu: 6, pid: 47, tid: 47 { type: mmap, pid: 47, tid: 47, start: 0x6, length: 0xbef87c36, offset: 0, filename: } cpu: 1, pid: 0, tid: 0 { type: mmap, pid: 0, tid: 0, start: 0x1, length: 0x775d1904, offset: 0, filename: } cpu: 7, pid: 0, tid: 0 { type: mmap, pid: 0, tid: 0, start: 0x7, length: 0xc750aeb6, offset: 0, filename: } cpu: 5, pid: 2255, tid: 2255 { type: mmap, pid: 2255, tid: 2255, start: 0x5, length: 0x76669635, offset: 0, filename: } cpu: 0, pid: 0, tid: 0 { type: mmap, pid: 0, tid: 0, start: 0, length: 0x6422ef6b, offset: 0, filename: } cpu: 2, pid: 2255, tid: 2255 { type: mmap, pid: 2255, tid: 2255, start: 0x2, length: 0xe078757a, offset: 0, filename: } cpu: 1, pid: 5769, tid: 5769 { type: fork, pid: 30127, ppid: 5769, tid: 30127, ptid: 5769, time: 103893991270534} cpu: 6, pid: 30127, tid: 30127 { type: comm, pid: 30127, tid: 30127, comm: ls } cpu: 6, pid: 30127, tid: 30127 { type: exit, pid: 30127, ppid: 30127, tid: 30127, ptid: 30127, time: 103893993273024} The first 8 mmap events on this 8-way machine are a mystery that is still being investigated. More of the tools/perf/util/ APIs will be exposed via this python binding as the need arises. For now the focus is on creating events and processing them; symbol resolution is an obvious next step, with tracepoint variables a close second.
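For orientation, the smallest useful session with the module looks roughly like this, a sketch distilled from the twatch.py example added below (Python 2, matching the Py_InitModule based module; PYTHONPATH is assumed to point at the built perf.so):

	import perf

	cpus = perf.cpu_map()          # default: all online cpus
	threads = perf.thread_map()    # default: pid/tid -1, i.e. all threads
	# the default attr counts cycles (PERF_TYPE_HARDWARE / PERF_COUNT_HW_CPU_CYCLES)
	evsel = perf.evsel(task = 1, comm = 1, wakeup_events = 1,
	                   sample_period = 1,
	                   sample_type = perf.SAMPLE_TID | perf.SAMPLE_CPU)
	evsel.open(cpus = cpus, threads = threads)
	evlist = perf.evlist()
	evlist.add(evsel)
	evlist.mmap(cpus = cpus, threads = threads)
	while True:
	    evlist.poll(timeout = -1)
	    for cpu in cpus:
	        event = evlist.read_on_cpu(cpu)
	        if event:
	            print event    # each pyrf event type carries its own repr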
Cc: Clark Williams Cc: Frederic Weisbecker Cc: Ingo Molnar Cc: Mike Galbraith Cc: Paul Mackerras Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Tom Zanussi LKML-Reference: Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/Makefile | 20 +- tools/perf/python/twatch.py | 41 ++ tools/perf/util/python.c | 888 ++++++++++++++++++++++++++++++++++++ tools/perf/util/setup.py | 18 + 4 files changed, 966 insertions(+), 1 deletion(-) create mode 100755 tools/perf/python/twatch.py create mode 100644 tools/perf/util/python.c create mode 100644 tools/perf/util/setup.py diff --git a/tools/perf/Makefile b/tools/perf/Makefile index eedcf9541572..36ff73c0af94 100644 --- a/tools/perf/Makefile +++ b/tools/perf/Makefile @@ -315,6 +315,7 @@ COMPAT_CFLAGS = COMPAT_OBJS = LIB_H = LIB_OBJS = +PYRF_OBJS = SCRIPT_PERL = SCRIPT_SH = TEST_PROGRAMS = @@ -324,6 +325,9 @@ SCRIPT_SH += perf-archive.sh grep-libs = $(filter -l%,$(1)) strip-libs = $(filter-out -l%,$(1)) +pyrf: $(PYRF_OBJS) + python util/setup.py build --build-base='$(OUTPUT)' + # # No Perl scripts right now: # @@ -349,7 +353,7 @@ PROGRAMS += $(OUTPUT)perf # # what 'all' will build and 'install' will install, in perfexecdir -ALL_PROGRAMS = $(PROGRAMS) $(SCRIPTS) +ALL_PROGRAMS = $(PROGRAMS) $(SCRIPTS) pyrf # what 'all' will build but not install in perfexecdir OTHER_PROGRAMS = $(OUTPUT)perf$X @@ -520,6 +524,20 @@ BUILTIN_OBJS += $(OUTPUT)builtin-inject.o PERFLIBS = $(LIB_FILE) +# Files needed for the python binding, perf.so +# pyrf is just an internal name needed for all those wrappers. +# This has to be in sync with what is in the 'sources' variable in +# tools/perf/util/setup.py + +PYRF_OBJS += $(OUTPUT)util/cpumap.o +PYRF_OBJS += $(OUTPUT)util/ctype.o +PYRF_OBJS += $(OUTPUT)util/evlist.o +PYRF_OBJS += $(OUTPUT)util/evsel.o +PYRF_OBJS += $(OUTPUT)util/python.o +PYRF_OBJS += $(OUTPUT)util/thread_map.o +PYRF_OBJS += $(OUTPUT)util/util.o +PYRF_OBJS += $(OUTPUT)util/xyarray.o + # # Platform specific tweaks # diff --git a/tools/perf/python/twatch.py b/tools/perf/python/twatch.py new file mode 100755 index 000000000000..5e9f3b7b7ee8 --- /dev/null +++ b/tools/perf/python/twatch.py @@ -0,0 +1,41 @@ +#! /usr/bin/python +# -*- python -*- +# -*- coding: utf-8 -*- +# twatch - Experimental use of the perf python interface +# Copyright (C) 2011 Arnaldo Carvalho de Melo +# +# This application is free software; you can redistribute it and/or +# modify it under the terms of the GNU General Public License +# as published by the Free Software Foundation; version 2. +# +# This application is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +# General Public License for more details. 
+ +import perf + +def main(): + cpus = perf.cpu_map() + threads = perf.thread_map() + evsel = perf.evsel(task = 1, comm = 1, mmap = 0, + wakeup_events = 1, sample_period = 1, + sample_id_all = 1, + sample_type = perf.SAMPLE_PERIOD | perf.SAMPLE_TID | perf.SAMPLE_CPU | perf.SAMPLE_TID) + evsel.open(cpus = cpus, threads = threads); + evlist = perf.evlist() + evlist.add(evsel) + evlist.mmap(cpus = cpus, threads = threads) + while True: + evlist.poll(timeout = -1) + for cpu in cpus: + event = evlist.read_on_cpu(cpu) + if not event: + continue + print "cpu: %2d, pid: %4d, tid: %4d" % (event.sample_cpu, + event.sample_pid, + event.sample_tid), + print event + +if __name__ == '__main__': + main() diff --git a/tools/perf/util/python.c b/tools/perf/util/python.c new file mode 100644 index 000000000000..88d47895a79c --- /dev/null +++ b/tools/perf/util/python.c @@ -0,0 +1,888 @@ +#include +#include +#include +#include +#include "evlist.h" +#include "evsel.h" +#include "event.h" +#include "cpumap.h" +#include "thread_map.h" + +struct throttle_event { + struct perf_event_header header; + u64 time; + u64 id; + u64 stream_id; +}; + +#define member_def(type, member, ptype, help) \ + { #member, ptype, \ + offsetof(struct pyrf_event, event) + offsetof(struct type, member), \ + 0, help } + +#define sample_member_def(name, member, ptype, help) \ + { #name, ptype, \ + offsetof(struct pyrf_event, sample) + offsetof(struct perf_sample, member), \ + 0, help } + +struct pyrf_event { + PyObject_HEAD + struct perf_sample sample; + union perf_event event; +}; + +#define T_ULONG_LONG T_ULONG + +#define sample_members \ + sample_member_def(sample_ip, ip, T_ULONG_LONG, "event type"), \ + sample_member_def(sample_pid, pid, T_INT, "event pid"), \ + sample_member_def(sample_tid, tid, T_INT, "event tid"), \ + sample_member_def(sample_time, time, T_ULONG_LONG, "event timestamp"), \ + sample_member_def(sample_addr, addr, T_ULONG_LONG, "event addr"), \ + sample_member_def(sample_id, id, T_ULONG_LONG, "event id"), \ + sample_member_def(sample_stream_id, stream_id, T_ULONG_LONG, "event stream id"), \ + sample_member_def(sample_period, period, T_ULONG_LONG, "event period"), \ + sample_member_def(sample_cpu, cpu, T_UINT, "event cpu"), + +static char pyrf_mmap_event__doc[] = PyDoc_STR("perf mmap event object."); + +static PyMemberDef pyrf_mmap_event__members[] = { + sample_members + member_def(perf_event_header, type, T_UINT, "event type"), + member_def(mmap_event, pid, T_UINT, "event pid"), + member_def(mmap_event, tid, T_UINT, "event tid"), + member_def(mmap_event, start, T_ULONG_LONG, "start of the map"), + member_def(mmap_event, len, T_ULONG_LONG, "map length"), + member_def(mmap_event, pgoff, T_ULONG_LONG, "page offset"), + member_def(mmap_event, filename, T_STRING_INPLACE, "backing store"), + { NULL, }, +}; + +static PyObject *pyrf_mmap_event__repr(struct pyrf_event *pevent) +{ + PyObject *ret; + char *s; + + if (asprintf(&s, "{ type: mmap, pid: %u, tid: %u, start: %#" PRIx64 ", " + "length: %#" PRIx64 ", offset: %#" PRIx64 ", " + "filename: %s }", + pevent->event.mmap.pid, pevent->event.mmap.tid, + pevent->event.mmap.start, pevent->event.mmap.len, + pevent->event.mmap.pgoff, pevent->event.mmap.filename) < 0) { + ret = PyErr_NoMemory(); + } else { + ret = PyString_FromString(s); + free(s); + } + return ret; +} + +static PyTypeObject pyrf_mmap_event__type = { + PyVarObject_HEAD_INIT(NULL, 0) + .tp_name = "perf.mmap_event", + .tp_basicsize = sizeof(struct pyrf_event), + .tp_flags = Py_TPFLAGS_DEFAULT|Py_TPFLAGS_BASETYPE, + 
.tp_doc = pyrf_mmap_event__doc, + .tp_members = pyrf_mmap_event__members, + .tp_repr = (reprfunc)pyrf_mmap_event__repr, +}; + +static char pyrf_task_event__doc[] = PyDoc_STR("perf task (fork/exit) event object."); + +static PyMemberDef pyrf_task_event__members[] = { + sample_members + member_def(perf_event_header, type, T_UINT, "event type"), + member_def(fork_event, pid, T_UINT, "event pid"), + member_def(fork_event, ppid, T_UINT, "event ppid"), + member_def(fork_event, tid, T_UINT, "event tid"), + member_def(fork_event, ptid, T_UINT, "event ptid"), + member_def(fork_event, time, T_ULONG_LONG, "timestamp"), + { NULL, }, +}; + +static PyObject *pyrf_task_event__repr(struct pyrf_event *pevent) +{ + return PyString_FromFormat("{ type: %s, pid: %u, ppid: %u, tid: %u, " + "ptid: %u, time: %" PRIu64 "}", + pevent->event.header.type == PERF_RECORD_FORK ? "fork" : "exit", + pevent->event.fork.pid, + pevent->event.fork.ppid, + pevent->event.fork.tid, + pevent->event.fork.ptid, + pevent->event.fork.time); +} + +static PyTypeObject pyrf_task_event__type = { + PyVarObject_HEAD_INIT(NULL, 0) + .tp_name = "perf.task_event", + .tp_basicsize = sizeof(struct pyrf_event), + .tp_flags = Py_TPFLAGS_DEFAULT|Py_TPFLAGS_BASETYPE, + .tp_doc = pyrf_task_event__doc, + .tp_members = pyrf_task_event__members, + .tp_repr = (reprfunc)pyrf_task_event__repr, +}; + +static char pyrf_comm_event__doc[] = PyDoc_STR("perf comm event object."); + +static PyMemberDef pyrf_comm_event__members[] = { + sample_members + member_def(perf_event_header, type, T_UINT, "event type"), + member_def(comm_event, pid, T_UINT, "event pid"), + member_def(comm_event, tid, T_UINT, "event tid"), + member_def(comm_event, comm, T_STRING_INPLACE, "process name"), + { NULL, }, +}; + +static PyObject *pyrf_comm_event__repr(struct pyrf_event *pevent) +{ + return PyString_FromFormat("{ type: comm, pid: %u, tid: %u, comm: %s }", + pevent->event.comm.pid, + pevent->event.comm.tid, + pevent->event.comm.comm); +} + +static PyTypeObject pyrf_comm_event__type = { + PyVarObject_HEAD_INIT(NULL, 0) + .tp_name = "perf.comm_event", + .tp_basicsize = sizeof(struct pyrf_event), + .tp_flags = Py_TPFLAGS_DEFAULT|Py_TPFLAGS_BASETYPE, + .tp_doc = pyrf_comm_event__doc, + .tp_members = pyrf_comm_event__members, + .tp_repr = (reprfunc)pyrf_comm_event__repr, +}; + +static char pyrf_throttle_event__doc[] = PyDoc_STR("perf throttle event object."); + +static PyMemberDef pyrf_throttle_event__members[] = { + sample_members + member_def(perf_event_header, type, T_UINT, "event type"), + member_def(throttle_event, time, T_ULONG_LONG, "timestamp"), + member_def(throttle_event, id, T_ULONG_LONG, "event id"), + member_def(throttle_event, stream_id, T_ULONG_LONG, "event stream id"), + { NULL, }, +}; + +static PyObject *pyrf_throttle_event__repr(struct pyrf_event *pevent) +{ + struct throttle_event *te = (struct throttle_event *)(&pevent->event.header + 1); + + return PyString_FromFormat("{ type: %sthrottle, time: %" PRIu64 ", id: %" PRIu64 + ", stream_id: %" PRIu64 " }", + pevent->event.header.type == PERF_RECORD_THROTTLE ? 
"" : "un", + te->time, te->id, te->stream_id); +} + +static PyTypeObject pyrf_throttle_event__type = { + PyVarObject_HEAD_INIT(NULL, 0) + .tp_name = "perf.throttle_event", + .tp_basicsize = sizeof(struct pyrf_event), + .tp_flags = Py_TPFLAGS_DEFAULT|Py_TPFLAGS_BASETYPE, + .tp_doc = pyrf_throttle_event__doc, + .tp_members = pyrf_throttle_event__members, + .tp_repr = (reprfunc)pyrf_throttle_event__repr, +}; + +static int pyrf_event__setup_types(void) +{ + int err; + pyrf_mmap_event__type.tp_new = + pyrf_task_event__type.tp_new = + pyrf_comm_event__type.tp_new = + pyrf_throttle_event__type.tp_new = PyType_GenericNew; + err = PyType_Ready(&pyrf_mmap_event__type); + if (err < 0) + goto out; + err = PyType_Ready(&pyrf_task_event__type); + if (err < 0) + goto out; + err = PyType_Ready(&pyrf_comm_event__type); + if (err < 0) + goto out; + err = PyType_Ready(&pyrf_throttle_event__type); + if (err < 0) + goto out; +out: + return err; +} + +static PyTypeObject *pyrf_event__type[] = { + [PERF_RECORD_MMAP] = &pyrf_mmap_event__type, + [PERF_RECORD_LOST] = &pyrf_mmap_event__type, + [PERF_RECORD_COMM] = &pyrf_comm_event__type, + [PERF_RECORD_EXIT] = &pyrf_task_event__type, + [PERF_RECORD_THROTTLE] = &pyrf_throttle_event__type, + [PERF_RECORD_UNTHROTTLE] = &pyrf_throttle_event__type, + [PERF_RECORD_FORK] = &pyrf_task_event__type, + [PERF_RECORD_READ] = &pyrf_mmap_event__type, + [PERF_RECORD_SAMPLE] = &pyrf_mmap_event__type, +}; + +static PyObject *pyrf_event__new(union perf_event *event) +{ + struct pyrf_event *pevent; + PyTypeObject *ptype; + + if (event->header.type < PERF_RECORD_MMAP || + event->header.type > PERF_RECORD_SAMPLE) + return NULL; + + ptype = pyrf_event__type[event->header.type]; + pevent = PyObject_New(struct pyrf_event, ptype); + if (pevent != NULL) + memcpy(&pevent->event, event, event->header.size); + return (PyObject *)pevent; +} + +struct pyrf_cpu_map { + PyObject_HEAD + + struct cpu_map *cpus; +}; + +static int pyrf_cpu_map__init(struct pyrf_cpu_map *pcpus, + PyObject *args, PyObject *kwargs) +{ + static char *kwlist[] = { "cpustr", NULL, NULL, }; + char *cpustr = NULL; + + if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|s", + kwlist, &cpustr)) + return -1; + + pcpus->cpus = cpu_map__new(cpustr); + if (pcpus->cpus == NULL) + return -1; + return 0; +} + +static void pyrf_cpu_map__delete(struct pyrf_cpu_map *pcpus) +{ + cpu_map__delete(pcpus->cpus); + pcpus->ob_type->tp_free((PyObject*)pcpus); +} + +static Py_ssize_t pyrf_cpu_map__length(PyObject *obj) +{ + struct pyrf_cpu_map *pcpus = (void *)obj; + + return pcpus->cpus->nr; +} + +static PyObject *pyrf_cpu_map__item(PyObject *obj, Py_ssize_t i) +{ + struct pyrf_cpu_map *pcpus = (void *)obj; + + if (i >= pcpus->cpus->nr) + return NULL; + + return Py_BuildValue("i", pcpus->cpus->map[i]); +} + +static PySequenceMethods pyrf_cpu_map__sequence_methods = { + .sq_length = pyrf_cpu_map__length, + .sq_item = pyrf_cpu_map__item, +}; + +static char pyrf_cpu_map__doc[] = PyDoc_STR("cpu map object."); + +static PyTypeObject pyrf_cpu_map__type = { + PyVarObject_HEAD_INIT(NULL, 0) + .tp_name = "perf.cpu_map", + .tp_basicsize = sizeof(struct pyrf_cpu_map), + .tp_dealloc = (destructor)pyrf_cpu_map__delete, + .tp_flags = Py_TPFLAGS_DEFAULT|Py_TPFLAGS_BASETYPE, + .tp_doc = pyrf_cpu_map__doc, + .tp_as_sequence = &pyrf_cpu_map__sequence_methods, + .tp_init = (initproc)pyrf_cpu_map__init, +}; + +static int pyrf_cpu_map__setup_types(void) +{ + pyrf_cpu_map__type.tp_new = PyType_GenericNew; + return PyType_Ready(&pyrf_cpu_map__type); +} + +struct 
pyrf_thread_map { + PyObject_HEAD + + struct thread_map *threads; +}; + +static int pyrf_thread_map__init(struct pyrf_thread_map *pthreads, + PyObject *args, PyObject *kwargs) +{ + static char *kwlist[] = { "pid", "tid", NULL, NULL, }; + int pid = -1, tid = -1; + + if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|ii", + kwlist, &pid, &tid)) + return -1; + + pthreads->threads = thread_map__new(pid, tid); + if (pthreads->threads == NULL) + return -1; + return 0; +} + +static void pyrf_thread_map__delete(struct pyrf_thread_map *pthreads) +{ + thread_map__delete(pthreads->threads); + pthreads->ob_type->tp_free((PyObject*)pthreads); +} + +static Py_ssize_t pyrf_thread_map__length(PyObject *obj) +{ + struct pyrf_thread_map *pthreads = (void *)obj; + + return pthreads->threads->nr; +} + +static PyObject *pyrf_thread_map__item(PyObject *obj, Py_ssize_t i) +{ + struct pyrf_thread_map *pthreads = (void *)obj; + + if (i >= pthreads->threads->nr) + return NULL; + + return Py_BuildValue("i", pthreads->threads->map[i]); +} + +static PySequenceMethods pyrf_thread_map__sequence_methods = { + .sq_length = pyrf_thread_map__length, + .sq_item = pyrf_thread_map__item, +}; + +static char pyrf_thread_map__doc[] = PyDoc_STR("thread map object."); + +static PyTypeObject pyrf_thread_map__type = { + PyVarObject_HEAD_INIT(NULL, 0) + .tp_name = "perf.thread_map", + .tp_basicsize = sizeof(struct pyrf_thread_map), + .tp_dealloc = (destructor)pyrf_thread_map__delete, + .tp_flags = Py_TPFLAGS_DEFAULT|Py_TPFLAGS_BASETYPE, + .tp_doc = pyrf_thread_map__doc, + .tp_as_sequence = &pyrf_thread_map__sequence_methods, + .tp_init = (initproc)pyrf_thread_map__init, +}; + +static int pyrf_thread_map__setup_types(void) +{ + pyrf_thread_map__type.tp_new = PyType_GenericNew; + return PyType_Ready(&pyrf_thread_map__type); +} + +struct pyrf_evsel { + PyObject_HEAD + + struct perf_evsel evsel; +}; + +static int pyrf_evsel__init(struct pyrf_evsel *pevsel, + PyObject *args, PyObject *kwargs) +{ + struct perf_event_attr attr = { + .type = PERF_TYPE_HARDWARE, + .config = PERF_COUNT_HW_CPU_CYCLES, + .sample_type = PERF_SAMPLE_PERIOD | PERF_SAMPLE_TID, + }; + static char *kwlist[] = { + "type", + "config", + "sample_freq", + "sample_period", + "sample_type", + "read_format", + "disabled", + "inherit", + "pinned", + "exclusive", + "exclude_user", + "exclude_kernel", + "exclude_hv", + "exclude_idle", + "mmap", + "comm", + "freq", + "inherit_stat", + "enable_on_exec", + "task", + "watermark", + "precise_ip", + "mmap_data", + "sample_id_all", + "wakeup_events", + "bp_type", + "bp_addr", + "bp_len", NULL, NULL, }; + u64 sample_period = 0; + u32 disabled = 0, + inherit = 0, + pinned = 0, + exclusive = 0, + exclude_user = 0, + exclude_kernel = 0, + exclude_hv = 0, + exclude_idle = 0, + mmap = 0, + comm = 0, + freq = 1, + inherit_stat = 0, + enable_on_exec = 0, + task = 0, + watermark = 0, + precise_ip = 0, + mmap_data = 0, + sample_id_all = 1; + int idx = 0; + + if (!PyArg_ParseTupleAndKeywords(args, kwargs, + "|iKiKKiiiiiiiiiiiiiiiiiiiiiKK", kwlist, + &attr.type, &attr.config, &attr.sample_freq, + &sample_period, &attr.sample_type, + &attr.read_format, &disabled, &inherit, + &pinned, &exclusive, &exclude_user, + &exclude_kernel, &exclude_hv, &exclude_idle, + &mmap, &comm, &freq, &inherit_stat, + &enable_on_exec, &task, &watermark, + &precise_ip, &mmap_data, &sample_id_all, + &attr.wakeup_events, &attr.bp_type, + &attr.bp_addr, &attr.bp_len, &idx)) + return -1; + + /* union... 
*/ + if (sample_period != 0) { + if (attr.sample_freq != 0) + return -1; /* FIXME: throw right exception */ + attr.sample_period = sample_period; + } + + /* Bitfields */ + attr.disabled = disabled; + attr.inherit = inherit; + attr.pinned = pinned; + attr.exclusive = exclusive; + attr.exclude_user = exclude_user; + attr.exclude_kernel = exclude_kernel; + attr.exclude_hv = exclude_hv; + attr.exclude_idle = exclude_idle; + attr.mmap = mmap; + attr.comm = comm; + attr.freq = freq; + attr.inherit_stat = inherit_stat; + attr.enable_on_exec = enable_on_exec; + attr.task = task; + attr.watermark = watermark; + attr.precise_ip = precise_ip; + attr.mmap_data = mmap_data; + attr.sample_id_all = sample_id_all; + + perf_evsel__init(&pevsel->evsel, &attr, idx); + return 0; +} + +static void pyrf_evsel__delete(struct pyrf_evsel *pevsel) +{ + perf_evsel__exit(&pevsel->evsel); + pevsel->ob_type->tp_free((PyObject*)pevsel); +} + +static PyObject *pyrf_evsel__open(struct pyrf_evsel *pevsel, + PyObject *args, PyObject *kwargs) +{ + struct perf_evsel *evsel = &pevsel->evsel; + struct cpu_map *cpus = NULL; + struct thread_map *threads = NULL; + PyObject *pcpus = NULL, *pthreads = NULL; + int group = 0, overwrite = 0; + static char *kwlist[] = {"cpus", "threads", "group", "overwrite", NULL, NULL}; + + if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|OOii", kwlist, + &pcpus, &pthreads, &group, &overwrite)) + return NULL; + + if (pthreads != NULL) + threads = ((struct pyrf_thread_map *)pthreads)->threads; + + if (pcpus != NULL) + cpus = ((struct pyrf_cpu_map *)pcpus)->cpus; + + if (perf_evsel__open(evsel, cpus, threads, group, overwrite) < 0) { + PyErr_SetFromErrno(PyExc_OSError); + return NULL; + } + + Py_INCREF(Py_None); + return Py_None; +} + +static PyMethodDef pyrf_evsel__methods[] = { + { + .ml_name = "open", + .ml_meth = (PyCFunction)pyrf_evsel__open, + .ml_flags = METH_VARARGS | METH_KEYWORDS, + .ml_doc = PyDoc_STR("open the event selector file descriptor table.") + }, + { NULL, } +}; + +static char pyrf_evsel__doc[] = PyDoc_STR("perf event selector list object."); + +static PyTypeObject pyrf_evsel__type = { + PyVarObject_HEAD_INIT(NULL, 0) + .tp_name = "perf.evsel", + .tp_basicsize = sizeof(struct pyrf_evsel), + .tp_dealloc = (destructor)pyrf_evsel__delete, + .tp_flags = Py_TPFLAGS_DEFAULT|Py_TPFLAGS_BASETYPE, + .tp_doc = pyrf_evsel__doc, + .tp_methods = pyrf_evsel__methods, + .tp_init = (initproc)pyrf_evsel__init, +}; + +static int pyrf_evsel__setup_types(void) +{ + pyrf_evsel__type.tp_new = PyType_GenericNew; + return PyType_Ready(&pyrf_evsel__type); +} + +struct pyrf_evlist { + PyObject_HEAD + + struct perf_evlist evlist; +}; + +static int pyrf_evlist__init(struct pyrf_evlist *pevlist, + PyObject *args, PyObject *kwargs) +{ + perf_evlist__init(&pevlist->evlist); + return 0; +} + +static void pyrf_evlist__delete(struct pyrf_evlist *pevlist) +{ + perf_evlist__exit(&pevlist->evlist); + pevlist->ob_type->tp_free((PyObject*)pevlist); +} + +static PyObject *pyrf_evlist__mmap(struct pyrf_evlist *pevlist, + PyObject *args, PyObject *kwargs) +{ + struct perf_evlist *evlist = &pevlist->evlist; + PyObject *pcpus = NULL, *pthreads = NULL; + struct cpu_map *cpus = NULL; + struct thread_map *threads = NULL; + static char *kwlist[] = {"cpus", "threads", "pages", "overwrite", + NULL, NULL}; + int pages = 128, overwrite = false; + + if (!PyArg_ParseTupleAndKeywords(args, kwargs, "OO|ii", kwlist, + &pcpus, &pthreads, &pages, &overwrite)) + return NULL; + + threads = ((struct pyrf_thread_map *)pthreads)->threads; + cpus 
= ((struct pyrf_cpu_map *)pcpus)->cpus; + + if (perf_evlist__mmap(evlist, cpus, threads, pages, overwrite) < 0) { + PyErr_SetFromErrno(PyExc_OSError); + return NULL; + } + + Py_INCREF(Py_None); + return Py_None; +} + +static PyObject *pyrf_evlist__poll(struct pyrf_evlist *pevlist, + PyObject *args, PyObject *kwargs) +{ + struct perf_evlist *evlist = &pevlist->evlist; + static char *kwlist[] = {"timeout", NULL, NULL}; + int timeout = -1, n; + + if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|i", kwlist, &timeout)) + return NULL; + + n = poll(evlist->pollfd, evlist->nr_fds, timeout); + if (n < 0) { + PyErr_SetFromErrno(PyExc_OSError); + return NULL; + } + + return Py_BuildValue("i", n); +} + +static PyObject *pyrf_evlist__get_pollfd(struct pyrf_evlist *pevlist, + PyObject *args, PyObject *kwargs) +{ + struct perf_evlist *evlist = &pevlist->evlist; + PyObject *list = PyList_New(0); + int i; + + for (i = 0; i < evlist->nr_fds; ++i) { + PyObject *file; + FILE *fp = fdopen(evlist->pollfd[i].fd, "r"); + + if (fp == NULL) + goto free_list; + + file = PyFile_FromFile(fp, "perf", "r", NULL); + if (file == NULL) + goto free_list; + + if (PyList_Append(list, file) != 0) { + Py_DECREF(file); + goto free_list; + } + + Py_DECREF(file); + } + + return list; +free_list: + return PyErr_NoMemory(); +} + + +static PyObject *pyrf_evlist__add(struct pyrf_evlist *pevlist, + PyObject *args, PyObject *kwargs) +{ + struct perf_evlist *evlist = &pevlist->evlist; + PyObject *pevsel; + struct perf_evsel *evsel; + + if (!PyArg_ParseTuple(args, "O", &pevsel)) + return NULL; + + Py_INCREF(pevsel); + evsel = &((struct pyrf_evsel *)pevsel)->evsel; + evsel->idx = evlist->nr_entries; + perf_evlist__add(evlist, evsel); + + return Py_BuildValue("i", evlist->nr_entries); +} + +static PyObject *pyrf_evlist__read_on_cpu(struct pyrf_evlist *pevlist, + PyObject *args, PyObject *kwargs) +{ + struct perf_evlist *evlist = &pevlist->evlist; + union perf_event *event; + int sample_id_all = 1, cpu; + static char *kwlist[] = {"sample_id_all", NULL, NULL}; + + if (!PyArg_ParseTupleAndKeywords(args, kwargs, "i|i", kwlist, + &cpu, &sample_id_all)) + return NULL; + + event = perf_evlist__read_on_cpu(evlist, cpu); + if (event != NULL) { + struct perf_evsel *first; + PyObject *pyevent = pyrf_event__new(event); + struct pyrf_event *pevent = (struct pyrf_event *)pyevent; + + if (pyevent == NULL) + return PyErr_NoMemory(); + + first = list_entry(evlist->entries.next, struct perf_evsel, node); + perf_event__parse_sample(event, first->attr.sample_type, sample_id_all, + &pevent->sample); + return pyevent; + } + + Py_INCREF(Py_None); + return Py_None; +} + +static PyMethodDef pyrf_evlist__methods[] = { + { + .ml_name = "mmap", + .ml_meth = (PyCFunction)pyrf_evlist__mmap, + .ml_flags = METH_VARARGS | METH_KEYWORDS, + .ml_doc = PyDoc_STR("mmap the file descriptor table.") + }, + { + .ml_name = "poll", + .ml_meth = (PyCFunction)pyrf_evlist__poll, + .ml_flags = METH_VARARGS | METH_KEYWORDS, + .ml_doc = PyDoc_STR("poll the file descriptor table.") + }, + { + .ml_name = "get_pollfd", + .ml_meth = (PyCFunction)pyrf_evlist__get_pollfd, + .ml_flags = METH_VARARGS | METH_KEYWORDS, + .ml_doc = PyDoc_STR("get the poll file descriptor table.") + }, + { + .ml_name = "add", + .ml_meth = (PyCFunction)pyrf_evlist__add, + .ml_flags = METH_VARARGS | METH_KEYWORDS, + .ml_doc = PyDoc_STR("adds an event selector to the list.") + }, + { + .ml_name = "read_on_cpu", + .ml_meth = (PyCFunction)pyrf_evlist__read_on_cpu, + .ml_flags = METH_VARARGS | METH_KEYWORDS, + .ml_doc = 
PyDoc_STR("reads an event.") + }, + { NULL, } +}; + +static Py_ssize_t pyrf_evlist__length(PyObject *obj) +{ + struct pyrf_evlist *pevlist = (void *)obj; + + return pevlist->evlist.nr_entries; +} + +static PyObject *pyrf_evlist__item(PyObject *obj, Py_ssize_t i) +{ + struct pyrf_evlist *pevlist = (void *)obj; + struct perf_evsel *pos; + + if (i >= pevlist->evlist.nr_entries) + return NULL; + + list_for_each_entry(pos, &pevlist->evlist.entries, node) + if (i-- == 0) + break; + + return Py_BuildValue("O", container_of(pos, struct pyrf_evsel, evsel)); +} + +static PySequenceMethods pyrf_evlist__sequence_methods = { + .sq_length = pyrf_evlist__length, + .sq_item = pyrf_evlist__item, +}; + +static char pyrf_evlist__doc[] = PyDoc_STR("perf event selector list object."); + +static PyTypeObject pyrf_evlist__type = { + PyVarObject_HEAD_INIT(NULL, 0) + .tp_name = "perf.evlist", + .tp_basicsize = sizeof(struct pyrf_evlist), + .tp_dealloc = (destructor)pyrf_evlist__delete, + .tp_flags = Py_TPFLAGS_DEFAULT|Py_TPFLAGS_BASETYPE, + .tp_as_sequence = &pyrf_evlist__sequence_methods, + .tp_doc = pyrf_evlist__doc, + .tp_methods = pyrf_evlist__methods, + .tp_init = (initproc)pyrf_evlist__init, +}; + +static int pyrf_evlist__setup_types(void) +{ + pyrf_evlist__type.tp_new = PyType_GenericNew; + return PyType_Ready(&pyrf_evlist__type); +} + +static struct { + const char *name; + int value; +} perf__constants[] = { + { "TYPE_HARDWARE", PERF_TYPE_HARDWARE }, + { "TYPE_SOFTWARE", PERF_TYPE_SOFTWARE }, + { "TYPE_TRACEPOINT", PERF_TYPE_TRACEPOINT }, + { "TYPE_HW_CACHE", PERF_TYPE_HW_CACHE }, + { "TYPE_RAW", PERF_TYPE_RAW }, + { "TYPE_BREAKPOINT", PERF_TYPE_BREAKPOINT }, + + { "COUNT_HW_CPU_CYCLES", PERF_COUNT_HW_CPU_CYCLES }, + { "COUNT_HW_INSTRUCTIONS", PERF_COUNT_HW_INSTRUCTIONS }, + { "COUNT_HW_CACHE_REFERENCES", PERF_COUNT_HW_CACHE_REFERENCES }, + { "COUNT_HW_CACHE_MISSES", PERF_COUNT_HW_CACHE_MISSES }, + { "COUNT_HW_BRANCH_INSTRUCTIONS", PERF_COUNT_HW_BRANCH_INSTRUCTIONS }, + { "COUNT_HW_BRANCH_MISSES", PERF_COUNT_HW_BRANCH_MISSES }, + { "COUNT_HW_BUS_CYCLES", PERF_COUNT_HW_BUS_CYCLES }, + { "COUNT_HW_CACHE_L1D", PERF_COUNT_HW_CACHE_L1D }, + { "COUNT_HW_CACHE_L1I", PERF_COUNT_HW_CACHE_L1I }, + { "COUNT_HW_CACHE_LL", PERF_COUNT_HW_CACHE_LL }, + { "COUNT_HW_CACHE_DTLB", PERF_COUNT_HW_CACHE_DTLB }, + { "COUNT_HW_CACHE_ITLB", PERF_COUNT_HW_CACHE_ITLB }, + { "COUNT_HW_CACHE_BPU", PERF_COUNT_HW_CACHE_BPU }, + { "COUNT_HW_CACHE_OP_READ", PERF_COUNT_HW_CACHE_OP_READ }, + { "COUNT_HW_CACHE_OP_WRITE", PERF_COUNT_HW_CACHE_OP_WRITE }, + { "COUNT_HW_CACHE_OP_PREFETCH", PERF_COUNT_HW_CACHE_OP_PREFETCH }, + { "COUNT_HW_CACHE_RESULT_ACCESS", PERF_COUNT_HW_CACHE_RESULT_ACCESS }, + { "COUNT_HW_CACHE_RESULT_MISS", PERF_COUNT_HW_CACHE_RESULT_MISS }, + + { "COUNT_SW_CPU_CLOCK", PERF_COUNT_SW_CPU_CLOCK }, + { "COUNT_SW_TASK_CLOCK", PERF_COUNT_SW_TASK_CLOCK }, + { "COUNT_SW_PAGE_FAULTS", PERF_COUNT_SW_PAGE_FAULTS }, + { "COUNT_SW_CONTEXT_SWITCHES", PERF_COUNT_SW_CONTEXT_SWITCHES }, + { "COUNT_SW_CPU_MIGRATIONS", PERF_COUNT_SW_CPU_MIGRATIONS }, + { "COUNT_SW_PAGE_FAULTS_MIN", PERF_COUNT_SW_PAGE_FAULTS_MIN }, + { "COUNT_SW_PAGE_FAULTS_MAJ", PERF_COUNT_SW_PAGE_FAULTS_MAJ }, + { "COUNT_SW_ALIGNMENT_FAULTS", PERF_COUNT_SW_ALIGNMENT_FAULTS }, + { "COUNT_SW_EMULATION_FAULTS", PERF_COUNT_SW_EMULATION_FAULTS }, + + { "SAMPLE_IP", PERF_SAMPLE_IP }, + { "SAMPLE_TID", PERF_SAMPLE_TID }, + { "SAMPLE_TIME", PERF_SAMPLE_TIME }, + { "SAMPLE_ADDR", PERF_SAMPLE_ADDR }, + { "SAMPLE_READ", PERF_SAMPLE_READ }, + { "SAMPLE_CALLCHAIN", PERF_SAMPLE_CALLCHAIN 
}, + { "SAMPLE_ID", PERF_SAMPLE_ID }, + { "SAMPLE_CPU", PERF_SAMPLE_CPU }, + { "SAMPLE_PERIOD", PERF_SAMPLE_PERIOD }, + { "SAMPLE_STREAM_ID", PERF_SAMPLE_STREAM_ID }, + { "SAMPLE_RAW", PERF_SAMPLE_RAW }, + + { "FORMAT_TOTAL_TIME_ENABLED", PERF_FORMAT_TOTAL_TIME_ENABLED }, + { "FORMAT_TOTAL_TIME_RUNNING", PERF_FORMAT_TOTAL_TIME_RUNNING }, + { "FORMAT_ID", PERF_FORMAT_ID }, + { "FORMAT_GROUP", PERF_FORMAT_GROUP }, + + { "RECORD_MMAP", PERF_RECORD_MMAP }, + { "RECORD_LOST", PERF_RECORD_LOST }, + { "RECORD_COMM", PERF_RECORD_COMM }, + { "RECORD_EXIT", PERF_RECORD_EXIT }, + { "RECORD_THROTTLE", PERF_RECORD_THROTTLE }, + { "RECORD_UNTHROTTLE", PERF_RECORD_UNTHROTTLE }, + { "RECORD_FORK", PERF_RECORD_FORK }, + { "RECORD_READ", PERF_RECORD_READ }, + { "RECORD_SAMPLE", PERF_RECORD_SAMPLE }, + { NULL, }, +}; + +static PyMethodDef perf__methods[] = { + { NULL, NULL } +}; + +PyMODINIT_FUNC initperf(void) +{ + PyObject *obj; + int i; + PyObject *dict, *module = Py_InitModule("perf", perf__methods); + + if (module == NULL || + pyrf_event__setup_types() < 0 || + pyrf_evlist__setup_types() < 0 || + pyrf_evsel__setup_types() < 0 || + pyrf_thread_map__setup_types() < 0 || + pyrf_cpu_map__setup_types() < 0) + return; + + Py_INCREF(&pyrf_evlist__type); + PyModule_AddObject(module, "evlist", (PyObject*)&pyrf_evlist__type); + + Py_INCREF(&pyrf_evsel__type); + PyModule_AddObject(module, "evsel", (PyObject*)&pyrf_evsel__type); + + Py_INCREF(&pyrf_thread_map__type); + PyModule_AddObject(module, "thread_map", (PyObject*)&pyrf_thread_map__type); + + Py_INCREF(&pyrf_cpu_map__type); + PyModule_AddObject(module, "cpu_map", (PyObject*)&pyrf_cpu_map__type); + + dict = PyModule_GetDict(module); + if (dict == NULL) + goto error; + + for (i = 0; perf__constants[i].name != NULL; i++) { + obj = PyInt_FromLong(perf__constants[i].value); + if (obj == NULL) + goto error; + PyDict_SetItemString(dict, perf__constants[i].name, obj); + Py_DECREF(obj); + } + +error: + if (PyErr_Occurred()) + PyErr_SetString(PyExc_ImportError, "perf: Init failed!"); +} diff --git a/tools/perf/util/setup.py b/tools/perf/util/setup.py new file mode 100644 index 000000000000..496d7f432fb8 --- /dev/null +++ b/tools/perf/util/setup.py @@ -0,0 +1,18 @@ +#!/usr/bin/python2 + +from distutils.core import setup, Extension + +perf = Extension('perf', + sources = ['util/python.c', 'util/ctype.c', 'util/evlist.c', + 'util/evsel.c', 'util/cpumap.c', 'util/thread_map.c', + 'util/util.c', 'util/xyarray.c'], + include_dirs = ['util/include']) + +setup(name='perf', + version='0.1', + description='Interface with the Linux profiling infrastructure', + author='Arnaldo Carvalho de Melo', + author_email='acme@redhat.com', + license='GPLv2', + url='http://perf.wiki.kernel.org', + ext_modules=[perf]) From f8a9530939ed87b9a1b1a038b90e355098b679a2 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Sun, 30 Jan 2011 10:46:46 -0200 Subject: [PATCH 099/706] perf evlist: Move evlist methods to evlist.c They were on evsel.c because they came from refactoring existing evsel methods, so, to make reviewing the changes easier, I kept it there, now its a plain move. 
Cc: Frederic Weisbecker Cc: Ingo Molnar Cc: Mike Galbraith Cc: Paul Mackerras Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Tom Zanussi LKML-Reference: Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/evlist.c | 142 ++++++++++++++++++++++++++++++++++++++ tools/perf/util/evlist.h | 7 ++ tools/perf/util/evsel.c | 144 +++------------------------------------ tools/perf/util/evsel.h | 4 -- 4 files changed, 158 insertions(+), 139 deletions(-) diff --git a/tools/perf/util/evlist.c b/tools/perf/util/evlist.c index 917fc18d0bed..dcd59328bb49 100644 --- a/tools/perf/util/evlist.c +++ b/tools/perf/util/evlist.c @@ -1,11 +1,26 @@ +/* + * Copyright (C) 2011, Red Hat Inc, Arnaldo Carvalho de Melo + * + * Parts came from builtin-{top,stat,record}.c, see those files for further + * copyright notes. + * + * Released under the GPL v2. (and only v2, not any later version) + */ #include +#include "cpumap.h" +#include "thread_map.h" #include "evlist.h" #include "evsel.h" #include "util.h" +#include + #include #include +#define FD(e, x, y) (*(int *)xyarray__entry(e->fd, x, y)) +#define SID(e, x, y) xyarray__entry(e->id, x, y) + void perf_evlist__init(struct perf_evlist *evlist) { int i; @@ -88,6 +103,30 @@ void perf_evlist__add_pollfd(struct perf_evlist *evlist, int fd) evlist->nr_fds++; } +static int perf_evlist__id_hash(struct perf_evlist *evlist, struct perf_evsel *evsel, + int cpu, int thread, int fd) +{ + struct perf_sample_id *sid; + u64 read_data[4] = { 0, }; + int hash, id_idx = 1; /* The first entry is the counter value */ + + if (!(evsel->attr.read_format & PERF_FORMAT_ID) || + read(fd, &read_data, sizeof(read_data)) == -1) + return -1; + + if (evsel->attr.read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) + ++id_idx; + if (evsel->attr.read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) + ++id_idx; + + sid = SID(evsel, cpu, thread); + sid->id = read_data[id_idx]; + sid->evsel = evsel; + hash = hash_64(sid->id, PERF_EVLIST__HLIST_BITS); + hlist_add_head(&sid->node, &evlist->heads[hash]); + return 0; +} + struct perf_evsel *perf_evlist__id2evsel(struct perf_evlist *evlist, u64 id) { struct hlist_head *head; @@ -173,3 +212,106 @@ union perf_event *perf_evlist__read_on_cpu(struct perf_evlist *evlist, int cpu) return event; } + +void perf_evlist__munmap(struct perf_evlist *evlist, int ncpus) +{ + int cpu; + + for (cpu = 0; cpu < ncpus; cpu++) { + if (evlist->mmap[cpu].base != NULL) { + munmap(evlist->mmap[cpu].base, evlist->mmap_len); + evlist->mmap[cpu].base = NULL; + } + } +} + +int perf_evlist__alloc_mmap(struct perf_evlist *evlist, int ncpus) +{ + evlist->mmap = zalloc(ncpus * sizeof(struct perf_mmap)); + return evlist->mmap != NULL ? 0 : -ENOMEM; +} + +static int __perf_evlist__mmap(struct perf_evlist *evlist, int cpu, int prot, + int mask, int fd) +{ + evlist->mmap[cpu].prev = 0; + evlist->mmap[cpu].mask = mask; + evlist->mmap[cpu].base = mmap(NULL, evlist->mmap_len, prot, + MAP_SHARED, fd, 0); + if (evlist->mmap[cpu].base == MAP_FAILED) + return -1; + + perf_evlist__add_pollfd(evlist, fd); + return 0; +} + +/** perf_evlist__mmap - Create per cpu maps to receive events + * + * @evlist - list of events + * @cpus - cpu map being monitored + * @threads - threads map being monitored + * @pages - map length in pages + * @overwrite - overwrite older events? 
+ * + * If overwrite is false the user needs to signal event consuption using: + * + * struct perf_mmap *m = &evlist->mmap[cpu]; + * unsigned int head = perf_mmap__read_head(m); + * + * perf_mmap__write_tail(m, head) + */ +int perf_evlist__mmap(struct perf_evlist *evlist, struct cpu_map *cpus, + struct thread_map *threads, int pages, bool overwrite) +{ + unsigned int page_size = sysconf(_SC_PAGE_SIZE); + int mask = pages * page_size - 1, cpu; + struct perf_evsel *first_evsel, *evsel; + int thread, prot = PROT_READ | (overwrite ? 0 : PROT_WRITE); + + if (evlist->mmap == NULL && + perf_evlist__alloc_mmap(evlist, cpus->nr) < 0) + return -ENOMEM; + + if (evlist->pollfd == NULL && + perf_evlist__alloc_pollfd(evlist, cpus->nr, threads->nr) < 0) + return -ENOMEM; + + evlist->overwrite = overwrite; + evlist->mmap_len = (pages + 1) * page_size; + first_evsel = list_entry(evlist->entries.next, struct perf_evsel, node); + + list_for_each_entry(evsel, &evlist->entries, node) { + if ((evsel->attr.read_format & PERF_FORMAT_ID) && + evsel->id == NULL && + perf_evsel__alloc_id(evsel, cpus->nr, threads->nr) < 0) + return -ENOMEM; + + for (cpu = 0; cpu < cpus->nr; cpu++) { + for (thread = 0; thread < threads->nr; thread++) { + int fd = FD(evsel, cpu, thread); + + if (evsel->idx || thread) { + if (ioctl(fd, PERF_EVENT_IOC_SET_OUTPUT, + FD(first_evsel, cpu, 0)) != 0) + goto out_unmap; + } else if (__perf_evlist__mmap(evlist, cpu, prot, mask, fd) < 0) + goto out_unmap; + + if ((evsel->attr.read_format & PERF_FORMAT_ID) && + perf_evlist__id_hash(evlist, evsel, cpu, thread, fd) < 0) + goto out_unmap; + } + } + } + + return 0; + +out_unmap: + for (cpu = 0; cpu < cpus->nr; cpu++) { + if (evlist->mmap[cpu].base != NULL) { + munmap(evlist->mmap[cpu].base, evlist->mmap_len); + evlist->mmap[cpu].base = NULL; + } + } + return -1; +} diff --git a/tools/perf/util/evlist.h b/tools/perf/util/evlist.h index 022ae404b908..85aca6eba16b 100644 --- a/tools/perf/util/evlist.h +++ b/tools/perf/util/evlist.h @@ -6,6 +6,8 @@ #include "event.h" struct pollfd; +struct thread_map; +struct cpu_map; #define PERF_EVLIST__HLIST_BITS 8 #define PERF_EVLIST__HLIST_SIZE (1 << PERF_EVLIST__HLIST_BITS) @@ -39,4 +41,9 @@ struct perf_evsel *perf_evlist__id2evsel(struct perf_evlist *evlist, u64 id); union perf_event *perf_evlist__read_on_cpu(struct perf_evlist *self, int cpu); +int perf_evlist__alloc_mmap(struct perf_evlist *evlist, int ncpus); +int perf_evlist__mmap(struct perf_evlist *evlist, struct cpu_map *cpus, + struct thread_map *threads, int pages, bool overwrite); +void perf_evlist__munmap(struct perf_evlist *evlist, int ncpus); + #endif /* __PERF_EVLIST_H */ diff --git a/tools/perf/util/evsel.c b/tools/perf/util/evsel.c index fddeb08f48a7..2720bc1d578b 100644 --- a/tools/perf/util/evsel.c +++ b/tools/perf/util/evsel.c @@ -1,18 +1,19 @@ +/* + * Copyright (C) 2011, Red Hat Inc, Arnaldo Carvalho de Melo + * + * Parts came from builtin-{top,stat,record}.c, see those files for further + * copyright notes. + * + * Released under the GPL v2. 
(and only v2, not any later version) + */ + #include "evsel.h" #include "evlist.h" -#include "../perf.h" #include "util.h" #include "cpumap.h" #include "thread_map.h" -#include -#include - -#include -#include - #define FD(e, x, y) (*(int *)xyarray__entry(e->fd, x, y)) -#define SID(e, x, y) xyarray__entry(e->id, x, y) void perf_evsel__init(struct perf_evsel *evsel, struct perf_event_attr *attr, int idx) @@ -74,24 +75,6 @@ void perf_evsel__close_fd(struct perf_evsel *evsel, int ncpus, int nthreads) } } -void perf_evlist__munmap(struct perf_evlist *evlist, int ncpus) -{ - int cpu; - - for (cpu = 0; cpu < ncpus; cpu++) { - if (evlist->mmap[cpu].base != NULL) { - munmap(evlist->mmap[cpu].base, evlist->mmap_len); - evlist->mmap[cpu].base = NULL; - } - } -} - -int perf_evlist__alloc_mmap(struct perf_evlist *evlist, int ncpus) -{ - evlist->mmap = zalloc(ncpus * sizeof(struct perf_mmap)); - return evlist->mmap != NULL ? 0 : -ENOMEM; -} - void perf_evsel__exit(struct perf_evsel *evsel) { assert(list_empty(&evsel->node)); @@ -258,115 +241,6 @@ int perf_evsel__open_per_thread(struct perf_evsel *evsel, return __perf_evsel__open(evsel, &empty_cpu_map.map, threads, group, inherit); } -static int __perf_evlist__mmap(struct perf_evlist *evlist, int cpu, int prot, - int mask, int fd) -{ - evlist->mmap[cpu].prev = 0; - evlist->mmap[cpu].mask = mask; - evlist->mmap[cpu].base = mmap(NULL, evlist->mmap_len, prot, - MAP_SHARED, fd, 0); - if (evlist->mmap[cpu].base == MAP_FAILED) - return -1; - - perf_evlist__add_pollfd(evlist, fd); - return 0; -} - -static int perf_evlist__id_hash(struct perf_evlist *evlist, struct perf_evsel *evsel, - int cpu, int thread, int fd) -{ - struct perf_sample_id *sid; - u64 read_data[4] = { 0, }; - int hash, id_idx = 1; /* The first entry is the counter value */ - - if (!(evsel->attr.read_format & PERF_FORMAT_ID) || - read(fd, &read_data, sizeof(read_data)) == -1) - return -1; - - if (evsel->attr.read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) - ++id_idx; - if (evsel->attr.read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) - ++id_idx; - - sid = SID(evsel, cpu, thread); - sid->id = read_data[id_idx]; - sid->evsel = evsel; - hash = hash_64(sid->id, PERF_EVLIST__HLIST_BITS); - hlist_add_head(&sid->node, &evlist->heads[hash]); - return 0; -} - -/** perf_evlist__mmap - Create per cpu maps to receive events - * - * @evlist - list of events - * @cpus - cpu map being monitored - * @threads - threads map being monitored - * @pages - map length in pages - * @overwrite - overwrite older events? - * - * If overwrite is false the user needs to signal event consuption using: - * - * struct perf_mmap *m = &evlist->mmap[cpu]; - * unsigned int head = perf_mmap__read_head(m); - * - * perf_mmap__write_tail(m, head) - */ -int perf_evlist__mmap(struct perf_evlist *evlist, struct cpu_map *cpus, - struct thread_map *threads, int pages, bool overwrite) -{ - unsigned int page_size = sysconf(_SC_PAGE_SIZE); - int mask = pages * page_size - 1, cpu; - struct perf_evsel *first_evsel, *evsel; - int thread, prot = PROT_READ | (overwrite ? 
0 : PROT_WRITE); - - if (evlist->mmap == NULL && - perf_evlist__alloc_mmap(evlist, cpus->nr) < 0) - return -ENOMEM; - - if (evlist->pollfd == NULL && - perf_evlist__alloc_pollfd(evlist, cpus->nr, threads->nr) < 0) - return -ENOMEM; - - evlist->overwrite = overwrite; - evlist->mmap_len = (pages + 1) * page_size; - first_evsel = list_entry(evlist->entries.next, struct perf_evsel, node); - - list_for_each_entry(evsel, &evlist->entries, node) { - if ((evsel->attr.read_format & PERF_FORMAT_ID) && - evsel->id == NULL && - perf_evsel__alloc_id(evsel, cpus->nr, threads->nr) < 0) - return -ENOMEM; - - for (cpu = 0; cpu < cpus->nr; cpu++) { - for (thread = 0; thread < threads->nr; thread++) { - int fd = FD(evsel, cpu, thread); - - if (evsel->idx || thread) { - if (ioctl(fd, PERF_EVENT_IOC_SET_OUTPUT, - FD(first_evsel, cpu, 0)) != 0) - goto out_unmap; - } else if (__perf_evlist__mmap(evlist, cpu, prot, mask, fd) < 0) - goto out_unmap; - - if ((evsel->attr.read_format & PERF_FORMAT_ID) && - perf_evlist__id_hash(evlist, evsel, cpu, thread, fd) < 0) - goto out_unmap; - } - } - } - - return 0; - -out_unmap: - for (cpu = 0; cpu < cpus->nr; cpu++) { - if (evlist->mmap[cpu].base != NULL) { - munmap(evlist->mmap[cpu].base, evlist->mmap_len); - evlist->mmap[cpu].base = NULL; - } - } - return -1; -} - static int perf_event__parse_id_sample(const union perf_event *event, u64 type, struct perf_sample *sample) { diff --git a/tools/perf/util/evsel.h b/tools/perf/util/evsel.h index 7962e7587dea..eecdc3aabc14 100644 --- a/tools/perf/util/evsel.h +++ b/tools/perf/util/evsel.h @@ -60,7 +60,6 @@ void perf_evsel__delete(struct perf_evsel *evsel); int perf_evsel__alloc_fd(struct perf_evsel *evsel, int ncpus, int nthreads); int perf_evsel__alloc_id(struct perf_evsel *evsel, int ncpus, int nthreads); int perf_evsel__alloc_counts(struct perf_evsel *evsel, int ncpus); -int perf_evlist__alloc_mmap(struct perf_evlist *evlist, int ncpus); void perf_evsel__free_fd(struct perf_evsel *evsel); void perf_evsel__free_id(struct perf_evsel *evsel); void perf_evsel__close_fd(struct perf_evsel *evsel, int ncpus, int nthreads); @@ -71,9 +70,6 @@ int perf_evsel__open_per_thread(struct perf_evsel *evsel, struct thread_map *threads, bool group, bool inherit); int perf_evsel__open(struct perf_evsel *evsel, struct cpu_map *cpus, struct thread_map *threads, bool group, bool inherit); -int perf_evlist__mmap(struct perf_evlist *evlist, struct cpu_map *cpus, - struct thread_map *threads, int pages, bool overwrite); -void perf_evlist__munmap(struct perf_evlist *evlist, int ncpus); #define perf_evsel__match(evsel, t, c) \ (evsel->attr.type == PERF_TYPE_##t && \ From 1fb0ef31f428f345a7c3666f8e7444a563edd537 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 31 Jan 2011 08:57:41 +0100 Subject: [PATCH 100/706] genirq: Fix affinity notifier fallout The new code of commit cd7eab44e(genirq: Add IRQ affinity notifiers) references irq_desc.affinity which fails to compile with CONFIG_GENERIC_HARDIRQS_NO_DEPRECATED=y. Use irq_desc.irq_data.affinity instead. 
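In other words, the access pattern changes like this (illustrative sketch; irq_to_desc() as in the genirq code of this era):

	struct irq_desc *desc = irq_to_desc(irq);

	/* deprecated, fails to build with CONFIG_GENERIC_HARDIRQS_NO_DEPRECATED=y: */
	cpumask_copy(cpumask, desc->affinity);

	/* the affinity mask now lives in the embedded struct irq_data: */
	cpumask_copy(cpumask, desc->irq_data.affinity);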
Signed-off-by: Thomas Gleixner Cc: Ben Hutchings --- kernel/irq/manage.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 0587c5ceaed8..538fce2db51c 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -179,7 +179,7 @@ static void irq_affinity_notify(struct work_struct *work) cpumask_copy(cpumask, desc->pending_mask); else #endif - cpumask_copy(cpumask, desc->affinity); + cpumask_copy(cpumask, desc->irq_data.affinity); raw_spin_unlock_irqrestore(&desc->lock, flags); notify->notify(notify, cpumask); From 871cf1e5f2a17702f58539a3af8b18fc8666ad4c Mon Sep 17 00:00:00 2001 From: Torben Hohn Date: Thu, 27 Jan 2011 15:58:55 +0100 Subject: [PATCH 101/706] time: Move do_timer() to kernel/time/timekeeping.c do_timer() is primarily timekeeping related. calc_global_load() is called from do_timer() as well, but that's more for historical reasons. [ tglx: Fixed up the calc_global_load() reject and massaged changelog ] Signed-off-by: Torben Hohn Cc: Peter Zijlstra Cc: johnstul@us.ibm.com Cc: yong.zhang0@gmail.com Cc: hch@infradead.org LKML-Reference: <20110127145855.23248.56933.stgit@localhost> Signed-off-by: Thomas Gleixner --- include/linux/time.h | 1 - kernel/time/timekeeping.c | 14 +++++++++++++- kernel/timer.c | 13 ------------- 3 files changed, 13 insertions(+), 15 deletions(-) diff --git a/include/linux/time.h b/include/linux/time.h index 1e6d3b59238d..86a9c487fdd8 100644 --- a/include/linux/time.h +++ b/include/linux/time.h @@ -166,7 +166,6 @@ extern void monotonic_to_bootbased(struct timespec *ts); extern struct timespec timespec_trunc(struct timespec t, unsigned gran); extern int timekeeping_valid_for_hres(void); extern u64 timekeeping_max_deferment(void); -extern void update_wall_time(void); extern void timekeeping_leap_insert(int leapsecond); struct tms; diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index d27c7562902c..c1a178ca0f50 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -779,7 +779,7 @@ static cycle_t logarithmic_accumulation(cycle_t offset, int shift) * * Called from the timer interrupt, must hold a write on xtime_lock. */ -void update_wall_time(void) +static void update_wall_time(void) { struct clocksource *clock; cycle_t offset; @@ -946,3 +946,15 @@ struct timespec get_monotonic_coarse(void) now.tv_nsec + mono.tv_nsec); return now; } + +/* + * The 64-bit jiffies value is not atomic - you MUST NOT read it + * without sampling the sequence number in xtime_lock. + * jiffies is defined in the linker script... + */ +void do_timer(unsigned long ticks) +{ + jiffies_64 += ticks; + update_wall_time(); + calc_global_load(ticks); +} diff --git a/kernel/timer.c b/kernel/timer.c index 43ca9936f2d0..87f656cc2a55 100644 --- a/kernel/timer.c +++ b/kernel/timer.c @@ -1293,19 +1293,6 @@ void run_local_timers(void) raise_softirq(TIMER_SOFTIRQ); } -/* - * The 64-bit jiffies value is not atomic - you MUST NOT read it - * without sampling the sequence number in xtime_lock. - * jiffies is defined in the linker script... - */ - -void do_timer(unsigned long ticks) -{ - jiffies_64 += ticks; - update_wall_time(); - calc_global_load(ticks); -} From fbad1ea94159a71bc0f68b00e57ae803606af9fb Mon Sep 17 00:00:00 2001 From: Torben Hohn Date: Thu, 27 Jan 2011 15:59:00 +0100 Subject: [PATCH 102/706] time: Move get_jiffies_64 to kernel/time/jiffies.c Move the jiffies access functions to the jiffies clocksource code.
[ tglx: Add missing include ] Signed-off-by: Torben Hohn Cc: Peter Zijlstra Cc: johnstul@us.ibm.com Cc: yong.zhang0@gmail.com Cc: hch@infradead.org LKML-Reference: <20110127145900.23248.73352.stgit@localhost> Signed-off-by: Thomas Gleixner --- kernel/time.c | 17 ----------------- kernel/time/jiffies.c | 18 ++++++++++++++++++ 2 files changed, 18 insertions(+), 17 deletions(-) diff --git a/kernel/time.c b/kernel/time.c index 32174359576f..a31b51220ac6 100644 --- a/kernel/time.c +++ b/kernel/time.c @@ -674,23 +674,6 @@ unsigned long nsecs_to_jiffies(u64 n) #endif } -#if (BITS_PER_LONG < 64) -u64 get_jiffies_64(void) -{ - unsigned long seq; - u64 ret; - - do { - seq = read_seqbegin(&xtime_lock); - ret = jiffies_64; - } while (read_seqretry(&xtime_lock, seq)); - return ret; -} -EXPORT_SYMBOL(get_jiffies_64); -#endif - -EXPORT_SYMBOL(jiffies); - /* * Add two timespec values and do a safety check for overflow. * It's assumed that both values are valid (>= 0) diff --git a/kernel/time/jiffies.c b/kernel/time/jiffies.c index 5404a8456909..2fbc20744797 100644 --- a/kernel/time/jiffies.c +++ b/kernel/time/jiffies.c @@ -22,6 +22,7 @@ ************************************************************************/ #include #include +#include #include /* The Jiffies based clocksource is the lowest common @@ -64,6 +65,23 @@ struct clocksource clocksource_jiffies = { .shift = JIFFIES_SHIFT, }; +#if (BITS_PER_LONG < 64) +u64 get_jiffies_64(void) +{ + unsigned long seq; + u64 ret; + + do { + seq = read_seqbegin(&xtime_lock); + ret = jiffies_64; + } while (read_seqretry(&xtime_lock, seq)); + return ret; +} +EXPORT_SYMBOL(get_jiffies_64); +#endif + +EXPORT_SYMBOL(jiffies); + static int __init init_jiffies_clocksource(void) { return clocksource_register(&clocksource_jiffies); From 48cf76f7104f655bbd48a75c7759dce82c3e1ab6 Mon Sep 17 00:00:00 2001 From: Torben Hohn Date: Thu, 27 Jan 2011 15:59:05 +0100 Subject: [PATCH 103/706] time: Provide get_xtime_and_monotonic_offset() The hrtimer code accesses timekeeping variables under xtime_lock. Provide a sensible accessor function and use it. 
[ tglx: Removed the conditionals, unused variable, fixed codingstyle and massaged changelog ] Signed-off-by: Torben Hohn Cc: Peter Zijlstra Cc: johnstul@us.ibm.com Cc: yong.zhang0@gmail.com Cc: hch@infradead.org LKML-Reference: <20110127145905.23248.30458.stgit@localhost> Signed-off-by: Thomas Gleixner --- include/linux/time.h | 1 + kernel/hrtimer.c | 13 ++----------- kernel/time/timekeeping.c | 16 ++++++++++++++++ 3 files changed, 19 insertions(+), 11 deletions(-) diff --git a/include/linux/time.h b/include/linux/time.h index 86a9c487fdd8..4007a12a1b50 100644 --- a/include/linux/time.h +++ b/include/linux/time.h @@ -127,6 +127,7 @@ struct timespec current_kernel_time(void); struct timespec __current_kernel_time(void); /* does not take xtime_lock */ struct timespec __get_wall_to_monotonic(void); /* does not take xtime_lock */ struct timespec get_monotonic_coarse(void); +void get_xtime_and_monotonic_offset(struct timespec *xtim, struct timespec *wtom); #define CURRENT_TIME (current_kernel_time()) #define CURRENT_TIME_SEC ((struct timespec) { get_seconds(), 0 }) diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c index 0c8d7c048615..57c4d33c9a9d 100644 --- a/kernel/hrtimer.c +++ b/kernel/hrtimer.c @@ -85,13 +85,8 @@ static void hrtimer_get_softirq_time(struct hrtimer_cpu_base *base) { ktime_t xtim, tomono; struct timespec xts, tom; - unsigned long seq; - do { - seq = read_seqbegin(&xtime_lock); - xts = __current_kernel_time(); - tom = __get_wall_to_monotonic(); - } while (read_seqretry(&xtime_lock, seq)); + get_xtime_and_monotonic_offset(&xts, &tom); xtim = timespec_to_ktime(xts); tomono = timespec_to_ktime(tom); @@ -612,15 +607,11 @@ static void retrigger_next_event(void *arg) { struct hrtimer_cpu_base *base; struct timespec realtime_offset, wtm; - unsigned long seq; if (!hrtimer_hres_active()) return; - do { - seq = read_seqbegin(&xtime_lock); - wtm = __get_wall_to_monotonic(); - } while (read_seqretry(&xtime_lock, seq)); + get_xtime_and_monotonic_offset(&realtime_offset, &wtm); set_normalized_timespec(&realtime_offset, -wtm.tv_sec, -wtm.tv_nsec); base = &__get_cpu_var(hrtimer_bases); diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index c1a178ca0f50..c50aaf6cd01d 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -958,3 +958,19 @@ void do_timer(unsigned long ticks) update_wall_time(); calc_global_load(ticks); } + +/** + * get_xtime_and_monotonic_offset() - get xtime and wall_to_monotonic + * @xtim: pointer to timespec to be set with xtime + * @wtom: pointer to timespec to be set with wall_to_monotonic + */ +void get_xtime_and_monotonic_offset(struct timespec *xtim, struct timespec *wtom) +{ + unsigned long seq; + + do { + seq = read_seqbegin(&xtime_lock); + *xtim = xtime; + *wtom = wall_to_monotonic; + } while (read_seqretry(&xtime_lock, seq)); +} From 79ecaf0d15344d78904becf0f25de3fc9b49d430 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 31 Jan 2011 11:07:54 +0100 Subject: [PATCH 104/706] time: Remove unused __get_wall_to_monotonic() No users left. Remove it. 
Signed-off-by: Thomas Gleixner --- include/linux/time.h | 1 - kernel/time/timekeeping.c | 5 ----- 2 files changed, 6 deletions(-) diff --git a/include/linux/time.h b/include/linux/time.h index 4007a12a1b50..ce29c86882b1 100644 --- a/include/linux/time.h +++ b/include/linux/time.h @@ -125,7 +125,6 @@ extern int timekeeping_suspended; unsigned long get_seconds(void); struct timespec current_kernel_time(void); struct timespec __current_kernel_time(void); /* does not take xtime_lock */ -struct timespec __get_wall_to_monotonic(void); /* does not take xtime_lock */ struct timespec get_monotonic_coarse(void); void get_xtime_and_monotonic_offset(struct timespec *xtim, struct timespec *wtom); diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index c50aaf6cd01d..8da35d1b9e16 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -910,11 +910,6 @@ struct timespec __current_kernel_time(void) return xtime; } -struct timespec __get_wall_to_monotonic(void) -{ - return wall_to_monotonic; -} - struct timespec current_kernel_time(void) { struct timespec now; From f0af911a9dec9de702645182c8d269449e24d24b Mon Sep 17 00:00:00 2001 From: Torben Hohn Date: Thu, 27 Jan 2011 15:59:10 +0100 Subject: [PATCH 105/706] time: Provide xtime_update() xtime_update() takes xtime_lock write locked and calls do_timer(). Provided to replace the do_timer() calls in the architecture code. Signed-off-by: Torben Hohn Cc: Peter Zijlstra Cc: johnstul@us.ibm.com Cc: yong.zhang0@gmail.com Cc: hch@infradead.org LKML-Reference: <20110127145910.23248.21379.stgit@localhost> Signed-off-by: Thomas Gleixner --- include/linux/sched.h | 1 + kernel/time/timekeeping.c | 13 +++++++++++++ 2 files changed, 14 insertions(+) diff --git a/include/linux/sched.h b/include/linux/sched.h index d747f948b34e..9d9a0787eed3 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -2050,6 +2050,7 @@ extern void release_uids(struct user_namespace *ns); #include extern void do_timer(unsigned long ticks); +extern void xtime_update(unsigned long ticks); extern int wake_up_state(struct task_struct *tsk, unsigned int state); extern int wake_up_process(struct task_struct *tsk); diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 8da35d1b9e16..02c13a313d15 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -969,3 +969,16 @@ void get_xtime_and_monotonic_offset(struct timespec *xtim, struct timespec *wtom *wtom = wall_to_monotonic; } while (read_seqretry(&xtime_lock, seq)); } + +/** + * xtime_update() - advances the timekeeping infrastructure + * @ticks: number of ticks, that have elapsed since the last call. + * + * Must be called with interrupts disabled. + */ +void xtime_update(unsigned long ticks) +{ + write_seqlock(&xtime_lock); + do_timer(ticks); + write_sequnlock(&xtime_lock); +} From 1340f3e0b29b745a33f431455c3a37f48197bc81 Mon Sep 17 00:00:00 2001 From: Torben Hohn Date: Thu, 27 Jan 2011 15:59:15 +0100 Subject: [PATCH 106/706] alpha: Change do_timer() to xtime_update() xtime_update() takes the xtime_lock itself. timer_interrupt() is only called on the boot cpu. See do_entInt(). So "state" in timer_interrupt does not require protection by xtime_lock. 
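[ ed: patches 106-119 below repeat the same mechanical conversion for every architecture's tick handler. A hedged before/after sketch of the pattern; the handler names are illustrative: ]

	#include <linux/interrupt.h>

	/* Before: each arch open-coded the write lock around do_timer(). */
	static irqreturn_t timer_interrupt_before(int irq, void *dev_id)
	{
		write_seqlock(&xtime_lock);
		do_timer(1);
		write_sequnlock(&xtime_lock);
		return IRQ_HANDLED;
	}

	/* After: xtime_update() owns the locking, the handler shrinks to a
	 * single call, and handlers that never took the lock at all
	 * (clps711x, cris-v10) become correct instead of racy. */
	static irqreturn_t timer_interrupt_after(int irq, void *dev_id)
	{
		xtime_update(1);
		return IRQ_HANDLED;
	}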
Signed-off-by: Torben Hohn Cc: Richard Henderson Cc: Ivan Kokshaysky Cc: Matt Turner Cc: Peter Zijlstra Cc: johnstul@us.ibm.com Cc: hch@infradead.org Cc: yong.zhang0@gmail.com LKML-Reference: <20110127145915.23248.20919.stgit@localhost> Signed-off-by: Thomas Gleixner --- arch/alpha/kernel/time.c | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/arch/alpha/kernel/time.c b/arch/alpha/kernel/time.c index c1f3e7cb82a4..a58e84f1a63b 100644 --- a/arch/alpha/kernel/time.c +++ b/arch/alpha/kernel/time.c @@ -159,7 +159,7 @@ void read_persistent_clock(struct timespec *ts) /* * timer_interrupt() needs to keep up the real-time clock, - * as well as call the "do_timer()" routine every clocktick + * as well as call the "xtime_update()" routine every clocktick */ irqreturn_t timer_interrupt(int irq, void *dev) { @@ -172,8 +172,6 @@ irqreturn_t timer_interrupt(int irq, void *dev) profile_tick(CPU_PROFILING); #endif - write_seqlock(&xtime_lock); - /* * Calculate how many ticks have passed since the last update, * including any previous partial leftover. Save any resulting @@ -187,9 +185,7 @@ irqreturn_t timer_interrupt(int irq, void *dev) nticks = delta >> FIX_SHIFT; if (nticks) - do_timer(nticks); - - write_sequnlock(&xtime_lock); + xtime_update(nticks); if (test_irq_work_pending()) { clear_irq_work_pending(); From 6906e33cc555c390cd091f6f363b783322dfedf6 Mon Sep 17 00:00:00 2001 From: Torben Hohn Date: Thu, 27 Jan 2011 15:59:21 +0100 Subject: [PATCH 107/706] arm: Switch from do_timer() to xtime_update() xtime_update takes the xtime_lock itself. Signed-off-by: Torben Hohn Cc: Russell King Cc: Peter Zijlstra Cc: johnstul@us.ibm.com Cc: hch@infradead.org Cc: yong.zhang0@gmail.com LKML-Reference: <20110127145920.23248.75541.stgit@localhost> Signed-off-by: Thomas Gleixner --- arch/arm/kernel/time.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/arch/arm/kernel/time.c b/arch/arm/kernel/time.c index 3d76bf233734..1ff46cabc7ef 100644 --- a/arch/arm/kernel/time.c +++ b/arch/arm/kernel/time.c @@ -107,9 +107,7 @@ void timer_tick(void) { profile_tick(CPU_PROFILING); do_leds(); - write_seqlock(&xtime_lock); - do_timer(1); - write_sequnlock(&xtime_lock); + xtime_update(1); #ifndef CONFIG_SMP update_process_times(user_mode(get_irq_regs())); #endif From ec2dff2febf19ff2109c2eb3e56d5a969fe399e2 Mon Sep 17 00:00:00 2001 From: Torben Hohn Date: Thu, 27 Jan 2011 15:59:26 +0100 Subject: [PATCH 108/706] arm/mach-clps711x: Switch do_timer() to xtime_update() do_timer() requires holding the xtime_lock, which this code did not do. 
Use the safe version xtime_update() Signed-off-by: Torben Hohn Cc: Russell King Cc: Peter Zijlstra Cc: johnstul@us.ibm.com Cc: hch@infradead.org Cc: yong.zhang0@gmail.com LKML-Reference: <20110127145926.23248.56369.stgit@localhost> Signed-off-by: Thomas Gleixner --- arch/arm/mach-clps711x/include/mach/time.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arm/mach-clps711x/include/mach/time.h b/arch/arm/mach-clps711x/include/mach/time.h index 8fe283ccd1f3..61fef9129c6a 100644 --- a/arch/arm/mach-clps711x/include/mach/time.h +++ b/arch/arm/mach-clps711x/include/mach/time.h @@ -30,7 +30,7 @@ p720t_timer_interrupt(int irq, void *dev_id) { struct pt_regs *regs = get_irq_regs(); do_leds(); - do_timer(1); + xtime_update(1); #ifndef CONFIG_SMP update_process_times(user_mode(regs)); #endif From 4196b892d55caaf2c98da05e80472ca482ca19fe Mon Sep 17 00:00:00 2001 From: Torben Hohn Date: Thu, 27 Jan 2011 15:59:31 +0100 Subject: [PATCH 109/706] blackfin: Switch from do_timer() to xtime_update() xtime_update() takes the xtime_lock itself. Signed-off-by: Torben Hohn Cc: Mike Frysinger Cc: Peter Zijlstra Cc: johnstul@us.ibm.com Cc: hch@infradead.org Cc: yong.zhang0@gmail.com LKML-Reference: <20110127145931.23248.33917.stgit@localhost> Signed-off-by: Thomas Gleixner --- arch/blackfin/kernel/time.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/arch/blackfin/kernel/time.c b/arch/blackfin/kernel/time.c index c9113619029f..8d73724c0092 100644 --- a/arch/blackfin/kernel/time.c +++ b/arch/blackfin/kernel/time.c @@ -114,16 +114,14 @@ u32 arch_gettimeoffset(void) /* * timer_interrupt() needs to keep up the real-time clock, - * as well as call the "do_timer()" routine every clocktick + * as well as call the "xtime_update()" routine every clocktick */ #ifdef CONFIG_CORE_TIMER_IRQ_L1 __attribute__((l1_text)) #endif irqreturn_t timer_interrupt(int irq, void *dummy) { - write_seqlock(&xtime_lock); - do_timer(1); - write_sequnlock(&xtime_lock); + xtime_update(1); #ifdef CONFIG_IPIPE update_root_process_times(get_irq_regs()); From 17588b99183ece563013622afeefd37eb8e68fd3 Mon Sep 17 00:00:00 2001 From: Torben Hohn Date: Thu, 27 Jan 2011 15:59:36 +0100 Subject: [PATCH 110/706] cris: arch-v10: Switch do_timer() to xtime_update() This code failed to take the xtime_lock, which must be held when calling do_timer(). 
Use the safe version xtime_update() Signed-off-by: Torben Hohn Cc: hch@infradead.org Cc: Jesper Nilsson Cc: Peter Zijlstra Cc: johnstul@us.ibm.com Cc: Mikael Starvik Cc: yong.zhang0@gmail.com LKML-Reference: <20110127145936.23248.16192.stgit@localhost> Signed-off-by: Thomas Gleixner --- arch/cris/arch-v10/kernel/time.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/cris/arch-v10/kernel/time.c b/arch/cris/arch-v10/kernel/time.c index 00eb36f8debf..20c85b5dc7d0 100644 --- a/arch/cris/arch-v10/kernel/time.c +++ b/arch/cris/arch-v10/kernel/time.c @@ -140,7 +140,7 @@ stop_watchdog(void) /* * timer_interrupt() needs to keep up the real-time clock, - * as well as call the "do_timer()" routine every clocktick + * as well as call the "xtime_update()" routine every clocktick */ //static unsigned short myjiff; /* used by our debug routine print_timestamp */ @@ -176,7 +176,7 @@ timer_interrupt(int irq, void *dev_id) /* call the real timer interrupt handler */ - do_timer(1); + xtime_update(1); cris_do_profile(regs); /* Save profiling information */ return IRQ_HANDLED; From 36cb07bb8118cb14211ef25c58026f005877c47d Mon Sep 17 00:00:00 2001 From: Torben Hohn Date: Thu, 27 Jan 2011 15:59:41 +0100 Subject: [PATCH 111/706] cris: arch-v32: Switch do_timer() to xtime_update() xtime_update() takes the xtime_lock itself. Signed-off-by: Torben Hohn Cc: hch@infradead.org Cc: Jesper Nilsson Cc: Peter Zijlstra Cc: johnstul@us.ibm.com Cc: Mikael Starvik Cc: yong.zhang0@gmail.com LKML-Reference: <20110127145941.23248.92547.stgit@localhost> Signed-off-by: Thomas Gleixner --- arch/cris/arch-v32/kernel/time.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/arch/cris/arch-v32/kernel/time.c b/arch/cris/arch-v32/kernel/time.c index a545211e999d..bb978ede8985 100644 --- a/arch/cris/arch-v32/kernel/time.c +++ b/arch/cris/arch-v32/kernel/time.c @@ -183,7 +183,7 @@ void handle_watchdog_bite(struct pt_regs *regs) /* * timer_interrupt() needs to keep up the real-time clock, - * as well as call the "do_timer()" routine every clocktick. + * as well as call the "xtime_update()" routine every clocktick. */ extern void cris_do_profile(struct pt_regs *regs); @@ -216,9 +216,7 @@ static inline irqreturn_t timer_interrupt(int irq, void *dev_id) return IRQ_HANDLED; /* Call the real timer interrupt handler */ - write_seqlock(&xtime_lock); - do_timer(1); - write_sequnlock(&xtime_lock); + xtime_update(1); return IRQ_HANDLED; } From 57464bd87f708e75b47312766e3fc8dc3aaf66ad Mon Sep 17 00:00:00 2001 From: Torben Hohn Date: Thu, 27 Jan 2011 15:59:46 +0100 Subject: [PATCH 112/706] frv: Switch do_timer() to xtime_update() __set_LEDS() does not need to be protected by xtime_lock. It's used unprotected in other places.
[ tglx: Removed stale comment ] Signed-off-by: Torben Hohn Cc: hch@infradead.org Cc: Peter Zijlstra Cc: johnstul@us.ibm.com Cc: David Howells Cc: yong.zhang0@gmail.com LKML-Reference: <20110127145946.23248.57952.stgit@localhost> Signed-off-by: Thomas Gleixner --- arch/frv/kernel/time.c | 14 ++------------ 1 file changed, 2 insertions(+), 12 deletions(-) diff --git a/arch/frv/kernel/time.c b/arch/frv/kernel/time.c index 0ddbbae83cb2..b457de496b70 100644 --- a/arch/frv/kernel/time.c +++ b/arch/frv/kernel/time.c @@ -50,21 +50,13 @@ static struct irqaction timer_irq = { /* * timer_interrupt() needs to keep up the real-time clock, - * as well as call the "do_timer()" routine every clocktick + * as well as call the "xtime_update()" routine every clocktick */ static irqreturn_t timer_interrupt(int irq, void *dummy) { profile_tick(CPU_PROFILING); - /* - * Here we are in the timer irq handler. We just have irqs locally - * disabled but we don't know if the timer_bh is running on the other - * CPU. We need to avoid to SMP race with it. NOTE: we don't need - * the irq version of write_lock because as just said we have irq - * locally disabled. -arca - */ - write_seqlock(&xtime_lock); - do_timer(1); + xtime_update(1); #ifdef CONFIG_HEARTBEAT static unsigned short n; @@ -72,8 +64,6 @@ static irqreturn_t timer_interrupt(int irq, void *dummy) __set_LEDS(n); #endif /* CONFIG_HEARTBEAT */ - write_sequnlock(&xtime_lock); - update_process_times(user_mode(get_irq_regs())); return IRQ_HANDLED; From daad8b581e7f5e21a2f79e49d57d4f6a73b26510 Mon Sep 17 00:00:00 2001 From: Torben Hohn Date: Thu, 27 Jan 2011 15:59:51 +0100 Subject: [PATCH 113/706] h8300: Switch do_timer() to xtime_update() xtime_update() takes the xtime_lock itself. Signed-off-by: Torben Hohn Cc: Yoshinori Sato Cc: Peter Zijlstra Cc: johnstul@us.ibm.com Cc: hch@infradead.org Cc: yong.zhang0@gmail.com LKML-Reference: <20110127145951.23248.92727.stgit@localhost> Signed-off-by: Thomas Gleixner --- arch/h8300/kernel/time.c | 4 +--- arch/h8300/kernel/timer/timer8.c | 2 +- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/arch/h8300/kernel/time.c b/arch/h8300/kernel/time.c index 165005aff9df..32263a138aa6 100644 --- a/arch/h8300/kernel/time.c +++ b/arch/h8300/kernel/time.c @@ -35,9 +35,7 @@ void h8300_timer_tick(void) { if (current->pid) profile_tick(CPU_PROFILING); - write_seqlock(&xtime_lock); - do_timer(1); - write_sequnlock(&xtime_lock); + xtime_update(1); update_process_times(user_mode(get_irq_regs())); } diff --git a/arch/h8300/kernel/timer/timer8.c b/arch/h8300/kernel/timer/timer8.c index 3946c0fa8374..7a1533fad47d 100644 --- a/arch/h8300/kernel/timer/timer8.c +++ b/arch/h8300/kernel/timer/timer8.c @@ -61,7 +61,7 @@ /* * timer_interrupt() needs to keep up the real-time clock, - * as well as call the "do_timer()" routine every clocktick + * as well as call the "xtime_update()" routine every clocktick */ static irqreturn_t timer_interrupt(int irq, void *dev_id) From 1aabd67d2e97e6affdf5a7c65f442ac91ace3f85 Mon Sep 17 00:00:00 2001 From: Torben Hohn Date: Thu, 27 Jan 2011 15:59:56 +0100 Subject: [PATCH 114/706] ia64: Switch do_timer() to xtime_update() local_cpu_data->itm_next = new_itm; does not need to be protected by xtime_lock. xtime_update() takes the lock itself. 
Signed-off-by: Torben Hohn Cc: Fenghua Yu Cc: Tony Luck Cc: Peter Zijlstra Cc: johnstul@us.ibm.com Cc: hch@infradead.org Cc: yong.zhang0@gmail.com LKML-Reference: <20110127145956.23248.49107.stgit@localhost> Signed-off-by: Thomas Gleixner --- arch/ia64/kernel/time.c | 19 +++++-------------- arch/ia64/xen/time.c | 13 +++++-------- 2 files changed, 10 insertions(+), 22 deletions(-) diff --git a/arch/ia64/kernel/time.c b/arch/ia64/kernel/time.c index 9702fa92489e..156ad803d5b7 100644 --- a/arch/ia64/kernel/time.c +++ b/arch/ia64/kernel/time.c @@ -190,19 +190,10 @@ timer_interrupt (int irq, void *dev_id) new_itm += local_cpu_data->itm_delta; - if (smp_processor_id() == time_keeper_id) { - /* - * Here we are in the timer irq handler. We have irqs locally - * disabled, but we don't know if the timer_bh is running on - * another CPU. We need to avoid to SMP race by acquiring the - * xtime_lock. - */ - write_seqlock(&xtime_lock); - do_timer(1); - local_cpu_data->itm_next = new_itm; - write_sequnlock(&xtime_lock); - } else - local_cpu_data->itm_next = new_itm; + if (smp_processor_id() == time_keeper_id) + xtime_update(1); + + local_cpu_data->itm_next = new_itm; if (time_after(new_itm, ia64_get_itc())) break; @@ -222,7 +213,7 @@ skip_process_time_accounting: * comfort, we increase the safety margin by * intentionally dropping the next tick(s). We do NOT * update itm.next because that would force us to call - * do_timer() which in turn would let our clock run + * xtime_update() which in turn would let our clock run * too fast (with the potentially devastating effect * of losing monotony of time). */ diff --git a/arch/ia64/xen/time.c b/arch/ia64/xen/time.c index c1c544513e8d..1f8244a78bee 100644 --- a/arch/ia64/xen/time.c +++ b/arch/ia64/xen/time.c @@ -139,14 +139,11 @@ consider_steal_time(unsigned long new_itm) run_posix_cpu_timers(p); delta_itm += local_cpu_data->itm_delta * (stolen + blocked); - if (cpu == time_keeper_id) { - write_seqlock(&xtime_lock); - do_timer(stolen + blocked); - local_cpu_data->itm_next = delta_itm + new_itm; - write_sequnlock(&xtime_lock); - } else { - local_cpu_data->itm_next = delta_itm + new_itm; - } + if (cpu == time_keeper_id) + xtime_update(stolen + blocked); + + local_cpu_data->itm_next = delta_itm + new_itm; + per_cpu(xen_stolen_time, cpu) += NS_PER_TICK * stolen; per_cpu(xen_blocked_time, cpu) += NS_PER_TICK * blocked; } From 7bde2ab7cb51f14c6f6574f0f5a78445f2caed3e Mon Sep 17 00:00:00 2001 From: Torben Hohn Date: Thu, 27 Jan 2011 16:00:01 +0100 Subject: [PATCH 115/706] m32r: Switch from do_timer() to xtime_update() xtime_update() does proper locking. Signed-off-by: Torben Hohn Cc: Peter Zijlstra Cc: johnstul@us.ibm.com Cc: Hirokazu Takata Cc: hch@infradead.org Cc: yong.zhang0@gmail.com LKML-Reference: <20110127150001.23248.68620.stgit@localhost> Signed-off-by: Thomas Gleixner --- arch/m32r/kernel/time.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/arch/m32r/kernel/time.c b/arch/m32r/kernel/time.c index bda86820bffd..84dd04048db9 100644 --- a/arch/m32r/kernel/time.c +++ b/arch/m32r/kernel/time.c @@ -107,15 +107,14 @@ u32 arch_gettimeoffset(void) /* * timer_interrupt() needs to keep up the real-time clock, - * as well as call the "do_timer()" routine every clocktick + * as well as call the "xtime_update()" routine every clocktick */ static irqreturn_t timer_interrupt(int irq, void *dev_id) { #ifndef CONFIG_SMP profile_tick(CPU_PROFILING); #endif - /* XXX FIXME. Uh, the xtime_lock should be held here, no? 
*/ - do_timer(1); + xtime_update(1); #ifndef CONFIG_SMP update_process_times(user_mode(get_irq_regs())); From e53f276beb655c711a5d1f25f800b61aa976e34f Mon Sep 17 00:00:00 2001 From: Torben Hohn Date: Thu, 27 Jan 2011 16:00:06 +0100 Subject: [PATCH 116/706] m68k: Switch do_timer() to xtime_update() xtime_update() properly takes the xtime_lock Signed-off-by: Torben Hohn Cc: Sam Creasey Cc: Peter Zijlstra Cc: johnstul@us.ibm.com Cc: Roman Zippel Cc: hch@infradead.org Cc: yong.zhang0@gmail.com Cc: Geert Uytterhoeven Cc: Greg Ungerer LKML-Reference: <20110127150006.23248.71790.stgit@localhost> Signed-off-by: Thomas Gleixner --- arch/m68k/bvme6000/config.c | 4 ++-- arch/m68k/kernel/time.c | 4 ++-- arch/m68k/mvme147/config.c | 4 ++-- arch/m68k/mvme16x/config.c | 4 ++-- arch/m68k/sun3/sun3ints.c | 2 +- arch/m68knommu/kernel/time.c | 8 ++------ 6 files changed, 11 insertions(+), 15 deletions(-) diff --git a/arch/m68k/bvme6000/config.c b/arch/m68k/bvme6000/config.c index 9fe6fefb5e14..1edd95095cb4 100644 --- a/arch/m68k/bvme6000/config.c +++ b/arch/m68k/bvme6000/config.c @@ -45,8 +45,8 @@ extern int bvme6000_set_clock_mmss (unsigned long); extern void bvme6000_reset (void); void bvme6000_set_vectors (void); -/* Save tick handler routine pointer, will point to do_timer() in - * kernel/sched.c, called via bvme6000_process_int() */ +/* Save tick handler routine pointer, will point to xtime_update() in + * kernel/timer/timekeeping.c, called via bvme6000_process_int() */ static irq_handler_t tick_handler; diff --git a/arch/m68k/kernel/time.c b/arch/m68k/kernel/time.c index 06438dac08ff..18b34ee5db3b 100644 --- a/arch/m68k/kernel/time.c +++ b/arch/m68k/kernel/time.c @@ -37,11 +37,11 @@ static inline int set_rtc_mmss(unsigned long nowtime) /* * timer_interrupt() needs to keep up the real-time clock, - * as well as call the "do_timer()" routine every clocktick + * as well as call the "xtime_update()" routine every clocktick */ static irqreturn_t timer_interrupt(int irq, void *dummy) { - do_timer(1); + xtime_update(1); update_process_times(user_mode(get_irq_regs())); profile_tick(CPU_PROFILING); diff --git a/arch/m68k/mvme147/config.c b/arch/m68k/mvme147/config.c index 100baaa692a1..6cb9c3a9b6c9 100644 --- a/arch/m68k/mvme147/config.c +++ b/arch/m68k/mvme147/config.c @@ -46,8 +46,8 @@ extern void mvme147_reset (void); static int bcd2int (unsigned char b); -/* Save tick handler routine pointer, will point to do_timer() in - * kernel/sched.c, called via mvme147_process_int() */ +/* Save tick handler routine pointer, will point to xtime_update() in + * kernel/time/timekeeping.c, called via mvme147_process_int() */ irq_handler_t tick_handler; diff --git a/arch/m68k/mvme16x/config.c b/arch/m68k/mvme16x/config.c index 11edf61cc2c4..0b28e2621653 100644 --- a/arch/m68k/mvme16x/config.c +++ b/arch/m68k/mvme16x/config.c @@ -51,8 +51,8 @@ extern void mvme16x_reset (void); int bcd2int (unsigned char b); -/* Save tick handler routine pointer, will point to do_timer() in - * kernel/sched.c, called via mvme16x_process_int() */ +/* Save tick handler routine pointer, will point to xtime_update() in + * kernel/time/timekeeping.c, called via mvme16x_process_int() */ static irq_handler_t tick_handler; diff --git a/arch/m68k/sun3/sun3ints.c b/arch/m68k/sun3/sun3ints.c index 2d9e21bd313a..6464ad3ae3e6 100644 --- a/arch/m68k/sun3/sun3ints.c +++ b/arch/m68k/sun3/sun3ints.c @@ -66,7 +66,7 @@ static irqreturn_t sun3_int5(int irq, void *dev_id) #ifdef CONFIG_SUN3 intersil_clear(); #endif - do_timer(1); + xtime_update(1); 
update_process_times(user_mode(get_irq_regs())); if (!(kstat_cpu(0).irqs[irq] % 20)) sun3_leds(led_pattern[(kstat_cpu(0).irqs[irq] % 160) / 20]); diff --git a/arch/m68knommu/kernel/time.c b/arch/m68knommu/kernel/time.c index d6ac2a43453c..6623909f70e6 100644 --- a/arch/m68knommu/kernel/time.c +++ b/arch/m68knommu/kernel/time.c @@ -36,7 +36,7 @@ static inline int set_rtc_mmss(unsigned long nowtime) #ifndef CONFIG_GENERIC_CLOCKEVENTS /* * timer_interrupt() needs to keep up the real-time clock, - * as well as call the "do_timer()" routine every clocktick + * as well as call the "xtime_update()" routine every clocktick */ irqreturn_t arch_timer_interrupt(int irq, void *dummy) { @@ -44,11 +44,7 @@ irqreturn_t arch_timer_interrupt(int irq, void *dummy) if (current->pid) profile_tick(CPU_PROFILING); - write_seqlock(&xtime_lock); - - do_timer(1); - - write_sequnlock(&xtime_lock); + xtime_update(1); update_process_times(user_mode(get_irq_regs())); From bb1dfc1cf6c51ca42f7c05029a6f06df9092a0fc Mon Sep 17 00:00:00 2001 From: Torben Hohn Date: Thu, 27 Jan 2011 16:00:17 +0100 Subject: [PATCH 117/706] parisc: Switch do_timer() to xtime_update() xtime_update() takes the xtime_lock itself. Signed-off-by: Torben Hohn Cc: hch@infradead.org Cc: Peter Zijlstra Cc: johnstul@us.ibm.com Cc: Helge Deller Cc: "James E.J. Bottomley" Cc: Kyle McMartin Cc: yong.zhang0@gmail.com LKML-Reference: <20110127150017.23248.22559.stgit@localhost> Signed-off-by: Thomas Gleixner --- arch/parisc/kernel/time.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/arch/parisc/kernel/time.c b/arch/parisc/kernel/time.c index 05511ccb61d2..45b7389d77aa 100644 --- a/arch/parisc/kernel/time.c +++ b/arch/parisc/kernel/time.c @@ -162,11 +162,8 @@ irqreturn_t __irq_entry timer_interrupt(int irq, void *dev_id) update_process_times(user_mode(get_irq_regs())); } - if (cpu == 0) { - write_seqlock(&xtime_lock); - do_timer(ticks_elapsed); - write_sequnlock(&xtime_lock); - } + if (cpu == 0) + xtime_update(ticks_elapsed); return IRQ_HANDLED; } From 4ea1b72551d052a3993ef72ce06ecf5bdd125859 Mon Sep 17 00:00:00 2001 From: Torben Hohn Date: Thu, 27 Jan 2011 16:00:22 +0100 Subject: [PATCH 118/706] sparc: Switch do_timer() to xtime_update() xtime_update() takes the xtime_lock itself. pcic_clear_clock_irq() and clear_clock_irq do not need to be protected by xtime_lock. Signed-off-by: Torben Hohn Acked-by: David S. 
Miller Cc: Peter Zijlstra Cc: johnstul@us.ibm.com Cc: hch@infradead.org Cc: yong.zhang0@gmail.com LKML-Reference: <20110127150022.23248.80369.stgit@localhost> Signed-off-by: Thomas Gleixner --- arch/sparc/kernel/pcic.c | 4 +--- arch/sparc/kernel/time_32.c | 9 ++------- 2 files changed, 3 insertions(+), 10 deletions(-) diff --git a/arch/sparc/kernel/pcic.c b/arch/sparc/kernel/pcic.c index aeaa09a3c655..2cdc131b50ac 100644 --- a/arch/sparc/kernel/pcic.c +++ b/arch/sparc/kernel/pcic.c @@ -700,10 +700,8 @@ static void pcic_clear_clock_irq(void) static irqreturn_t pcic_timer_handler (int irq, void *h) { - write_seqlock(&xtime_lock); /* Dummy, to show that we remember */ pcic_clear_clock_irq(); - do_timer(1); - write_sequnlock(&xtime_lock); + xtime_update(1); #ifndef CONFIG_SMP update_process_times(user_mode(get_irq_regs())); #endif diff --git a/arch/sparc/kernel/time_32.c b/arch/sparc/kernel/time_32.c index 9c743b1886ff..4211bfc9bcad 100644 --- a/arch/sparc/kernel/time_32.c +++ b/arch/sparc/kernel/time_32.c @@ -85,7 +85,7 @@ int update_persistent_clock(struct timespec now) /* * timer_interrupt() needs to keep up the real-time clock, - * as well as call the "do_timer()" routine every clocktick + * as well as call the "xtime_update()" routine every clocktick */ #define TICK_SIZE (tick_nsec / 1000) @@ -96,14 +96,9 @@ static irqreturn_t timer_interrupt(int dummy, void *dev_id) profile_tick(CPU_PROFILING); #endif - /* Protect counter clear so that do_gettimeoffset works */ - write_seqlock(&xtime_lock); - clear_clock_irq(); - do_timer(1); - - write_sequnlock(&xtime_lock); + xtime_update(1); #ifndef CONFIG_SMP update_process_times(user_mode(get_irq_regs())); From d12b0e24c56c6fb2398609f26858e5278d688840 Mon Sep 17 00:00:00 2001 From: Torben Hohn Date: Thu, 27 Jan 2011 16:00:27 +0100 Subject: [PATCH 119/706] xtensa: Switch do_timer() to xtime_update() xtime_update() takes the xtime_lock itself. set_linux_timer() does not need to be protected by xtime_lock. [ tglx: This code is broken on SMP anyway. ] Signed-off-by: Torben Hohn Cc: Chris Zankel Cc: Peter Zijlstra Cc: johnstul@us.ibm.com Cc: hch@infradead.org Cc: yong.zhang0@gmail.com LKML-Reference: <20110127150027.23248.61798.stgit@localhost> Signed-off-by: Thomas Gleixner --- arch/xtensa/kernel/time.c | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/arch/xtensa/kernel/time.c b/arch/xtensa/kernel/time.c index 19df764f6399..f3e5eb43f71c 100644 --- a/arch/xtensa/kernel/time.c +++ b/arch/xtensa/kernel/time.c @@ -96,16 +96,12 @@ again: update_process_times(user_mode(get_irq_regs())); #endif - write_seqlock(&xtime_lock); - - do_timer(1); /* Linux handler in kernel/timer.c */ + xtime_update(1); /* Linux handler in kernel/time/timekeeping */ /* Note that writing CCOMPARE clears the interrupt. */ next += CCOUNT_PER_JIFFY; set_linux_timer(next); - - write_sequnlock(&xtime_lock); } /* Allow platform to do something useful (Wdog). */ From 7e2ed097538c57ff5268e9a6bced7c0b885809c8 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Sun, 30 Jan 2011 11:59:43 -0200 Subject: [PATCH 120/706] perf evlist: Store pointer to the cpu and thread maps So that we don't have to pass it around to the several methods that need it, simplifying usage. There is one case where we don't have the thread/cpu map in advance: the parsing routines used by top, stat and record, where we have to wait until all options are parsed to know whether a cpu or thread list was passed, and only then create those maps (see the usage sketch below).
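[ ed: a sketch of what a tool's setup path looks like once the maps live in the evlist, using only functions this patch introduces or reworks; monitor() and its error handling are illustrative, not from the patch: ]

	#include "util/evlist.h"

	static int monitor(pid_t target_pid, pid_t target_tid, const char *cpu_list)
	{
		/* The maps are owned by the evlist, not file-scope globals. */
		struct perf_evlist *evlist = perf_evlist__new(NULL, NULL);
		int err = -1;

		if (evlist == NULL)
			return -1;

		/* Builds evlist->threads and evlist->cpus from the options,
		 * using a dummy cpu map when a specific tid is monitored. */
		if (perf_evlist__create_maps(evlist, target_pid, target_tid, cpu_list) < 0)
			goto out_delete;

		/* Note: no more cpu/thread arguments here. */
		if (perf_evlist__mmap(evlist, 128, false) < 0)
			goto out_delete_maps;

		/* ... consume events via perf_evlist__read_on_cpu() ... */
		err = 0;

		perf_evlist__munmap(evlist);
	out_delete_maps:
		perf_evlist__delete_maps(evlist);
	out_delete:
		perf_evlist__delete(evlist);
		return err;
	}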
For that case, consolidate the cpu and thread map creation into perf_evlist__create_maps(), moving it out of the code in top and record. Also provide a perf_evlist__set_maps() for cases where multiple evlists share maps, where maps that represent CPU sockets, for instance, are crafted from topology information, or where only subsets of threads in a particular application are to be monitored. This provides more granularity in specifying which cpus and threads to monitor. Cc: Frederic Weisbecker Cc: Ingo Molnar Cc: Mike Galbraith Cc: Paul Mackerras Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Tom Zanussi LKML-Reference: Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-record.c | 44 +++++++++--------------- tools/perf/builtin-stat.c | 45 ++++++++++++------------- tools/perf/builtin-test.c | 6 ++-- tools/perf/builtin-top.c | 47 +++++++++++--------------- tools/perf/python/twatch.py | 4 +-- tools/perf/util/evlist.c | 67 +++++++++++++++++++++++++++---------- tools/perf/util/evlist.h | 29 ++++++++++++---- tools/perf/util/python.c | 25 ++++++++------ 8 files changed, 148 insertions(+), 119 deletions(-) diff --git a/tools/perf/builtin-record.c b/tools/perf/builtin-record.c index edc3555098c8..07f8d6d852c2 100644 --- a/tools/perf/builtin-record.c +++ b/tools/perf/builtin-record.c @@ -42,7 +42,6 @@ static u64 user_interval = ULLONG_MAX; static u64 default_interval = 0; static u64 sample_type; -static struct cpu_map *cpus; static unsigned int page_size; static unsigned int mmap_pages = 128; static unsigned int user_freq = UINT_MAX; @@ -58,7 +57,6 @@ static bool sample_id_all_avail = true; static bool system_wide = false; static pid_t target_pid = -1; static pid_t target_tid = -1; -static struct thread_map *threads; static pid_t child_pid = -1; static bool no_inherit = false; static enum write_mode_t write_mode = WRITE_FORCE; @@ -189,7 +187,7 @@ static void create_counter(struct perf_evsel *evsel, int cpu) int thread_index; int ret; - for (thread_index = 0; thread_index < threads->nr; thread_index++) { + for (thread_index = 0; thread_index < evsel_list->threads->nr; thread_index++) { h_attr = get_header_attr(attr, evsel->idx); if (h_attr == NULL) die("nomem\n"); @@ -317,7 +315,8 @@ static void open_counters(struct perf_evlist *evlist) retry_sample_id: attr->sample_id_all = sample_id_all_avail ?
1 : 0; try_again: - if (perf_evsel__open(pos, cpus, threads, group, !no_inherit) < 0) { + if (perf_evsel__open(pos, evlist->cpus, evlist->threads, group, + !no_inherit) < 0) { int err = errno; if (err == EPERM || err == EACCES) @@ -368,10 +367,10 @@ try_again: } } - if (perf_evlist__mmap(evlist, cpus, threads, mmap_pages, false) < 0) + if (perf_evlist__mmap(evlist, mmap_pages, false) < 0) die("failed to mmap with %d (%s)\n", errno, strerror(errno)); - for (cpu = 0; cpu < cpus->nr; ++cpu) { + for (cpu = 0; cpu < evsel_list->cpus->nr; ++cpu) { list_for_each_entry(pos, &evlist->entries, node) create_counter(pos, cpu); } @@ -450,7 +449,7 @@ static void mmap_read_all(void) { int i; - for (i = 0; i < cpus->nr; i++) { + for (i = 0; i < evsel_list->cpus->nr; i++) { if (evsel_list->mmap[i].base) mmap_read(&evsel_list->mmap[i]); } @@ -584,7 +583,7 @@ static int __cmd_record(int argc, const char **argv) } if (!system_wide && target_tid == -1 && target_pid == -1) - threads->map[0] = child_pid; + evsel_list->threads->map[0] = child_pid; close(child_ready_pipe[1]); close(go_pipe[0]); @@ -718,12 +717,12 @@ static int __cmd_record(int argc, const char **argv) } if (done) { - for (i = 0; i < cpus->nr; i++) { + for (i = 0; i < evsel_list->cpus->nr; i++) { struct perf_evsel *pos; list_for_each_entry(pos, &evsel_list->entries, node) { for (thread = 0; - thread < threads->nr; + thread < evsel_list->threads->nr; thread++) ioctl(FD(pos, i, thread), PERF_EVENT_IOC_DISABLE); @@ -816,7 +815,7 @@ int cmd_record(int argc, const char **argv, const char *prefix __used) int err = -ENOMEM; struct perf_evsel *pos; - evsel_list = perf_evlist__new(); + evsel_list = perf_evlist__new(NULL, NULL); if (evsel_list == NULL) return -ENOMEM; @@ -850,28 +849,19 @@ int cmd_record(int argc, const char **argv, const char *prefix __used) if (target_pid != -1) target_tid = target_pid; - threads = thread_map__new(target_pid, target_tid); - if (threads == NULL) { - pr_err("Problems finding threads of monitor\n"); - usage_with_options(record_usage, record_options); - } - - if (target_tid != -1) - cpus = cpu_map__dummy_new(); - else - cpus = cpu_map__new(cpu_list); - - if (cpus == NULL) + if (perf_evlist__create_maps(evsel_list, target_pid, + target_tid, cpu_list) < 0) usage_with_options(record_usage, record_options); list_for_each_entry(pos, &evsel_list->entries, node) { - if (perf_evsel__alloc_fd(pos, cpus->nr, threads->nr) < 0) + if (perf_evsel__alloc_fd(pos, evsel_list->cpus->nr, + evsel_list->threads->nr) < 0) goto out_free_fd; if (perf_header__push_event(pos->attr.config, event_name(pos))) goto out_free_fd; } - if (perf_evlist__alloc_pollfd(evsel_list, cpus->nr, threads->nr) < 0) + if (perf_evlist__alloc_pollfd(evsel_list) < 0) goto out_free_fd; if (user_interval != ULLONG_MAX) @@ -893,10 +883,8 @@ int cmd_record(int argc, const char **argv, const char *prefix __used) } err = __cmd_record(argc, argv); - out_free_fd: - thread_map__delete(threads); - threads = NULL; + perf_evlist__delete_maps(evsel_list); out_symbol_exit: symbol__exit(); return err; diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c index 8906adfdbd8e..e0f95755361b 100644 --- a/tools/perf/builtin-stat.c +++ b/tools/perf/builtin-stat.c @@ -76,7 +76,6 @@ static struct perf_event_attr default_attrs[] = { struct perf_evlist *evsel_list; static bool system_wide = false; -static struct cpu_map *cpus; static int run_idx = 0; static int run_count = 1; @@ -85,7 +84,6 @@ static bool scale = true; static bool no_aggr = false; static pid_t target_pid = -1; static 
pid_t target_tid = -1; -static struct thread_map *threads; static pid_t child_pid = -1; static bool null_run = false; static bool big_num = true; @@ -170,7 +168,7 @@ static int create_perf_stat_counter(struct perf_evsel *evsel) PERF_FORMAT_TOTAL_TIME_RUNNING; if (system_wide) - return perf_evsel__open_per_cpu(evsel, cpus, false, false); + return perf_evsel__open_per_cpu(evsel, evsel_list->cpus, false, false); attr->inherit = !no_inherit; if (target_pid == -1 && target_tid == -1) { @@ -178,7 +176,7 @@ static int create_perf_stat_counter(struct perf_evsel *evsel) attr->enable_on_exec = 1; } - return perf_evsel__open_per_thread(evsel, threads, false, false); + return perf_evsel__open_per_thread(evsel, evsel_list->threads, false, false); } /* @@ -203,7 +201,8 @@ static int read_counter_aggr(struct perf_evsel *counter) u64 *count = counter->counts->aggr.values; int i; - if (__perf_evsel__read(counter, cpus->nr, threads->nr, scale) < 0) + if (__perf_evsel__read(counter, evsel_list->cpus->nr, + evsel_list->threads->nr, scale) < 0) return -1; for (i = 0; i < 3; i++) @@ -236,7 +235,7 @@ static int read_counter(struct perf_evsel *counter) u64 *count; int cpu; - for (cpu = 0; cpu < cpus->nr; cpu++) { + for (cpu = 0; cpu < evsel_list->cpus->nr; cpu++) { if (__perf_evsel__read_on_cpu(counter, cpu, 0, scale) < 0) return -1; @@ -301,7 +300,7 @@ static int run_perf_stat(int argc __used, const char **argv) } if (target_tid == -1 && target_pid == -1 && !system_wide) - threads->map[0] = child_pid; + evsel_list->threads->map[0] = child_pid; /* * Wait for the child to be ready to exec. @@ -353,12 +352,13 @@ static int run_perf_stat(int argc __used, const char **argv) if (no_aggr) { list_for_each_entry(counter, &evsel_list->entries, node) { read_counter(counter); - perf_evsel__close_fd(counter, cpus->nr, 1); + perf_evsel__close_fd(counter, evsel_list->cpus->nr, 1); } } else { list_for_each_entry(counter, &evsel_list->entries, node) { read_counter_aggr(counter); - perf_evsel__close_fd(counter, cpus->nr, threads->nr); + perf_evsel__close_fd(counter, evsel_list->cpus->nr, + evsel_list->threads->nr); } } @@ -386,7 +386,7 @@ static void nsec_printout(int cpu, struct perf_evsel *evsel, double avg) if (no_aggr) sprintf(cpustr, "CPU%*d%s", csv_output ? 0 : -4, - cpus->map[cpu], csv_sep); + evsel_list->cpus->map[cpu], csv_sep); fprintf(stderr, fmt, cpustr, msecs, csv_sep, event_name(evsel)); @@ -414,7 +414,7 @@ static void abs_printout(int cpu, struct perf_evsel *evsel, double avg) if (no_aggr) sprintf(cpustr, "CPU%*d%s", csv_output ? 0 : -4, - cpus->map[cpu], csv_sep); + evsel_list->cpus->map[cpu], csv_sep); else cpu = 0; @@ -500,14 +500,14 @@ static void print_counter(struct perf_evsel *counter) u64 ena, run, val; int cpu; - for (cpu = 0; cpu < cpus->nr; cpu++) { + for (cpu = 0; cpu < evsel_list->cpus->nr; cpu++) { val = counter->counts->cpu[cpu].val; ena = counter->counts->cpu[cpu].ena; run = counter->counts->cpu[cpu].run; if (run == 0 || ena == 0) { fprintf(stderr, "CPU%*d%s%*s%s%-24s", csv_output ? 0 : -4, - cpus->map[cpu], csv_sep, + evsel_list->cpus->map[cpu], csv_sep, csv_output ? 
0 : 18, "", csv_sep, event_name(counter)); @@ -652,7 +652,7 @@ int cmd_stat(int argc, const char **argv, const char *prefix __used) setlocale(LC_ALL, ""); - evsel_list = perf_evlist__new(); + evsel_list = perf_evlist__new(NULL, NULL); if (evsel_list == NULL) return -ENOMEM; @@ -701,18 +701,18 @@ int cmd_stat(int argc, const char **argv, const char *prefix __used) if (target_pid != -1) target_tid = target_pid; - threads = thread_map__new(target_pid, target_tid); - if (threads == NULL) { + evsel_list->threads = thread_map__new(target_pid, target_tid); + if (evsel_list->threads == NULL) { pr_err("Problems finding threads of monitor\n"); usage_with_options(stat_usage, options); } if (system_wide) - cpus = cpu_map__new(cpu_list); + evsel_list->cpus = cpu_map__new(cpu_list); else - cpus = cpu_map__dummy_new(); + evsel_list->cpus = cpu_map__dummy_new(); - if (cpus == NULL) { + if (evsel_list->cpus == NULL) { perror("failed to parse CPUs map"); usage_with_options(stat_usage, options); return -1; @@ -720,8 +720,8 @@ int cmd_stat(int argc, const char **argv, const char *prefix __used) list_for_each_entry(pos, &evsel_list->entries, node) { if (perf_evsel__alloc_stat_priv(pos) < 0 || - perf_evsel__alloc_counts(pos, cpus->nr) < 0 || - perf_evsel__alloc_fd(pos, cpus->nr, threads->nr) < 0) + perf_evsel__alloc_counts(pos, evsel_list->cpus->nr) < 0 || + perf_evsel__alloc_fd(pos, evsel_list->cpus->nr, evsel_list->threads->nr) < 0) goto out_free_fd; } @@ -750,7 +750,6 @@ out_free_fd: perf_evsel__free_stat_priv(pos); perf_evlist__delete(evsel_list); out: - thread_map__delete(threads); - threads = NULL; + perf_evlist__delete_maps(evsel_list); return status; } diff --git a/tools/perf/builtin-test.c b/tools/perf/builtin-test.c index 845b9bd54ed4..1b2106c58f66 100644 --- a/tools/perf/builtin-test.c +++ b/tools/perf/builtin-test.c @@ -509,7 +509,7 @@ static int test__basic_mmap(void) goto out_free_cpus; } - evlist = perf_evlist__new(); + evlist = perf_evlist__new(cpus, threads); if (evlist == NULL) { pr_debug("perf_evlist__new\n"); goto out_free_cpus; @@ -537,7 +537,7 @@ static int test__basic_mmap(void) } } - if (perf_evlist__mmap(evlist, cpus, threads, 128, true) < 0) { + if (perf_evlist__mmap(evlist, 128, true) < 0) { pr_debug("failed to mmap events: %d (%s)\n", errno, strerror(errno)); goto out_close_fd; @@ -579,7 +579,7 @@ static int test__basic_mmap(void) err = 0; out_munmap: - perf_evlist__munmap(evlist, 1); + perf_evlist__munmap(evlist); out_close_fd: for (i = 0; i < nsyscalls; ++i) perf_evsel__close_fd(evsels[i], 1, threads->nr); diff --git a/tools/perf/builtin-top.c b/tools/perf/builtin-top.c index 2f4d1f244be1..599036b06730 100644 --- a/tools/perf/builtin-top.c +++ b/tools/perf/builtin-top.c @@ -73,9 +73,7 @@ static int print_entries; static int target_pid = -1; static int target_tid = -1; -static struct thread_map *threads; static bool inherit = false; -static struct cpu_map *cpus; static int realtime_prio = 0; static bool group = false; static unsigned int page_size; @@ -567,12 +565,13 @@ static void print_sym_table(struct perf_session *session) printf(" (all"); if (cpu_list) - printf(", CPU%s: %s)\n", cpus->nr > 1 ? "s" : "", cpu_list); + printf(", CPU%s: %s)\n", evsel_list->cpus->nr > 1 ? "s" : "", cpu_list); else { if (target_tid != -1) printf(")\n"); else - printf(", %d CPU%s)\n", cpus->nr, cpus->nr > 1 ? "s" : ""); + printf(", %d CPU%s)\n", evsel_list->cpus->nr, + evsel_list->cpus->nr > 1 ? 
"s" : ""); } printf("%-*.*s\n", win_width, win_width, graph_dotted_line); @@ -1124,7 +1123,7 @@ static void perf_session__mmap_read(struct perf_session *self) { int i; - for (i = 0; i < cpus->nr; i++) + for (i = 0; i < evsel_list->cpus->nr; i++) perf_session__mmap_read_cpu(self, i); } @@ -1150,7 +1149,8 @@ static void start_counters(struct perf_evlist *evlist) attr->mmap = 1; try_again: - if (perf_evsel__open(counter, cpus, threads, group, inherit) < 0) { + if (perf_evsel__open(counter, evsel_list->cpus, + evsel_list->threads, group, inherit) < 0) { int err = errno; if (err == EPERM || err == EACCES) @@ -1181,7 +1181,7 @@ try_again: } } - if (perf_evlist__mmap(evlist, cpus, threads, mmap_pages, false) < 0) + if (perf_evlist__mmap(evlist, mmap_pages, false) < 0) die("failed to mmap with %d (%s)\n", errno, strerror(errno)); } @@ -1296,7 +1296,7 @@ int cmd_top(int argc, const char **argv, const char *prefix __used) struct perf_evsel *pos; int status = -ENOMEM; - evsel_list = perf_evlist__new(); + evsel_list = perf_evlist__new(NULL, NULL); if (evsel_list == NULL) return -ENOMEM; @@ -1306,15 +1306,6 @@ int cmd_top(int argc, const char **argv, const char *prefix __used) if (argc) usage_with_options(top_usage, options); - if (target_pid != -1) - target_tid = target_pid; - - threads = thread_map__new(target_pid, target_tid); - if (threads == NULL) { - pr_err("Problems finding threads of monitor\n"); - usage_with_options(top_usage, options); - } - /* CPU and PID are mutually exclusive */ if (target_tid > 0 && cpu_list) { printf("WARNING: PID switch overriding CPU\n"); @@ -1322,6 +1313,13 @@ int cmd_top(int argc, const char **argv, const char *prefix __used) cpu_list = NULL; } + if (target_pid != -1) + target_tid = target_pid; + + if (perf_evlist__create_maps(evsel_list, target_pid, + target_tid, cpu_list) < 0) + usage_with_options(top_usage, options); + if (!evsel_list->nr_entries && perf_evlist__add_default(evsel_list) < 0) { pr_err("Not enough memory for event selector list\n"); @@ -1343,16 +1341,9 @@ int cmd_top(int argc, const char **argv, const char *prefix __used) exit(EXIT_FAILURE); } - if (target_tid != -1) - cpus = cpu_map__dummy_new(); - else - cpus = cpu_map__new(cpu_list); - - if (cpus == NULL) - usage_with_options(top_usage, options); - list_for_each_entry(pos, &evsel_list->entries, node) { - if (perf_evsel__alloc_fd(pos, cpus->nr, threads->nr) < 0) + if (perf_evsel__alloc_fd(pos, evsel_list->cpus->nr, + evsel_list->threads->nr) < 0) goto out_free_fd; /* * Fill in the ones not specifically initialized via -c: @@ -1363,8 +1354,8 @@ int cmd_top(int argc, const char **argv, const char *prefix __used) pos->attr.sample_period = default_interval; } - if (perf_evlist__alloc_pollfd(evsel_list, cpus->nr, threads->nr) < 0 || - perf_evlist__alloc_mmap(evsel_list, cpus->nr) < 0) + if (perf_evlist__alloc_pollfd(evsel_list) < 0 || + perf_evlist__alloc_mmap(evsel_list) < 0) goto out_free_fd; sym_evsel = list_entry(evsel_list->entries.next, struct perf_evsel, node); diff --git a/tools/perf/python/twatch.py b/tools/perf/python/twatch.py index 5e9f3b7b7ee8..df638c438a9f 100755 --- a/tools/perf/python/twatch.py +++ b/tools/perf/python/twatch.py @@ -23,9 +23,9 @@ def main(): sample_id_all = 1, sample_type = perf.SAMPLE_PERIOD | perf.SAMPLE_TID | perf.SAMPLE_CPU | perf.SAMPLE_TID) evsel.open(cpus = cpus, threads = threads); - evlist = perf.evlist() + evlist = perf.evlist(cpus, threads) evlist.add(evsel) - evlist.mmap(cpus = cpus, threads = threads) + evlist.mmap() while True: evlist.poll(timeout = -1) for 
cpu in cpus: diff --git a/tools/perf/util/evlist.c b/tools/perf/util/evlist.c index dcd59328bb49..95b21fece2ce 100644 --- a/tools/perf/util/evlist.c +++ b/tools/perf/util/evlist.c @@ -21,21 +21,24 @@ #define FD(e, x, y) (*(int *)xyarray__entry(e->fd, x, y)) #define SID(e, x, y) xyarray__entry(e->id, x, y) -void perf_evlist__init(struct perf_evlist *evlist) +void perf_evlist__init(struct perf_evlist *evlist, struct cpu_map *cpus, + struct thread_map *threads) { int i; for (i = 0; i < PERF_EVLIST__HLIST_SIZE; ++i) INIT_HLIST_HEAD(&evlist->heads[i]); INIT_LIST_HEAD(&evlist->entries); + perf_evlist__set_maps(evlist, cpus, threads); } -struct perf_evlist *perf_evlist__new(void) +struct perf_evlist *perf_evlist__new(struct cpu_map *cpus, + struct thread_map *threads) { struct perf_evlist *evlist = zalloc(sizeof(*evlist)); if (evlist != NULL) - perf_evlist__init(evlist); + perf_evlist__init(evlist, cpus, threads); return evlist; } @@ -88,9 +91,9 @@ int perf_evlist__add_default(struct perf_evlist *evlist) return 0; } -int perf_evlist__alloc_pollfd(struct perf_evlist *evlist, int ncpus, int nthreads) +int perf_evlist__alloc_pollfd(struct perf_evlist *evlist) { - int nfds = ncpus * nthreads * evlist->nr_entries; + int nfds = evlist->cpus->nr * evlist->threads->nr * evlist->nr_entries; evlist->pollfd = malloc(sizeof(struct pollfd) * nfds); return evlist->pollfd != NULL ? 0 : -ENOMEM; } @@ -213,11 +216,11 @@ union perf_event *perf_evlist__read_on_cpu(struct perf_evlist *evlist, int cpu) return event; } -void perf_evlist__munmap(struct perf_evlist *evlist, int ncpus) +void perf_evlist__munmap(struct perf_evlist *evlist) { int cpu; - for (cpu = 0; cpu < ncpus; cpu++) { + for (cpu = 0; cpu < evlist->cpus->nr; cpu++) { if (evlist->mmap[cpu].base != NULL) { munmap(evlist->mmap[cpu].base, evlist->mmap_len); evlist->mmap[cpu].base = NULL; @@ -225,9 +228,9 @@ void perf_evlist__munmap(struct perf_evlist *evlist, int ncpus) } } -int perf_evlist__alloc_mmap(struct perf_evlist *evlist, int ncpus) +int perf_evlist__alloc_mmap(struct perf_evlist *evlist) { - evlist->mmap = zalloc(ncpus * sizeof(struct perf_mmap)); + evlist->mmap = zalloc(evlist->cpus->nr * sizeof(struct perf_mmap)); return evlist->mmap != NULL ? 0 : -ENOMEM; } @@ -248,8 +251,6 @@ static int __perf_evlist__mmap(struct perf_evlist *evlist, int cpu, int prot, /** perf_evlist__mmap - Create per cpu maps to receive events * * @evlist - list of events - * @cpus - cpu map being monitored - * @threads - threads map being monitored * @pages - map length in pages * @overwrite - overwrite older events? * @@ -259,21 +260,22 @@ static int __perf_evlist__mmap(struct perf_evlist *evlist, int cpu, int prot, * unsigned int head = perf_mmap__read_head(m); * * perf_mmap__write_tail(m, head) + * + * Using perf_evlist__read_on_cpu does this automatically. */ -int perf_evlist__mmap(struct perf_evlist *evlist, struct cpu_map *cpus, - struct thread_map *threads, int pages, bool overwrite) +int perf_evlist__mmap(struct perf_evlist *evlist, int pages, bool overwrite) { unsigned int page_size = sysconf(_SC_PAGE_SIZE); int mask = pages * page_size - 1, cpu; struct perf_evsel *first_evsel, *evsel; + const struct cpu_map *cpus = evlist->cpus; + const struct thread_map *threads = evlist->threads; int thread, prot = PROT_READ | (overwrite ? 
0 : PROT_WRITE); - if (evlist->mmap == NULL && - perf_evlist__alloc_mmap(evlist, cpus->nr) < 0) + if (evlist->mmap == NULL && perf_evlist__alloc_mmap(evlist) < 0) return -ENOMEM; - if (evlist->pollfd == NULL && - perf_evlist__alloc_pollfd(evlist, cpus->nr, threads->nr) < 0) + if (evlist->pollfd == NULL && perf_evlist__alloc_pollfd(evlist) < 0) return -ENOMEM; evlist->overwrite = overwrite; @@ -315,3 +317,34 @@ out_unmap: } return -1; } + +int perf_evlist__create_maps(struct perf_evlist *evlist, pid_t target_pid, + pid_t target_tid, const char *cpu_list) +{ + evlist->threads = thread_map__new(target_pid, target_tid); + + if (evlist->threads == NULL) + return -1; + + if (target_tid != -1) + evlist->cpus = cpu_map__dummy_new(); + else + evlist->cpus = cpu_map__new(cpu_list); + + if (evlist->cpus == NULL) + goto out_delete_threads; + + return 0; + +out_delete_threads: + thread_map__delete(evlist->threads); + return -1; +} + +void perf_evlist__delete_maps(struct perf_evlist *evlist) +{ + cpu_map__delete(evlist->cpus); + thread_map__delete(evlist->threads); + evlist->cpus = NULL; + evlist->threads = NULL; +} diff --git a/tools/perf/util/evlist.h b/tools/perf/util/evlist.h index 85aca6eba16b..c9884056097c 100644 --- a/tools/perf/util/evlist.h +++ b/tools/perf/util/evlist.h @@ -22,28 +22,43 @@ struct perf_evlist { union perf_event event_copy; struct perf_mmap *mmap; struct pollfd *pollfd; + struct thread_map *threads; + struct cpu_map *cpus; }; struct perf_evsel; -struct perf_evlist *perf_evlist__new(void); -void perf_evlist__init(struct perf_evlist *evlist); +struct perf_evlist *perf_evlist__new(struct cpu_map *cpus, + struct thread_map *threads); +void perf_evlist__init(struct perf_evlist *evlist, struct cpu_map *cpus, + struct thread_map *threads); void perf_evlist__exit(struct perf_evlist *evlist); void perf_evlist__delete(struct perf_evlist *evlist); void perf_evlist__add(struct perf_evlist *evlist, struct perf_evsel *entry); int perf_evlist__add_default(struct perf_evlist *evlist); -int perf_evlist__alloc_pollfd(struct perf_evlist *evlist, int ncpus, int nthreads); +int perf_evlist__alloc_pollfd(struct perf_evlist *evlist); void perf_evlist__add_pollfd(struct perf_evlist *evlist, int fd); struct perf_evsel *perf_evlist__id2evsel(struct perf_evlist *evlist, u64 id); union perf_event *perf_evlist__read_on_cpu(struct perf_evlist *self, int cpu); -int perf_evlist__alloc_mmap(struct perf_evlist *evlist, int ncpus); -int perf_evlist__mmap(struct perf_evlist *evlist, struct cpu_map *cpus, - struct thread_map *threads, int pages, bool overwrite); -void perf_evlist__munmap(struct perf_evlist *evlist, int ncpus); +int perf_evlist__alloc_mmap(struct perf_evlist *evlist); +int perf_evlist__mmap(struct perf_evlist *evlist, int pages, bool overwrite); +void perf_evlist__munmap(struct perf_evlist *evlist); + +static inline void perf_evlist__set_maps(struct perf_evlist *evlist, + struct cpu_map *cpus, + struct thread_map *threads) +{ + evlist->cpus = cpus; + evlist->threads = threads; +} + +int perf_evlist__create_maps(struct perf_evlist *evlist, pid_t target_pid, + pid_t target_tid, const char *cpu_list); +void perf_evlist__delete_maps(struct perf_evlist *evlist); #endif /* __PERF_EVLIST_H */ diff --git a/tools/perf/util/python.c b/tools/perf/util/python.c index 88d47895a79c..d2d52175362e 100644 --- a/tools/perf/util/python.c +++ b/tools/perf/util/python.c @@ -553,7 +553,16 @@ struct pyrf_evlist { static int pyrf_evlist__init(struct pyrf_evlist *pevlist, PyObject *args, PyObject *kwargs) { - 
perf_evlist__init(&pevlist->evlist); + PyObject *pcpus = NULL, *pthreads = NULL; + struct cpu_map *cpus; + struct thread_map *threads; + + if (!PyArg_ParseTuple(args, "OO", &pcpus, &pthreads)) + return -1; + + threads = ((struct pyrf_thread_map *)pthreads)->threads; + cpus = ((struct pyrf_cpu_map *)pcpus)->cpus; + perf_evlist__init(&pevlist->evlist, cpus, threads); return 0; } @@ -567,21 +576,15 @@ static PyObject *pyrf_evlist__mmap(struct pyrf_evlist *pevlist, PyObject *args, PyObject *kwargs) { struct perf_evlist *evlist = &pevlist->evlist; - PyObject *pcpus = NULL, *pthreads = NULL; - struct cpu_map *cpus = NULL; - struct thread_map *threads = NULL; - static char *kwlist[] = {"cpus", "threads", "pages", "overwrite", + static char *kwlist[] = {"pages", "overwrite", NULL, NULL}; int pages = 128, overwrite = false; - if (!PyArg_ParseTupleAndKeywords(args, kwargs, "OO|ii", kwlist, - &pcpus, &pthreads, &pages, &overwrite)) + if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|ii", kwlist, + &pages, &overwrite)) return NULL; - threads = ((struct pyrf_thread_map *)pthreads)->threads; - cpus = ((struct pyrf_cpu_map *)pcpus)->cpus; - - if (perf_evlist__mmap(evlist, cpus, threads, pages, overwrite) < 0) { + if (perf_evlist__mmap(evlist, pages, overwrite) < 0) { PyErr_SetFromErrno(PyExc_OSError); return NULL; } From 8c3e10eb1968877d6a1957b7e790c6ce01bd56fc Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Mon, 31 Jan 2011 14:50:39 -0200 Subject: [PATCH 121/706] perf top: Move display agnostic routines to util/top.[ch] Paving the way for a slang browser a la 'perf report --tui'. Cc: Frederic Weisbecker Cc: Ingo Molnar Cc: Mike Galbraith Cc: Paul Mackerras Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Tom Zanussi LKML-Reference: Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/Makefile | 2 + tools/perf/builtin-top.c | 482 +++++++++++---------------------------- tools/perf/util/top.c | 212 +++++++++++++++++ tools/perf/util/top.h | 67 ++++++ 4 files changed, 418 insertions(+), 345 deletions(-) create mode 100644 tools/perf/util/top.c create mode 100644 tools/perf/util/top.h diff --git a/tools/perf/Makefile b/tools/perf/Makefile index 36ff73c0af94..edc660e0bc2f 100644 --- a/tools/perf/Makefile +++ b/tools/perf/Makefile @@ -437,6 +437,7 @@ LIB_H += util/probe-finder.h LIB_H += util/probe-event.h LIB_H += util/pstack.h LIB_H += util/cpumap.h +LIB_H += util/top.h LIB_H += $(ARCH_INCLUDE) LIB_OBJS += $(OUTPUT)util/abspath.o @@ -464,6 +465,7 @@ LIB_OBJS += $(OUTPUT)util/strbuf.o LIB_OBJS += $(OUTPUT)util/string.o LIB_OBJS += $(OUTPUT)util/strlist.o LIB_OBJS += $(OUTPUT)util/strfilter.o +LIB_OBJS += $(OUTPUT)util/top.o LIB_OBJS += $(OUTPUT)util/usage.o LIB_OBJS += $(OUTPUT)util/wrapper.o LIB_OBJS += $(OUTPUT)util/sigchain.o diff --git a/tools/perf/builtin-top.c b/tools/perf/builtin-top.c index 599036b06730..3c9ba943aa48 100644 --- a/tools/perf/builtin-top.c +++ b/tools/perf/builtin-top.c @@ -27,6 +27,7 @@ #include "util/symbol.h" #include "util/thread.h" #include "util/thread_map.h" +#include "util/top.h" #include "util/util.h" #include #include "util/parse-options.h" @@ -47,7 +48,6 @@ #include #include #include -#include #include #include @@ -62,75 +62,35 @@ #define FD(e, x, y) (*(int *)xyarray__entry(e->fd, x, y)) -struct perf_evlist *evsel_list; +static struct perf_top top = { + .count_filter = 5, + .delay_secs = 2, + .display_weighted = -1, + .target_pid = -1, + .target_tid = -1, + .active_symbols = LIST_HEAD_INIT(top.active_symbols), + .active_symbols_lock = PTHREAD_MUTEX_INITIALIZER, + 
.freq = 1000, /* 1 KHz */ +}; static bool system_wide = false; static int default_interval = 0; -static int count_filter = 5; -static int print_entries; - -static int target_pid = -1; -static int target_tid = -1; static bool inherit = false; static int realtime_prio = 0; static bool group = false; static unsigned int page_size; static unsigned int mmap_pages = 128; -static int freq = 1000; /* 1 KHz */ -static int delay_secs = 2; -static bool zero = false; static bool dump_symtab = false; -static bool hide_kernel_symbols = false; -static bool hide_user_symbols = false; static struct winsize winsize; -/* - * Source - */ - -struct source_line { - u64 eip; - unsigned long count[MAX_COUNTERS]; - char *line; - struct source_line *next; -}; - static const char *sym_filter = NULL; struct sym_entry *sym_filter_entry = NULL; struct sym_entry *sym_filter_entry_sched = NULL; static int sym_pcnt_filter = 5; -static int sym_counter = 0; -static struct perf_evsel *sym_evsel = NULL; -static int display_weighted = -1; -static const char *cpu_list; - -/* - * Symbols - */ - -struct sym_entry_source { - struct source_line *source; - struct source_line *lines; - struct source_line **lines_tail; - pthread_mutex_t lock; -}; - -struct sym_entry { - struct rb_node rb_node; - struct list_head node; - unsigned long snap_count; - double weight; - int skip; - u16 name_len; - u8 origin; - struct map *map; - struct sym_entry_source *src; - unsigned long count[0]; -}; /* * Source functions @@ -165,10 +125,10 @@ void get_term_dimensions(struct winsize *ws) static void update_print_entries(struct winsize *ws) { - print_entries = ws->ws_row; + top.print_entries = ws->ws_row; - if (print_entries > 9) - print_entries -= 9; + if (top.print_entries > 9) + top.print_entries -= 9; } static void sig_winch_handler(int sig __used) @@ -269,7 +229,7 @@ static void __zero_source_counters(struct sym_entry *syme) line = syme->src->lines; while (line) { - for (i = 0; i < evsel_list->nr_entries; i++) + for (i = 0; i < top.evlist->nr_entries; i++) line->count[i] = 0; line = line->next; } @@ -331,9 +291,9 @@ static void show_lines(struct source_line *queue, int count, int total) line = queue; for (i = 0; i < count; i++) { - float pcnt = 100.0*(float)line->count[sym_counter]/(float)total; + float pcnt = 100.0*(float)line->count[top.sym_counter]/(float)total; - printf("%8li %4.1f%%\t%s\n", line->count[sym_counter], pcnt, line->line); + printf("%8li %4.1f%%\t%s\n", line->count[top.sym_counter], pcnt, line->line); line = line->next; } } @@ -358,13 +318,13 @@ static void show_details(struct sym_entry *syme) return; symbol = sym_entry__symbol(syme); - printf("Showing %s for %s\n", event_name(sym_evsel), symbol->name); + printf("Showing %s for %s\n", event_name(top.sym_evsel), symbol->name); printf(" Events Pcnt (>=%d%%)\n", sym_pcnt_filter); pthread_mutex_lock(&syme->src->lock); line = syme->src->source; while (line) { - total += line->count[sym_counter]; + total += line->count[top.sym_counter]; line = line->next; } @@ -376,10 +336,10 @@ static void show_details(struct sym_entry *syme) line_queue = line; line_queue_count++; - if (line->count[sym_counter]) - pcnt = 100.0 * line->count[sym_counter] / (float)total; + if (line->count[top.sym_counter]) + pcnt = 100.0 * line->count[top.sym_counter] / (float)total; if (pcnt >= (float)sym_pcnt_filter) { - if (displayed <= print_entries) + if (displayed <= top.print_entries) show_lines(line_queue, line_queue_count, total); else more++; displayed += line_queue_count; @@ -390,7 +350,7 @@ static void 
show_details(struct sym_entry *syme) line_queue_count--; } - line->count[sym_counter] = zero ? 0 : line->count[sym_counter] * 7 / 8; + line->count[top.sym_counter] = top.zero ? 0 : line->count[top.sym_counter] * 7 / 8; line = line->next; } pthread_mutex_unlock(&syme->src->lock); @@ -398,181 +358,30 @@ static void show_details(struct sym_entry *syme) printf("%d lines not displayed, maybe increase display entries [e]\n", more); } -/* - * Symbols will be added here in perf_event__process_sample and will get out - * after decayed. - */ -static LIST_HEAD(active_symbols); -static pthread_mutex_t active_symbols_lock = PTHREAD_MUTEX_INITIALIZER; - -/* - * Ordering weight: count-1 * count-2 * ... / count-n - */ -static double sym_weight(const struct sym_entry *sym) -{ - double weight = sym->snap_count; - int counter; - - if (!display_weighted) - return weight; - - for (counter = 1; counter < evsel_list->nr_entries - 1; counter++) - weight *= sym->count[counter]; - - weight /= (sym->count[counter] + 1); - - return weight; -} - -static long samples; -static long kernel_samples, us_samples; -static long exact_samples; -static long guest_us_samples, guest_kernel_samples; static const char CONSOLE_CLEAR[] = ""; static void __list_insert_active_sym(struct sym_entry *syme) { - list_add(&syme->node, &active_symbols); -} - -static void list_remove_active_sym(struct sym_entry *syme) -{ - pthread_mutex_lock(&active_symbols_lock); - list_del_init(&syme->node); - pthread_mutex_unlock(&active_symbols_lock); -} - -static void rb_insert_active_sym(struct rb_root *tree, struct sym_entry *se) -{ - struct rb_node **p = &tree->rb_node; - struct rb_node *parent = NULL; - struct sym_entry *iter; - - while (*p != NULL) { - parent = *p; - iter = rb_entry(parent, struct sym_entry, rb_node); - - if (se->weight > iter->weight) - p = &(*p)->rb_left; - else - p = &(*p)->rb_right; - } - - rb_link_node(&se->rb_node, parent, p); - rb_insert_color(&se->rb_node, tree); + list_add(&syme->node, &top.active_symbols); } static void print_sym_table(struct perf_session *session) { - int printed = 0, j; - struct perf_evsel *counter; - int snap = !display_weighted ? 
sym_counter : 0; - float samples_per_sec = samples/delay_secs; - float ksamples_per_sec = kernel_samples/delay_secs; - float us_samples_per_sec = (us_samples)/delay_secs; - float guest_kernel_samples_per_sec = (guest_kernel_samples)/delay_secs; - float guest_us_samples_per_sec = (guest_us_samples)/delay_secs; - float esamples_percent = (100.0*exact_samples)/samples; - float sum_ksamples = 0.0; - struct sym_entry *syme, *n; - struct rb_root tmp = RB_ROOT; + char bf[160]; + int printed = 0; struct rb_node *nd; - int sym_width = 0, dso_width = 0, dso_short_width = 0; + struct sym_entry *syme; + struct rb_root tmp = RB_ROOT; const int win_width = winsize.ws_col - 1; - - samples = us_samples = kernel_samples = exact_samples = 0; - guest_kernel_samples = guest_us_samples = 0; - - /* Sort the active symbols */ - pthread_mutex_lock(&active_symbols_lock); - syme = list_entry(active_symbols.next, struct sym_entry, node); - pthread_mutex_unlock(&active_symbols_lock); - - list_for_each_entry_safe_from(syme, n, &active_symbols, node) { - syme->snap_count = syme->count[snap]; - if (syme->snap_count != 0) { - - if ((hide_user_symbols && - syme->origin == PERF_RECORD_MISC_USER) || - (hide_kernel_symbols && - syme->origin == PERF_RECORD_MISC_KERNEL)) { - list_remove_active_sym(syme); - continue; - } - syme->weight = sym_weight(syme); - rb_insert_active_sym(&tmp, syme); - sum_ksamples += syme->snap_count; - - for (j = 0; j < evsel_list->nr_entries; j++) - syme->count[j] = zero ? 0 : syme->count[j] * 7 / 8; - } else - list_remove_active_sym(syme); - } + int sym_width, dso_width, dso_short_width; + float sum_ksamples = perf_top__decay_samples(&top, &tmp); puts(CONSOLE_CLEAR); - if (!perf_guest) { - printf(" PerfTop:%8.0f irqs/sec kernel:%4.1f%%" - " exact: %4.1f%% [", - samples_per_sec, - 100.0 - (100.0 * ((samples_per_sec - ksamples_per_sec) / - samples_per_sec)), - esamples_percent); - } else { - printf(" PerfTop:%8.0f irqs/sec kernel:%4.1f%% us:%4.1f%%" - " guest kernel:%4.1f%% guest us:%4.1f%%" - " exact: %4.1f%% [", - samples_per_sec, - 100.0 - (100.0 * ((samples_per_sec-ksamples_per_sec) / - samples_per_sec)), - 100.0 - (100.0 * ((samples_per_sec-us_samples_per_sec) / - samples_per_sec)), - 100.0 - (100.0 * ((samples_per_sec - - guest_kernel_samples_per_sec) / - samples_per_sec)), - 100.0 - (100.0 * ((samples_per_sec - - guest_us_samples_per_sec) / - samples_per_sec)), - esamples_percent); - } + perf_top__header_snprintf(&top, bf, sizeof(bf)); + printf("%s\n", bf); - if (evsel_list->nr_entries == 1 || !display_weighted) { - struct perf_evsel *first; - first = list_entry(evsel_list->entries.next, struct perf_evsel, node); - printf("%" PRIu64, (uint64_t)first->attr.sample_period); - if (freq) - printf("Hz "); - else - printf(" "); - } - - if (!display_weighted) - printf("%s", event_name(sym_evsel)); - else list_for_each_entry(counter, &evsel_list->entries, node) { - if (counter->idx) - printf("/"); - - printf("%s", event_name(counter)); - } - - printf( "], "); - - if (target_pid != -1) - printf(" (target_pid: %d", target_pid); - else if (target_tid != -1) - printf(" (target_tid: %d", target_tid); - else - printf(" (all"); - - if (cpu_list) - printf(", CPU%s: %s)\n", evsel_list->cpus->nr > 1 ? "s" : "", cpu_list); - else { - if (target_tid != -1) - printf(")\n"); - else - printf(", %d CPU%s)\n", evsel_list->cpus->nr, - evsel_list->cpus->nr > 1 ? 
"s" : ""); - } + perf_top__reset_sample_counters(&top); printf("%-*.*s\n", win_width, win_width, graph_dotted_line); @@ -587,26 +396,8 @@ static void print_sym_table(struct perf_session *session) return; } - /* - * Find the longest symbol name that will be displayed - */ - for (nd = rb_first(&tmp); nd; nd = rb_next(nd)) { - syme = rb_entry(nd, struct sym_entry, rb_node); - if (++printed > print_entries || - (int)syme->snap_count < count_filter) - continue; - - if (syme->map->dso->long_name_len > dso_width) - dso_width = syme->map->dso->long_name_len; - - if (syme->map->dso->short_name_len > dso_short_width) - dso_short_width = syme->map->dso->short_name_len; - - if (syme->name_len > sym_width) - sym_width = syme->name_len; - } - - printed = 0; + perf_top__find_widths(&top, &tmp, &dso_width, &dso_short_width, + &sym_width); if (sym_width + dso_width > winsize.ws_col - 29) { dso_width = dso_short_width; @@ -614,7 +405,7 @@ static void print_sym_table(struct perf_session *session) sym_width = winsize.ws_col - dso_width - 29; } putchar('\n'); - if (evsel_list->nr_entries == 1) + if (top.evlist->nr_entries == 1) printf(" samples pcnt"); else printf(" weight samples pcnt"); @@ -623,7 +414,7 @@ static void print_sym_table(struct perf_session *session) printf(" RIP "); printf(" %-*.*s DSO\n", sym_width, sym_width, "function"); printf(" %s _______ _____", - evsel_list->nr_entries == 1 ? " " : "______"); + top.evlist->nr_entries == 1 ? " " : "______"); if (verbose) printf(" ________________"); printf(" %-*.*s", sym_width, sym_width, graph_line); @@ -636,13 +427,14 @@ static void print_sym_table(struct perf_session *session) syme = rb_entry(nd, struct sym_entry, rb_node); sym = sym_entry__symbol(syme); - if (++printed > print_entries || (int)syme->snap_count < count_filter) + if (++printed > top.print_entries || + (int)syme->snap_count < top.count_filter) continue; pcnt = 100.0 - (100.0 * ((sum_ksamples - syme->snap_count) / sum_ksamples)); - if (evsel_list->nr_entries == 1 || !display_weighted) + if (top.evlist->nr_entries == 1 || !top.display_weighted) printf("%20.2f ", syme->weight); else printf("%9.1f %10ld ", syme->weight, syme->snap_count); @@ -715,11 +507,11 @@ static void prompt_symbol(struct sym_entry **target, const char *msg) if (p) *p = 0; - pthread_mutex_lock(&active_symbols_lock); - syme = list_entry(active_symbols.next, struct sym_entry, node); - pthread_mutex_unlock(&active_symbols_lock); + pthread_mutex_lock(&top.active_symbols_lock); + syme = list_entry(top.active_symbols.next, struct sym_entry, node); + pthread_mutex_unlock(&top.active_symbols_lock); - list_for_each_entry_safe_from(syme, n, &active_symbols, node) { + list_for_each_entry_safe_from(syme, n, &top.active_symbols, node) { struct symbol *sym = sym_entry__symbol(syme); if (!strcmp(buf, sym->name)) { @@ -749,28 +541,28 @@ static void print_mapped_keys(void) } fprintf(stdout, "\nMapped keys:\n"); - fprintf(stdout, "\t[d] display refresh delay. \t(%d)\n", delay_secs); - fprintf(stdout, "\t[e] display entries (lines). \t(%d)\n", print_entries); + fprintf(stdout, "\t[d] display refresh delay. \t(%d)\n", top.delay_secs); + fprintf(stdout, "\t[e] display entries (lines). \t(%d)\n", top.print_entries); - if (evsel_list->nr_entries > 1) - fprintf(stdout, "\t[E] active event counter. \t(%s)\n", event_name(sym_evsel)); + if (top.evlist->nr_entries > 1) + fprintf(stdout, "\t[E] active event counter. \t(%s)\n", event_name(top.sym_evsel)); - fprintf(stdout, "\t[f] profile display filter (count). 
\t(%d)\n", count_filter); + fprintf(stdout, "\t[f] profile display filter (count). \t(%d)\n", top.count_filter); fprintf(stdout, "\t[F] annotate display filter (percent). \t(%d%%)\n", sym_pcnt_filter); fprintf(stdout, "\t[s] annotate symbol. \t(%s)\n", name?: "NULL"); fprintf(stdout, "\t[S] stop annotation.\n"); - if (evsel_list->nr_entries > 1) - fprintf(stdout, "\t[w] toggle display weighted/count[E]r. \t(%d)\n", display_weighted ? 1 : 0); + if (top.evlist->nr_entries > 1) + fprintf(stdout, "\t[w] toggle display weighted/count[E]r. \t(%d)\n", top.display_weighted ? 1 : 0); fprintf(stdout, "\t[K] hide kernel_symbols symbols. \t(%s)\n", - hide_kernel_symbols ? "yes" : "no"); + top.hide_kernel_symbols ? "yes" : "no"); fprintf(stdout, "\t[U] hide user symbols. \t(%s)\n", - hide_user_symbols ? "yes" : "no"); - fprintf(stdout, "\t[z] toggle sample zeroing. \t(%d)\n", zero ? 1 : 0); + top.hide_user_symbols ? "yes" : "no"); + fprintf(stdout, "\t[z] toggle sample zeroing. \t(%d)\n", top.zero ? 1 : 0); fprintf(stdout, "\t[qQ] quit.\n"); } @@ -791,7 +583,7 @@ static int key_mapped(int c) return 1; case 'E': case 'w': - return evsel_list->nr_entries > 1 ? 1 : 0; + return top.evlist->nr_entries > 1 ? 1 : 0; default: break; } @@ -826,47 +618,47 @@ static void handle_keypress(struct perf_session *session, int c) switch (c) { case 'd': - prompt_integer(&delay_secs, "Enter display delay"); - if (delay_secs < 1) - delay_secs = 1; + prompt_integer(&top.delay_secs, "Enter display delay"); + if (top.delay_secs < 1) + top.delay_secs = 1; break; case 'e': - prompt_integer(&print_entries, "Enter display entries (lines)"); - if (print_entries == 0) { + prompt_integer(&top.print_entries, "Enter display entries (lines)"); + if (top.print_entries == 0) { sig_winch_handler(SIGWINCH); signal(SIGWINCH, sig_winch_handler); } else signal(SIGWINCH, SIG_DFL); break; case 'E': - if (evsel_list->nr_entries > 1) { + if (top.evlist->nr_entries > 1) { fprintf(stderr, "\nAvailable events:"); - list_for_each_entry(sym_evsel, &evsel_list->entries, node) - fprintf(stderr, "\n\t%d %s", sym_evsel->idx, event_name(sym_evsel)); + list_for_each_entry(top.sym_evsel, &top.evlist->entries, node) + fprintf(stderr, "\n\t%d %s", top.sym_evsel->idx, event_name(top.sym_evsel)); - prompt_integer(&sym_counter, "Enter details event counter"); + prompt_integer(&top.sym_counter, "Enter details event counter"); - if (sym_counter >= evsel_list->nr_entries) { - sym_evsel = list_entry(evsel_list->entries.next, struct perf_evsel, node); - sym_counter = 0; - fprintf(stderr, "Sorry, no such event, using %s.\n", event_name(sym_evsel)); + if (top.sym_counter >= top.evlist->nr_entries) { + top.sym_evsel = list_entry(top.evlist->entries.next, struct perf_evsel, node); + top.sym_counter = 0; + fprintf(stderr, "Sorry, no such event, using %s.\n", event_name(top.sym_evsel)); sleep(1); break; } - list_for_each_entry(sym_evsel, &evsel_list->entries, node) - if (sym_evsel->idx == sym_counter) + list_for_each_entry(top.sym_evsel, &top.evlist->entries, node) + if (top.sym_evsel->idx == top.sym_counter) break; - } else sym_counter = 0; + } else top.sym_counter = 0; break; case 'f': - prompt_integer(&count_filter, "Enter display event count filter"); + prompt_integer(&top.count_filter, "Enter display event count filter"); break; case 'F': prompt_percent(&sym_pcnt_filter, "Enter details display event filter (percent)"); break; case 'K': - hide_kernel_symbols = !hide_kernel_symbols; + top.hide_kernel_symbols = !top.hide_kernel_symbols; break; case 'q': case 'Q': @@ 
-890,13 +682,13 @@ static void handle_keypress(struct perf_session *session, int c) } break; case 'U': - hide_user_symbols = !hide_user_symbols; + top.hide_user_symbols = !top.hide_user_symbols; break; case 'w': - display_weighted = ~display_weighted; + top.display_weighted = ~top.display_weighted; break; case 'z': - zero = !zero; + top.zero = !top.zero; break; default: break; @@ -917,7 +709,7 @@ static void *display_thread(void *arg __used) tc.c_cc[VTIME] = 0; repeat: - delay_msecs = delay_secs * 1000; + delay_msecs = top.delay_secs * 1000; tcsetattr(0, TCSANOW, &tc); /* trash return*/ getc(stdin); @@ -1005,27 +797,27 @@ static void perf_event__process_sample(const union perf_event *event, struct machine *machine; u8 origin = event->header.misc & PERF_RECORD_MISC_CPUMODE_MASK; - ++samples; + ++top.samples; switch (origin) { case PERF_RECORD_MISC_USER: - ++us_samples; - if (hide_user_symbols) + ++top.us_samples; + if (top.hide_user_symbols) return; machine = perf_session__find_host_machine(session); break; case PERF_RECORD_MISC_KERNEL: - ++kernel_samples; - if (hide_kernel_symbols) + ++top.kernel_samples; + if (top.hide_kernel_symbols) return; machine = perf_session__find_host_machine(session); break; case PERF_RECORD_MISC_GUEST_KERNEL: - ++guest_kernel_samples; + ++top.guest_kernel_samples; machine = perf_session__find_machine(session, event->ip.pid); break; case PERF_RECORD_MISC_GUEST_USER: - ++guest_us_samples; + ++top.guest_us_samples; /* * TODO: we don't process guest user from host side * except simple counting. @@ -1042,7 +834,7 @@ static void perf_event__process_sample(const union perf_event *event, } if (event->header.misc & PERF_RECORD_MISC_EXACT_IP) - exact_samples++; + top.exact_samples++; if (perf_event__preprocess_sample(event, session, &al, sample, symbol_filter) < 0 || @@ -1093,14 +885,14 @@ static void perf_event__process_sample(const union perf_event *event, struct perf_evsel *evsel; syme->origin = origin; - evsel = perf_evlist__id2evsel(evsel_list, sample->id); + evsel = perf_evlist__id2evsel(top.evlist, sample->id); assert(evsel != NULL); syme->count[evsel->idx]++; record_precise_ip(syme, evsel->idx, ip); - pthread_mutex_lock(&active_symbols_lock); + pthread_mutex_lock(&top.active_symbols_lock); if (list_empty(&syme->node) || !syme->node.next) __list_insert_active_sym(syme); - pthread_mutex_unlock(&active_symbols_lock); + pthread_mutex_unlock(&top.active_symbols_lock); } } @@ -1109,7 +901,7 @@ static void perf_session__mmap_read_cpu(struct perf_session *self, int cpu) struct perf_sample sample; union perf_event *event; - while ((event = perf_evlist__read_on_cpu(evsel_list, cpu)) != NULL) { + while ((event = perf_evlist__read_on_cpu(top.evlist, cpu)) != NULL) { perf_session__parse_sample(self, event, &sample); if (event->header.type == PERF_RECORD_SAMPLE) @@ -1123,7 +915,7 @@ static void perf_session__mmap_read(struct perf_session *self) { int i; - for (i = 0; i < evsel_list->cpus->nr; i++) + for (i = 0; i < top.evlist->cpus->nr; i++) perf_session__mmap_read_cpu(self, i); } @@ -1136,10 +928,10 @@ static void start_counters(struct perf_evlist *evlist) attr->sample_type = PERF_SAMPLE_IP | PERF_SAMPLE_TID; - if (freq) { + if (top.freq) { attr->sample_type |= PERF_SAMPLE_PERIOD; attr->freq = 1; - attr->sample_freq = freq; + attr->sample_freq = top.freq; } if (evlist->nr_entries > 1) { @@ -1149,8 +941,8 @@ static void start_counters(struct perf_evlist *evlist) attr->mmap = 1; try_again: - if (perf_evsel__open(counter, evsel_list->cpus, - evsel_list->threads, group, inherit) < 
0) { + if (perf_evsel__open(counter, top.evlist->cpus, + top.evlist->threads, group, inherit) < 0) { int err = errno; if (err == EPERM || err == EACCES) @@ -1198,18 +990,18 @@ static int __cmd_top(void) if (session == NULL) return -ENOMEM; - if (target_tid != -1) - perf_event__synthesize_thread(target_tid, perf_event__process, + if (top.target_tid != -1) + perf_event__synthesize_thread(top.target_tid, perf_event__process, session); else perf_event__synthesize_threads(perf_event__process, session); - start_counters(evsel_list); - first = list_entry(evsel_list->entries.next, struct perf_evsel, node); + start_counters(top.evlist); + first = list_entry(top.evlist->entries.next, struct perf_evsel, node); perf_session__set_sample_type(session, first->attr.sample_type); /* Wait for a minimal set of events before starting the snapshot */ - poll(evsel_list->pollfd, evsel_list->nr_fds, 100); + poll(top.evlist->pollfd, top.evlist->nr_fds, 100); perf_session__mmap_read(session); @@ -1229,12 +1021,12 @@ static int __cmd_top(void) } while (1) { - int hits = samples; + u64 hits = top.samples; perf_session__mmap_read(session); - if (hits == samples) - ret = poll(evsel_list->pollfd, evsel_list->nr_fds, 100); + if (hits == top.samples) + ret = poll(top.evlist->pollfd, top.evlist->nr_fds, 100); } return 0; @@ -1246,31 +1038,31 @@ static const char * const top_usage[] = { }; static const struct option options[] = { - OPT_CALLBACK('e', "event", &evsel_list, "event", + OPT_CALLBACK('e', "event", &top.evlist, "event", "event selector. use 'perf list' to list available events", parse_events), OPT_INTEGER('c', "count", &default_interval, "event period to sample"), - OPT_INTEGER('p', "pid", &target_pid, + OPT_INTEGER('p', "pid", &top.target_pid, "profile events on existing process id"), - OPT_INTEGER('t', "tid", &target_tid, + OPT_INTEGER('t', "tid", &top.target_tid, "profile events on existing thread id"), OPT_BOOLEAN('a', "all-cpus", &system_wide, "system-wide collection from all CPUs"), - OPT_STRING('C', "cpu", &cpu_list, "cpu", + OPT_STRING('C', "cpu", &top.cpu_list, "cpu", "list of cpus to monitor"), OPT_STRING('k', "vmlinux", &symbol_conf.vmlinux_name, "file", "vmlinux pathname"), - OPT_BOOLEAN('K', "hide_kernel_symbols", &hide_kernel_symbols, + OPT_BOOLEAN('K', "hide_kernel_symbols", &top.hide_kernel_symbols, "hide kernel symbols"), OPT_UINTEGER('m', "mmap-pages", &mmap_pages, "number of mmap data pages"), OPT_INTEGER('r', "realtime", &realtime_prio, "collect data with this RT SCHED_FIFO priority"), - OPT_INTEGER('d', "delay", &delay_secs, + OPT_INTEGER('d', "delay", &top.delay_secs, "number of seconds to delay between refreshes"), OPT_BOOLEAN('D', "dump-symtab", &dump_symtab, "dump the symbol table used for profiling"), - OPT_INTEGER('f', "count-filter", &count_filter, + OPT_INTEGER('f', "count-filter", &top.count_filter, "only display functions with more events than this"), OPT_BOOLEAN('g', "group", &group, "put the counters into a counter group"), @@ -1278,13 +1070,13 @@ static const struct option options[] = { "child tasks inherit counters"), OPT_STRING('s', "sym-annotate", &sym_filter, "symbol name", "symbol to annotate"), - OPT_BOOLEAN('z', "zero", &zero, + OPT_BOOLEAN('z', "zero", &top.zero, "zero history across updates"), - OPT_INTEGER('F', "freq", &freq, + OPT_INTEGER('F', "freq", &top.freq, "profile at this frequency"), - OPT_INTEGER('E', "entries", &print_entries, + OPT_INTEGER('E', "entries", &top.print_entries, "display this many functions"), - OPT_BOOLEAN('U', "hide_user_symbols", 
&hide_user_symbols, + OPT_BOOLEAN('U', "hide_user_symbols", &top.hide_user_symbols, "hide user symbols"), OPT_INCR('v', "verbose", &verbose, "be more verbose (show counter open errors, etc)"), @@ -1296,8 +1088,8 @@ int cmd_top(int argc, const char **argv, const char *prefix __used) struct perf_evsel *pos; int status = -ENOMEM; - evsel_list = perf_evlist__new(NULL, NULL); - if (evsel_list == NULL) + top.evlist = perf_evlist__new(NULL, NULL); + if (top.evlist == NULL) return -ENOMEM; page_size = sysconf(_SC_PAGE_SIZE); @@ -1307,43 +1099,43 @@ int cmd_top(int argc, const char **argv, const char *prefix __used) usage_with_options(top_usage, options); /* CPU and PID are mutually exclusive */ - if (target_tid > 0 && cpu_list) { + if (top.target_tid > 0 && top.cpu_list) { printf("WARNING: PID switch overriding CPU\n"); sleep(1); - cpu_list = NULL; + top.cpu_list = NULL; } - if (target_pid != -1) - target_tid = target_pid; + if (top.target_pid != -1) + top.target_tid = top.target_pid; - if (perf_evlist__create_maps(evsel_list, target_pid, - target_tid, cpu_list) < 0) + if (perf_evlist__create_maps(top.evlist, top.target_pid, + top.target_tid, top.cpu_list) < 0) usage_with_options(top_usage, options); - if (!evsel_list->nr_entries && - perf_evlist__add_default(evsel_list) < 0) { + if (!top.evlist->nr_entries && + perf_evlist__add_default(top.evlist) < 0) { pr_err("Not enough memory for event selector list\n"); return -ENOMEM; } - if (delay_secs < 1) - delay_secs = 1; + if (top.delay_secs < 1) + top.delay_secs = 1; /* * User specified count overrides default frequency. */ if (default_interval) - freq = 0; - else if (freq) { - default_interval = freq; + top.freq = 0; + else if (top.freq) { + default_interval = top.freq; } else { fprintf(stderr, "frequency and count are zero, aborting\n"); exit(EXIT_FAILURE); } - list_for_each_entry(pos, &evsel_list->entries, node) { - if (perf_evsel__alloc_fd(pos, evsel_list->cpus->nr, - evsel_list->threads->nr) < 0) + list_for_each_entry(pos, &top.evlist->entries, node) { + if (perf_evsel__alloc_fd(pos, top.evlist->cpus->nr, + top.evlist->threads->nr) < 0) goto out_free_fd; /* * Fill in the ones not specifically initialized via -c: @@ -1354,28 +1146,28 @@ int cmd_top(int argc, const char **argv, const char *prefix __used) pos->attr.sample_period = default_interval; } - if (perf_evlist__alloc_pollfd(evsel_list) < 0 || - perf_evlist__alloc_mmap(evsel_list) < 0) + if (perf_evlist__alloc_pollfd(top.evlist) < 0 || + perf_evlist__alloc_mmap(top.evlist) < 0) goto out_free_fd; - sym_evsel = list_entry(evsel_list->entries.next, struct perf_evsel, node); + top.sym_evsel = list_entry(top.evlist->entries.next, struct perf_evsel, node); symbol_conf.priv_size = (sizeof(struct sym_entry) + - (evsel_list->nr_entries + 1) * sizeof(unsigned long)); + (top.evlist->nr_entries + 1) * sizeof(unsigned long)); symbol_conf.try_vmlinux_path = (symbol_conf.vmlinux_name == NULL); if (symbol__init() < 0) return -1; get_term_dimensions(&winsize); - if (print_entries == 0) { + if (top.print_entries == 0) { update_print_entries(&winsize); signal(SIGWINCH, sig_winch_handler); } status = __cmd_top(); out_free_fd: - perf_evlist__delete(evsel_list); + perf_evlist__delete(top.evlist); return status; } diff --git a/tools/perf/util/top.c b/tools/perf/util/top.c new file mode 100644 index 000000000000..c06cc5386e7e --- /dev/null +++ b/tools/perf/util/top.c @@ -0,0 +1,212 @@ +/* + * Copyright (C) 2011, Red Hat Inc, Arnaldo Carvalho de Melo + * + * Refactored from builtin-top.c, see that files for further 
copyright notes. + * + * Released under the GPL v2. (and only v2, not any later version) + */ + +#include "cpumap.h" +#include "event.h" +#include "evlist.h" +#include "evsel.h" +#include "parse-events.h" +#include "symbol.h" +#include "top.h" +#include + +/* + * Ordering weight: count-1 * count-2 * ... / count-n + */ +static double sym_weight(const struct sym_entry *sym, struct perf_top *top) +{ + double weight = sym->snap_count; + int counter; + + if (!top->display_weighted) + return weight; + + for (counter = 1; counter < top->evlist->nr_entries - 1; counter++) + weight *= sym->count[counter]; + + weight /= (sym->count[counter] + 1); + + return weight; +} + +static void perf_top__remove_active_sym(struct perf_top *top, struct sym_entry *syme) +{ + pthread_mutex_lock(&top->active_symbols_lock); + list_del_init(&syme->node); + pthread_mutex_unlock(&top->active_symbols_lock); +} + +static void rb_insert_active_sym(struct rb_root *tree, struct sym_entry *se) +{ + struct rb_node **p = &tree->rb_node; + struct rb_node *parent = NULL; + struct sym_entry *iter; + + while (*p != NULL) { + parent = *p; + iter = rb_entry(parent, struct sym_entry, rb_node); + + if (se->weight > iter->weight) + p = &(*p)->rb_left; + else + p = &(*p)->rb_right; + } + + rb_link_node(&se->rb_node, parent, p); + rb_insert_color(&se->rb_node, tree); +} + +size_t perf_top__header_snprintf(struct perf_top *top, char *bf, size_t size) +{ + struct perf_evsel *counter; + float samples_per_sec = top->samples / top->delay_secs; + float ksamples_per_sec = top->kernel_samples / top->delay_secs; + float esamples_percent = (100.0 * top->exact_samples) / top->samples; + size_t ret = 0; + + if (!perf_guest) { + ret = snprintf(bf, size, + " PerfTop:%8.0f irqs/sec kernel:%4.1f%%" + " exact: %4.1f%% [", samples_per_sec, + 100.0 - (100.0 * ((samples_per_sec - ksamples_per_sec) / + samples_per_sec)), + esamples_percent); + } else { + float us_samples_per_sec = top->us_samples / top->delay_secs; + float guest_kernel_samples_per_sec = top->guest_kernel_samples / top->delay_secs; + float guest_us_samples_per_sec = top->guest_us_samples / top->delay_secs; + + ret = snprintf(bf, size, + " PerfTop:%8.0f irqs/sec kernel:%4.1f%% us:%4.1f%%" + " guest kernel:%4.1f%% guest us:%4.1f%%" + " exact: %4.1f%% [", samples_per_sec, + 100.0 - (100.0 * ((samples_per_sec - ksamples_per_sec) / + samples_per_sec)), + 100.0 - (100.0 * ((samples_per_sec - us_samples_per_sec) / + samples_per_sec)), + 100.0 - (100.0 * ((samples_per_sec - + guest_kernel_samples_per_sec) / + samples_per_sec)), + 100.0 - (100.0 * ((samples_per_sec - + guest_us_samples_per_sec) / + samples_per_sec)), + esamples_percent); + } + + if (top->evlist->nr_entries == 1 || !top->display_weighted) { + struct perf_evsel *first; + first = list_entry(top->evlist->entries.next, struct perf_evsel, node); + ret += snprintf(bf + ret, size - ret, "%" PRIu64 "%s ", + (uint64_t)first->attr.sample_period, + top->freq ? "Hz" : ""); + } + + if (!top->display_weighted) { + ret += snprintf(bf + ret, size - ret, "%s", + event_name(top->sym_evsel)); + } else list_for_each_entry(counter, &top->evlist->entries, node) { + ret += snprintf(bf + ret, size - ret, "%s%s", + counter->idx ? 
"/" : "", event_name(counter)); + } + + ret += snprintf(bf + ret, size - ret, "], "); + + if (top->target_pid != -1) + ret += snprintf(bf + ret, size - ret, " (target_pid: %d", + top->target_pid); + else if (top->target_tid != -1) + ret += snprintf(bf + ret, size - ret, " (target_tid: %d", + top->target_tid); + else + ret += snprintf(bf + ret, size - ret, " (all"); + + if (top->cpu_list) + ret += snprintf(bf + ret, size - ret, ", CPU%s: %s)", + top->evlist->cpus->nr > 1 ? "s" : "", top->cpu_list); + else { + if (top->target_tid != -1) + ret += snprintf(bf + ret, size - ret, ")"); + else + ret += snprintf(bf + ret, size - ret, ", %d CPU%s)", + top->evlist->cpus->nr, + top->evlist->cpus->nr > 1 ? "s" : ""); + } + + return ret; +} + +void perf_top__reset_sample_counters(struct perf_top *top) +{ + top->samples = top->us_samples = top->kernel_samples = + top->exact_samples = top->guest_kernel_samples = + top->guest_us_samples = 0; +} + +float perf_top__decay_samples(struct perf_top *top, struct rb_root *root) +{ + struct sym_entry *syme, *n; + float sum_ksamples = 0.0; + int snap = !top->display_weighted ? top->sym_counter : 0, j; + + /* Sort the active symbols */ + pthread_mutex_lock(&top->active_symbols_lock); + syme = list_entry(top->active_symbols.next, struct sym_entry, node); + pthread_mutex_unlock(&top->active_symbols_lock); + + list_for_each_entry_safe_from(syme, n, &top->active_symbols, node) { + syme->snap_count = syme->count[snap]; + if (syme->snap_count != 0) { + + if ((top->hide_user_symbols && + syme->origin == PERF_RECORD_MISC_USER) || + (top->hide_kernel_symbols && + syme->origin == PERF_RECORD_MISC_KERNEL)) { + perf_top__remove_active_sym(top, syme); + continue; + } + syme->weight = sym_weight(syme, top); + rb_insert_active_sym(root, syme); + sum_ksamples += syme->snap_count; + + for (j = 0; j < top->evlist->nr_entries; j++) + syme->count[j] = top->zero ? 
0 : syme->count[j] * 7 / 8; + } else + perf_top__remove_active_sym(top, syme); + } + + return sum_ksamples; +} + +/* + * Find the longest symbol name that will be displayed + */ +void perf_top__find_widths(struct perf_top *top, struct rb_root *root, + int *dso_width, int *dso_short_width, int *sym_width) +{ + struct rb_node *nd; + int printed = 0; + + *sym_width = *dso_width = *dso_short_width = 0; + + for (nd = rb_first(root); nd; nd = rb_next(nd)) { + struct sym_entry *syme = rb_entry(nd, struct sym_entry, rb_node); + + if (++printed > top->print_entries || + (int)syme->snap_count < top->count_filter) + continue; + + if (syme->map->dso->long_name_len > *dso_width) + *dso_width = syme->map->dso->long_name_len; + + if (syme->map->dso->short_name_len > *dso_short_width) + *dso_short_width = syme->map->dso->short_name_len; + + if (syme->name_len > *sym_width) + *sym_width = syme->name_len; + } +} diff --git a/tools/perf/util/top.h b/tools/perf/util/top.h new file mode 100644 index 000000000000..0467b26dd8d8 --- /dev/null +++ b/tools/perf/util/top.h @@ -0,0 +1,67 @@ +#ifndef __PERF_TOP_H +#define __PERF_TOP_H 1 + +#include "types.h" +#include "../perf.h" +#include +#include +#include +#include + +struct perf_evlist; +struct perf_evsel; + +struct source_line { + u64 eip; + unsigned long count[MAX_COUNTERS]; /* FIXME */ + char *line; + struct source_line *next; +}; + +struct sym_entry_source { + struct source_line *source; + struct source_line *lines; + struct source_line **lines_tail; + pthread_mutex_t lock; +}; + +struct sym_entry { + struct rb_node rb_node; + struct list_head node; + unsigned long snap_count; + double weight; + int skip; + u16 name_len; + u8 origin; + struct map *map; + struct sym_entry_source *src; + unsigned long count[0]; +}; + +struct perf_top { + struct perf_evlist *evlist; + /* + * Symbols will be added here in perf_event__process_sample and will + * get out after decayed. + */ + struct list_head active_symbols; + pthread_mutex_t active_symbols_lock; + u64 samples; + u64 kernel_samples, us_samples; + u64 exact_samples; + u64 guest_us_samples, guest_kernel_samples; + int print_entries, count_filter, delay_secs; + int display_weighted, freq; + int sym_counter, target_pid, target_tid; + bool hide_kernel_symbols, hide_user_symbols, zero; + const char *cpu_list; + struct perf_evsel *sym_evsel; +}; + +size_t perf_top__header_snprintf(struct perf_top *top, char *bf, size_t size); +void perf_top__reset_sample_counters(struct perf_top *top); +float perf_top__decay_samples(struct perf_top *top, struct rb_root *root); +void perf_top__find_widths(struct perf_top *top, struct rb_root *root, + int *dso_width, int *dso_short_width, int *sym_width); + +#endif /* __PERF_TOP_H */ From eff9073790e1286aa12bf1c65814d3e0132b12e1 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 31 Jan 2011 16:59:05 +0100 Subject: [PATCH 122/706] x86: Rename incorrectly named parameter of numa_cpu_node() numa_cpu_node() prototype in numa_32.h has wrongly named parameter @apicid when it actually takes the CPU number. Change it to @cpu. 
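In other words, after the rename the contract reads as: the argument is a CPU number, and the !CONFIG_NUMA stub returns NUMA_NO_NODE. A hedged sketch of a caller (illustrative only, not part of this patch; the fallback policy is an assumption):

    /* hypothetical caller, shown only to pin down the contract */
    static int cpu_to_nid(int cpu)
    {
        int nid = numa_cpu_node(cpu);  /* takes a CPU number, not an APIC id */

        return nid == NUMA_NO_NODE ? 0 : nid;  /* assume node 0 as fallback */
    }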
Reported-by: Yinghai Lu Signed-off-by: Tejun Heo LKML-Reference: <20110131155905.GM7459@htj.dyndns.org> Signed-off-by: Ingo Molnar --- arch/x86/include/asm/numa_32.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/include/asm/numa_32.h b/arch/x86/include/asm/numa_32.h index bc66031afa1f..c6beed1ef103 100644 --- a/arch/x86/include/asm/numa_32.h +++ b/arch/x86/include/asm/numa_32.h @@ -6,7 +6,7 @@ extern int numa_off; extern int pxm_to_nid(int pxm); #ifdef CONFIG_NUMA -extern int __cpuinit numa_cpu_node(int apicid); +extern int __cpuinit numa_cpu_node(int cpu); #else /* CONFIG_NUMA */ static inline int numa_cpu_node(int cpu) { return NUMA_NO_NODE; } #endif /* CONFIG_NUMA */ From e2830b5c1b2b2217894370a3b95af87d4a958401 Mon Sep 17 00:00:00 2001 From: Torben Hohn Date: Thu, 27 Jan 2011 16:00:32 +0100 Subject: [PATCH 123/706] time: Make do_timer() and xtime_lock local to kernel/time/ All callers of do_timer() are converted to xtime_update(). The only users of xtime_lock are in kernel/time/. Make both local to kernel/time/ and remove them from the global header files. [ tglx: Reuse tick-internal.h instead of creating another local header file. Massaged changelog ] Signed-off-by: Torben Hohn Cc: Peter Zijlstra Cc: johnstul@us.ibm.com Cc: yong.zhang0@gmail.com Cc: hch@infradead.org Signed-off-by: Thomas Gleixner --- include/linux/sched.h | 1 - include/linux/time.h | 2 -- kernel/time/clockevents.c | 1 - kernel/time/jiffies.c | 2 ++ kernel/time/ntp.c | 2 ++ kernel/time/tick-broadcast.c | 1 - kernel/time/tick-common.c | 1 - kernel/time/tick-internal.h | 5 +++++ kernel/time/tick-oneshot.c | 1 - kernel/time/tick-sched.c | 1 - 10 files changed, 9 insertions(+), 8 deletions(-) diff --git a/include/linux/sched.h b/include/linux/sched.h index 9d9a0787eed3..cdef640aa446 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -2049,7 +2049,6 @@ extern void release_uids(struct user_namespace *ns); #include -extern void do_timer(unsigned long ticks); extern void xtime_update(unsigned long ticks); extern int wake_up_state(struct task_struct *tsk, unsigned int state); diff --git a/include/linux/time.h b/include/linux/time.h index ce29c86882b1..38c5206c2673 100644 --- a/include/linux/time.h +++ b/include/linux/time.h @@ -113,8 +113,6 @@ static inline struct timespec timespec_sub(struct timespec lhs, #define timespec_valid(ts) \ (((ts)->tv_sec >= 0) && (((unsigned long) (ts)->tv_nsec) < NSEC_PER_SEC)) -extern seqlock_t xtime_lock; - extern void read_persistent_clock(struct timespec *ts); extern void read_boot_clock(struct timespec *ts); extern int update_persistent_clock(struct timespec now); diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c index d7395fdfb9f3..0d74b9ba90c8 100644 --- a/kernel/time/clockevents.c +++ b/kernel/time/clockevents.c @@ -18,7 +18,6 @@ #include #include #include -#include #include "tick-internal.h" diff --git a/kernel/time/jiffies.c b/kernel/time/jiffies.c index 2fbc20744797..b2fa506667c0 100644 --- a/kernel/time/jiffies.c +++ b/kernel/time/jiffies.c @@ -25,6 +25,8 @@ #include #include +#include "tick-internal.h" + /* The Jiffies based clocksource is the lowest common * denominator clock source which should function on * all systems. 
It has the same coarse resolution as diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c index 5c00242fa921..ed8cfdf16983 100644 --- a/kernel/time/ntp.c +++ b/kernel/time/ntp.c @@ -16,6 +16,8 @@ #include #include +#include "tick-internal.h" + /* * NTP timekeeping variables: */ diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c index 48b2761b5668..92ef9a54f0a4 100644 --- a/kernel/time/tick-broadcast.c +++ b/kernel/time/tick-broadcast.c @@ -18,7 +18,6 @@ #include #include #include -#include #include "tick-internal.h" diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c index 051bc80a0c43..0e98fac3d479 100644 --- a/kernel/time/tick-common.c +++ b/kernel/time/tick-common.c @@ -18,7 +18,6 @@ #include #include #include -#include #include diff --git a/kernel/time/tick-internal.h b/kernel/time/tick-internal.h index 290eefbc1f60..28c578568c9d 100644 --- a/kernel/time/tick-internal.h +++ b/kernel/time/tick-internal.h @@ -1,6 +1,8 @@ /* * tick internal variable and functions used by low/high res code */ +#include +#include #define TICK_DO_TIMER_NONE -1 #define TICK_DO_TIMER_BOOT -2 @@ -132,3 +134,6 @@ static inline int tick_device_is_functional(struct clock_event_device *dev) { return !(dev->features & CLOCK_EVT_FEAT_DUMMY); } + +extern void do_timer(unsigned long ticks); +extern seqlock_t xtime_lock; diff --git a/kernel/time/tick-oneshot.c b/kernel/time/tick-oneshot.c index 5cbc101f908b..2d04411a5f05 100644 --- a/kernel/time/tick-oneshot.c +++ b/kernel/time/tick-oneshot.c @@ -18,7 +18,6 @@ #include #include #include -#include #include "tick-internal.h" diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index c55ea2433471..d5097c44b407 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -19,7 +19,6 @@ #include #include #include -#include #include #include From 229ade9ba36341f7369ecb4f134bcec9133520bf Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Mon, 31 Jan 2011 18:08:39 -0200 Subject: [PATCH 124/706] perf tools: Don't fallback to setup_pager unconditionally Because in tools like 'top' we don't want the pager. 
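The effect at the call sites is that the pager fallback becomes an explicit choice. A minimal sketch of the two kinds of callers, matching the hunks below and the perf top TUI patch that follows:

    setup_browser(true);   /* report/annotate: fall back to the pager when there is no TUI */
    setup_browser(false);  /* top: live refresh loop, a pager would get in the way */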
Cc: Frederic Weisbecker Cc: Ingo Molnar Cc: Mike Galbraith Cc: Paul Mackerras Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Tom Zanussi LKML-Reference: Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-annotate.c | 2 +- tools/perf/builtin-report.c | 2 +- tools/perf/util/cache.h | 7 ++++--- tools/perf/util/ui/setup.c | 5 +++-- 4 files changed, 9 insertions(+), 7 deletions(-) diff --git a/tools/perf/builtin-annotate.c b/tools/perf/builtin-annotate.c index 70067862e07f..cd9dec46c19f 100644 --- a/tools/perf/builtin-annotate.c +++ b/tools/perf/builtin-annotate.c @@ -452,7 +452,7 @@ int cmd_annotate(int argc, const char **argv, const char *prefix __used) else if (use_tui) use_browser = 1; - setup_browser(); + setup_browser(true); symbol_conf.priv_size = sizeof(struct sym_priv); symbol_conf.try_vmlinux_path = true; diff --git a/tools/perf/builtin-report.c b/tools/perf/builtin-report.c index a6a4e5457b6f..080937c3a656 100644 --- a/tools/perf/builtin-report.c +++ b/tools/perf/builtin-report.c @@ -499,7 +499,7 @@ int cmd_report(int argc, const char **argv, const char *prefix __used) use_browser = 1; if (strcmp(input_name, "-") != 0) - setup_browser(); + setup_browser(true); else use_browser = 0; /* diff --git a/tools/perf/util/cache.h b/tools/perf/util/cache.h index a7729797fd96..fc5e5a09d5b9 100644 --- a/tools/perf/util/cache.h +++ b/tools/perf/util/cache.h @@ -34,13 +34,14 @@ extern int pager_use_color; extern int use_browser; #ifdef NO_NEWT_SUPPORT -static inline void setup_browser(void) +static inline void setup_browser(bool fallback_to_pager) { - setup_pager(); + if (fallback_to_pager) + setup_pager(); } static inline void exit_browser(bool wait_for_ok __used) {} #else -void setup_browser(void); +void setup_browser(bool fallback_to_pager); void exit_browser(bool wait_for_ok); #endif diff --git a/tools/perf/util/ui/setup.c b/tools/perf/util/ui/setup.c index 662085032eb7..fbf1a145492f 100644 --- a/tools/perf/util/ui/setup.c +++ b/tools/perf/util/ui/setup.c @@ -14,11 +14,12 @@ static void newt_suspend(void *d __used) newtResume(); } -void setup_browser(void) +void setup_browser(bool fallback_to_pager) { if (!isatty(1) || !use_browser || dump_trace) { use_browser = 0; - setup_pager(); + if (fallback_to_pager) + setup_pager(); return; } From c0443df1b69b59675fc6790e0ddce87c8ca00abf Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Mon, 31 Jan 2011 18:19:33 -0200 Subject: [PATCH 125/706] perf top: Introduce slang based TUI Disabled by default as there are features found in the stdio based one that aren't implemented, like live annotation, filtering knobs data entry. Annotation hopefully will get somehow merged with the 'perf annotate' code. 
To use it: perf top --tui Cc: Frederic Weisbecker Cc: Ingo Molnar Cc: Mike Galbraith Cc: Paul Mackerras Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Tom Zanussi LKML-Reference: Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/Makefile | 4 + tools/perf/builtin-top.c | 35 ++++++-- tools/perf/util/top.c | 7 +- tools/perf/util/top.h | 15 +++- tools/perf/util/ui/browsers/top.c | 136 ++++++++++++++++++++++++++++++ 5 files changed, 189 insertions(+), 8 deletions(-) create mode 100644 tools/perf/util/ui/browsers/top.c diff --git a/tools/perf/Makefile b/tools/perf/Makefile index edc660e0bc2f..67a9f4d805de 100644 --- a/tools/perf/Makefile +++ b/tools/perf/Makefile @@ -621,6 +621,7 @@ else LIB_OBJS += $(OUTPUT)util/ui/browsers/annotate.o LIB_OBJS += $(OUTPUT)util/ui/browsers/hists.o LIB_OBJS += $(OUTPUT)util/ui/browsers/map.o + LIB_OBJS += $(OUTPUT)util/ui/browsers/top.o LIB_OBJS += $(OUTPUT)util/ui/helpline.o LIB_OBJS += $(OUTPUT)util/ui/progress.o LIB_OBJS += $(OUTPUT)util/ui/util.o @@ -1050,6 +1051,9 @@ $(OUTPUT)util/ui/browser.o: util/ui/browser.c $(OUTPUT)PERF-CFLAGS $(OUTPUT)util/ui/browsers/annotate.o: util/ui/browsers/annotate.c $(OUTPUT)PERF-CFLAGS $(QUIET_CC)$(CC) -o $@ -c $(ALL_CFLAGS) -DENABLE_SLFUTURE_CONST $< +$(OUTPUT)util/ui/browsers/top.o: util/ui/browsers/top.c $(OUTPUT)PERF-CFLAGS + $(QUIET_CC)$(CC) -o $@ -c $(ALL_CFLAGS) -DENABLE_SLFUTURE_CONST $< + $(OUTPUT)util/ui/browsers/hists.o: util/ui/browsers/hists.c $(OUTPUT)PERF-CFLAGS $(QUIET_CC)$(CC) -o $@ -c $(ALL_CFLAGS) -DENABLE_SLFUTURE_CONST $< diff --git a/tools/perf/builtin-top.c b/tools/perf/builtin-top.c index 3c9ba943aa48..104de9ab314c 100644 --- a/tools/perf/builtin-top.c +++ b/tools/perf/builtin-top.c @@ -20,6 +20,7 @@ #include "perf.h" +#include "util/cache.h" #include "util/color.h" #include "util/evlist.h" #include "util/evsel.h" @@ -75,6 +76,8 @@ static struct perf_top top = { static bool system_wide = false; +static bool use_tui, use_stdio; + static int default_interval = 0; @@ -96,11 +99,6 @@ static int sym_pcnt_filter = 5; * Source functions */ -static inline struct symbol *sym_entry__symbol(struct sym_entry *self) -{ - return ((void *)self) + symbol_conf.priv_size; -} - void get_term_dimensions(struct winsize *ws) { char *s = getenv("LINES"); @@ -695,6 +693,14 @@ static void handle_keypress(struct perf_session *session, int c) } } +static void *display_thread_tui(void *arg __used) +{ + perf_top__tui_browser(&top); + exit_browser(0); + exit(0); + return NULL; +} + static void *display_thread(void *arg __used) { struct pollfd stdin_poll = { .fd = 0, .events = POLLIN }; @@ -1005,7 +1011,8 @@ static int __cmd_top(void) perf_session__mmap_read(session); - if (pthread_create(&thread, NULL, display_thread, session)) { + if (pthread_create(&thread, NULL, (use_browser > 0 ? display_thread_tui : + display_thread), session)) { printf("Could not create display thread.\n"); exit(-1); } @@ -1078,6 +1085,8 @@ static const struct option options[] = { "display this many functions"), OPT_BOOLEAN('U', "hide_user_symbols", &top.hide_user_symbols, "hide user symbols"), + OPT_BOOLEAN(0, "tui", &use_tui, "Use the TUI interface"), + OPT_BOOLEAN(0, "stdio", &use_stdio, "Use the stdio interface"), OPT_INCR('v', "verbose", &verbose, "be more verbose (show counter open errors, etc)"), OPT_END() @@ -1098,6 +1107,20 @@ int cmd_top(int argc, const char **argv, const char *prefix __used) if (argc) usage_with_options(top_usage, options); + /* + * XXX For now start disabled, only using TUI if explicitly asked for.
+ * Change that when handle_keys equivalent gets written, live annotation + * done, etc. + */ + use_browser = 0; + + if (use_stdio) + use_browser = 0; + else if (use_tui) + use_browser = 1; + + setup_browser(false); + /* CPU and PID are mutually exclusive */ if (top.target_tid > 0 && top.cpu_list) { printf("WARNING: PID switch overriding CPU\n"); diff --git a/tools/perf/util/top.c b/tools/perf/util/top.c index c06cc5386e7e..1d2e2652cd68 100644 --- a/tools/perf/util/top.c +++ b/tools/perf/util/top.c @@ -158,6 +158,7 @@ float perf_top__decay_samples(struct perf_top *top, struct rb_root *root) syme = list_entry(top->active_symbols.next, struct sym_entry, node); pthread_mutex_unlock(&top->active_symbols_lock); + top->rb_entries = 0; list_for_each_entry_safe_from(syme, n, &top->active_symbols, node) { syme->snap_count = syme->count[snap]; if (syme->snap_count != 0) { @@ -170,7 +171,11 @@ float perf_top__decay_samples(struct perf_top *top, struct rb_root *root) continue; } syme->weight = sym_weight(syme, top); - rb_insert_active_sym(root, syme); + + if ((int)syme->snap_count >= top->count_filter) { + rb_insert_active_sym(root, syme); + ++top->rb_entries; + } sum_ksamples += syme->snap_count; for (j = 0; j < top->evlist->nr_entries; j++) diff --git a/tools/perf/util/top.h b/tools/perf/util/top.h index 0467b26dd8d8..611370fa7df8 100644 --- a/tools/perf/util/top.h +++ b/tools/perf/util/top.h @@ -38,6 +38,11 @@ struct sym_entry { unsigned long count[0]; }; +static inline struct symbol *sym_entry__symbol(struct sym_entry *self) +{ + return ((void *)self) + symbol_conf.priv_size; +} + struct perf_top { struct perf_evlist *evlist; /* @@ -51,7 +56,7 @@ struct perf_top { u64 exact_samples; u64 guest_us_samples, guest_kernel_samples; int print_entries, count_filter, delay_secs; - int display_weighted, freq; + int display_weighted, freq, rb_entries; int sym_counter, target_pid, target_tid; bool hide_kernel_symbols, hide_user_symbols, zero; const char *cpu_list; @@ -64,4 +69,12 @@ float perf_top__decay_samples(struct perf_top *top, struct rb_root *root); void perf_top__find_widths(struct perf_top *top, struct rb_root *root, int *dso_width, int *dso_short_width, int *sym_width); +#ifdef NO_NEWT_SUPPORT +static inline int perf_top__tui_browser(struct perf_top *top __used) +{ + return 0; +} +#else +int perf_top__tui_browser(struct perf_top *top); +#endif #endif /* __PERF_TOP_H */ diff --git a/tools/perf/util/ui/browsers/top.c b/tools/perf/util/ui/browsers/top.c new file mode 100644 index 000000000000..ca6062483a8f --- /dev/null +++ b/tools/perf/util/ui/browsers/top.c @@ -0,0 +1,136 @@ +/* + * Copyright (C) 2011, Red Hat Inc, Arnaldo Carvalho de Melo + * + * Parts came from builtin-{top,stat,record}.c, see those files for further + * copyright notes. + * + * Released under the GPL v2. 
(and only v2, not any later version) + */ +#include "../browser.h" +#include "../helpline.h" +#include "../libslang.h" +#include "../../evlist.h" +#include "../../hist.h" +#include "../../sort.h" +#include "../../symbol.h" +#include "../../top.h" + +struct perf_top_browser { + struct ui_browser b; + struct rb_root root; + float sum_ksamples; + int dso_width; + int dso_short_width; + int sym_width; +}; + +static void perf_top_browser__write(struct ui_browser *browser, void *entry, int row) +{ + struct perf_top_browser *top_browser = container_of(browser, struct perf_top_browser, b); + struct sym_entry *syme = rb_entry(entry, struct sym_entry, rb_node); + bool current_entry = ui_browser__is_current_entry(browser, row); + struct symbol *symbol = sym_entry__symbol(syme); + struct perf_top *top = browser->priv; + int width = browser->width; + double pcnt; + + pcnt = 100.0 - (100.0 * ((top_browser->sum_ksamples - syme->snap_count) / + top_browser->sum_ksamples)); + ui_browser__set_percent_color(browser, pcnt, current_entry); + + if (top->evlist->nr_entries == 1 || !top->display_weighted) { + slsmg_printf("%20.2f ", syme->weight); + width -= 24; + } else { + slsmg_printf("%9.1f %10ld ", syme->weight, syme->snap_count); + width -= 23; + } + + slsmg_printf("%4.1f%%", pcnt); + width -= 7; + + if (verbose) { + slsmg_printf(" %016" PRIx64, symbol->start); + width -= 17; + } + + slsmg_printf(" %-*.*s ", top_browser->sym_width, top_browser->sym_width, + symbol->name); + width -= top_browser->sym_width; + slsmg_write_nstring(width >= syme->map->dso->long_name_len ? + syme->map->dso->long_name : + syme->map->dso->short_name, width); +} + +static void perf_top_browser__update_rb_tree(struct perf_top_browser *browser) +{ + struct perf_top *top = browser->b.priv; + + browser->root = RB_ROOT; + browser->b.top = NULL; + browser->sum_ksamples = perf_top__decay_samples(top, &browser->root); + perf_top__find_widths(top, &browser->root, &browser->dso_width, + &browser->dso_short_width, + &browser->sym_width); + if (browser->sym_width + browser->dso_width > browser->b.width - 29) { + browser->dso_width = browser->dso_short_width; + if (browser->sym_width + browser->dso_width > browser->b.width - 29) + browser->sym_width = browser->b.width - browser->dso_width - 29; + } + browser->b.nr_entries = top->rb_entries; +} + +static int perf_top_browser__run(struct perf_top_browser *browser) +{ + int key; + char title[160]; + struct perf_top *top = browser->b.priv; + int delay_msecs = top->delay_secs * 1000; + + perf_top_browser__update_rb_tree(browser); + perf_top__header_snprintf(top, title, sizeof(title)); + perf_top__reset_sample_counters(top); + + if (ui_browser__show(&browser->b, title, "ESC: exit") < 0) + return -1; + + newtFormSetTimer(browser->b.form, delay_msecs); + + while (1) { + key = ui_browser__run(&browser->b); + + switch (key) { + case -1: + /* FIXME we need to check if it was es.reason == NEWT_EXIT_TIMER */ + perf_top_browser__update_rb_tree(browser); + perf_top__header_snprintf(top, title, sizeof(title)); + perf_top__reset_sample_counters(top); + ui_browser__set_color(&browser->b, NEWT_COLORSET_ROOT); + SLsmg_gotorc(0, 0); + slsmg_write_nstring(title, browser->b.width); + break; + case NEWT_KEY_TAB: + default: + goto out; + } + } +out: + ui_browser__hide(&browser->b); + return key; +} + +int perf_top__tui_browser(struct perf_top *top) +{ + struct perf_top_browser browser = { + .b = { + .entries = &browser.root, + .refresh = ui_browser__rb_tree_refresh, + .seek = ui_browser__rb_tree_seek, + .write = 
perf_top_browser__write, + .priv = top, + }, + }; + + ui_helpline__push("Press <- or ESC to exit"); + return perf_top_browser__run(&browser); +} From 823c7164a92a6347d46bb64aaae728b6d08f3bb8 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Mon, 31 Jan 2011 19:45:38 -0200 Subject: [PATCH 126/706] perf probe: Use %td for pointer arithmetic result MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit %td is for ptrdiff_t, avoiding this warning on 32-bit: cc1: warnings being treated as errors builtin-probe.c: In function ‘opt_set_filter’: builtin-probe.c:176:4: error: format ‘%ld’ expects type ‘long int’, but argument 3 has type ‘int’ Cc: Frederic Weisbecker Cc: Ingo Molnar Cc: Masami Hiramatsu Cc: Mike Galbraith Cc: Paul Mackerras Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Tom Zanussi LKML-Reference: Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-probe.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/perf/builtin-probe.c b/tools/perf/builtin-probe.c index fcde0031085f..2c0e64d0b4aa 100644 --- a/tools/perf/builtin-probe.c +++ b/tools/perf/builtin-probe.c @@ -173,7 +173,7 @@ static int opt_set_filter(const struct option *opt __used, strfilter__delete(params.filter); params.filter = strfilter__new(str, &err); if (!params.filter) { - pr_err("Filter parse error at %ld.\n", err - str + 1); + pr_err("Filter parse error at %td.\n", err - str + 1); pr_err("Source: \"%s\"\n", str); pr_err(" %*c\n", (int)(err - str + 1), '^'); return -EINVAL; From f6bbc1daac964da551130dbf01809d3fbd178b2d Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Mon, 31 Jan 2011 20:56:27 -0200 Subject: [PATCH 127/706] perf python: Fix build on 32-bit Where there are lots of errors related to python methods receiving 'char *' for things like file open mode, which break the build, also disable strict aliasing and fixup some other warnings. Now builds on both 32-bit and 64-bit fedora systems. 
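The key 32-bit fix is in the member type codes: Python's structmember machinery reads a T_ULONG member through a C 'unsigned long', which is 4 bytes on 32-bit, so mapping u64 perf sample fields through the old 'T_ULONG_LONG T_ULONG' alias (removed below) silently truncated them; T_ULONGLONG reads a full 'unsigned long long'. Two hypothetical PyMemberDef entries for the same u64 member illustrate the difference (a sketch; the name, offset and doc strings are invented):

    { "time", T_ULONG,     16, 0, "u64 read as 4 bytes on x86-32: truncated" },
    { "time", T_ULONGLONG, 16, 0, "u64 read as 8 bytes everywhere: correct" },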
Cc: Frederic Weisbecker Cc: Ingo Molnar Cc: Mike Galbraith Cc: Paul Mackerras Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Tom Zanussi LKML-Reference: Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/python.c | 52 ++++++++++++++++++++-------------------- tools/perf/util/setup.py | 3 ++- 2 files changed, 28 insertions(+), 27 deletions(-) diff --git a/tools/perf/util/python.c b/tools/perf/util/python.c index d2d52175362e..5317ef229041 100644 --- a/tools/perf/util/python.c +++ b/tools/perf/util/python.c @@ -15,6 +15,8 @@ struct throttle_event { u64 stream_id; }; +PyMODINIT_FUNC initperf(void); + #define member_def(type, member, ptype, help) \ { #member, ptype, \ offsetof(struct pyrf_event, event) + offsetof(struct type, member), \ @@ -31,17 +33,15 @@ struct pyrf_event { union perf_event event; }; -#define T_ULONG_LONG T_ULONG - #define sample_members \ - sample_member_def(sample_ip, ip, T_ULONG_LONG, "event type"), \ + sample_member_def(sample_ip, ip, T_ULONGLONG, "event type"), \ sample_member_def(sample_pid, pid, T_INT, "event pid"), \ sample_member_def(sample_tid, tid, T_INT, "event tid"), \ - sample_member_def(sample_time, time, T_ULONG_LONG, "event timestamp"), \ - sample_member_def(sample_addr, addr, T_ULONG_LONG, "event addr"), \ - sample_member_def(sample_id, id, T_ULONG_LONG, "event id"), \ - sample_member_def(sample_stream_id, stream_id, T_ULONG_LONG, "event stream id"), \ - sample_member_def(sample_period, period, T_ULONG_LONG, "event period"), \ + sample_member_def(sample_time, time, T_ULONGLONG, "event timestamp"), \ + sample_member_def(sample_addr, addr, T_ULONGLONG, "event addr"), \ + sample_member_def(sample_id, id, T_ULONGLONG, "event id"), \ + sample_member_def(sample_stream_id, stream_id, T_ULONGLONG, "event stream id"), \ + sample_member_def(sample_period, period, T_ULONGLONG, "event period"), \ sample_member_def(sample_cpu, cpu, T_UINT, "event cpu"), static char pyrf_mmap_event__doc[] = PyDoc_STR("perf mmap event object."); @@ -51,11 +51,11 @@ static PyMemberDef pyrf_mmap_event__members[] = { member_def(perf_event_header, type, T_UINT, "event type"), member_def(mmap_event, pid, T_UINT, "event pid"), member_def(mmap_event, tid, T_UINT, "event tid"), - member_def(mmap_event, start, T_ULONG_LONG, "start of the map"), - member_def(mmap_event, len, T_ULONG_LONG, "map length"), - member_def(mmap_event, pgoff, T_ULONG_LONG, "page offset"), + member_def(mmap_event, start, T_ULONGLONG, "start of the map"), + member_def(mmap_event, len, T_ULONGLONG, "map length"), + member_def(mmap_event, pgoff, T_ULONGLONG, "page offset"), member_def(mmap_event, filename, T_STRING_INPLACE, "backing store"), - { NULL, }, + { .name = NULL, }, }; static PyObject *pyrf_mmap_event__repr(struct pyrf_event *pevent) @@ -96,8 +96,8 @@ static PyMemberDef pyrf_task_event__members[] = { member_def(fork_event, ppid, T_UINT, "event ppid"), member_def(fork_event, tid, T_UINT, "event tid"), member_def(fork_event, ptid, T_UINT, "event ptid"), - member_def(fork_event, time, T_ULONG_LONG, "timestamp"), - { NULL, }, + member_def(fork_event, time, T_ULONGLONG, "timestamp"), + { .name = NULL, }, }; static PyObject *pyrf_task_event__repr(struct pyrf_event *pevent) @@ -130,7 +130,7 @@ static PyMemberDef pyrf_comm_event__members[] = { member_def(comm_event, pid, T_UINT, "event pid"), member_def(comm_event, tid, T_UINT, "event tid"), member_def(comm_event, comm, T_STRING_INPLACE, "process name"), - { NULL, }, + { .name = NULL, }, }; static PyObject *pyrf_comm_event__repr(struct pyrf_event *pevent) @@ -156,10 
+156,10 @@ static char pyrf_throttle_event__doc[] = PyDoc_STR("perf throttle event object." static PyMemberDef pyrf_throttle_event__members[] = { sample_members member_def(perf_event_header, type, T_UINT, "event type"), - member_def(throttle_event, time, T_ULONG_LONG, "timestamp"), - member_def(throttle_event, id, T_ULONG_LONG, "event id"), - member_def(throttle_event, stream_id, T_ULONG_LONG, "event stream id"), - { NULL, }, + member_def(throttle_event, time, T_ULONGLONG, "timestamp"), + member_def(throttle_event, id, T_ULONGLONG, "event id"), + member_def(throttle_event, stream_id, T_ULONGLONG, "event stream id"), + { .name = NULL, }, }; static PyObject *pyrf_throttle_event__repr(struct pyrf_event *pevent) @@ -522,7 +522,7 @@ static PyMethodDef pyrf_evsel__methods[] = { .ml_flags = METH_VARARGS | METH_KEYWORDS, .ml_doc = PyDoc_STR("open the event selector file descriptor table.") }, - { NULL, } + { .ml_name = NULL, } }; static char pyrf_evsel__doc[] = PyDoc_STR("perf event selector list object."); @@ -551,7 +551,7 @@ struct pyrf_evlist { }; static int pyrf_evlist__init(struct pyrf_evlist *pevlist, - PyObject *args, PyObject *kwargs) + PyObject *args, PyObject *kwargs __used) { PyObject *pcpus = NULL, *pthreads = NULL; struct cpu_map *cpus; @@ -613,7 +613,7 @@ static PyObject *pyrf_evlist__poll(struct pyrf_evlist *pevlist, } static PyObject *pyrf_evlist__get_pollfd(struct pyrf_evlist *pevlist, - PyObject *args, PyObject *kwargs) + PyObject *args __used, PyObject *kwargs __used) { struct perf_evlist *evlist = &pevlist->evlist; PyObject *list = PyList_New(0); @@ -645,7 +645,7 @@ free_list: static PyObject *pyrf_evlist__add(struct pyrf_evlist *pevlist, - PyObject *args, PyObject *kwargs) + PyObject *args, PyObject *kwargs __used) { struct perf_evlist *evlist = &pevlist->evlist; PyObject *pevsel; @@ -724,7 +724,7 @@ static PyMethodDef pyrf_evlist__methods[] = { .ml_flags = METH_VARARGS | METH_KEYWORDS, .ml_doc = PyDoc_STR("reads an event.") }, - { NULL, } + { .ml_name = NULL, } }; static Py_ssize_t pyrf_evlist__length(PyObject *obj) @@ -840,11 +840,11 @@ static struct { { "RECORD_FORK", PERF_RECORD_FORK }, { "RECORD_READ", PERF_RECORD_READ }, { "RECORD_SAMPLE", PERF_RECORD_SAMPLE }, - { NULL, }, + { .name = NULL, }, }; static PyMethodDef perf__methods[] = { - { NULL, NULL } + { .ml_name = NULL, } }; PyMODINIT_FUNC initperf(void) diff --git a/tools/perf/util/setup.py b/tools/perf/util/setup.py index 496d7f432fb8..1947b0430c94 100644 --- a/tools/perf/util/setup.py +++ b/tools/perf/util/setup.py @@ -6,7 +6,8 @@ perf = Extension('perf', sources = ['util/python.c', 'util/ctype.c', 'util/evlist.c', 'util/evsel.c', 'util/cpumap.c', 'util/thread_map.c', 'util/util.c', 'util/xyarray.c'], - include_dirs = ['util/include']) + include_dirs = ['util/include'], + extra_compile_args = ['-fno-strict-aliasing', '-Wno-write-strings']) setup(name='perf', version='0.1', From 7cf37e87dd2cfa17a64f28ea7f31eed4525f79e4 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 1 Feb 2011 09:34:58 +0100 Subject: [PATCH 128/706] time: Fix legacy arch fallout The xtime/dotimer cleanup broke architectures which do not implement clockevents. Time to send out another __do_IRQ threat. 
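For context, a clockevents-less architecture's tick handler is expected to go through the public xtime_update() wrapper rather than touching do_timer()/xtime_lock, which are now visible only via tick-internal.h. A hedged sketch of such arch code (assumed for illustration, not part of this patch):

    static irqreturn_t timer_interrupt(int irq, void *dev_id)
    {
        xtime_update(1);  /* replaces the old do_timer(1) under xtime_lock */
        return IRQ_HANDLED;
    }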
Signed-off-by: Thomas Gleixner Reported-by: Ingo Molnar Cc: Torben Hohn Cc: Peter Zijlstra Cc: johnstul@us.ibm.com Cc: yong.zhang0@gmail.com Cc: hch@infradead.org LKML-Reference: <20110127145905.23248.30458.stgit@localhost> Signed-off-by: Ingo Molnar --- kernel/time/tick-internal.h | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/kernel/time/tick-internal.h b/kernel/time/tick-internal.h index 28c578568c9d..f77b93df0006 100644 --- a/kernel/time/tick-internal.h +++ b/kernel/time/tick-internal.h @@ -4,6 +4,8 @@ #include #include +#ifdef CONFIG_GENERIC_CLOCKEVENTS_BUILD + #define TICK_DO_TIMER_NONE -1 #define TICK_DO_TIMER_BOOT -2 @@ -135,5 +137,7 @@ static inline int tick_device_is_functional(struct clock_event_device *dev) return !(dev->features & CLOCK_EVT_FEAT_DUMMY); } +#endif + extern void do_timer(unsigned long ticks); extern seqlock_t xtime_lock; From 067187fc9f1d09738fc833392e117f125cb6bbad Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Tue, 1 Feb 2011 14:57:02 -0200 Subject: [PATCH 129/706] perf tools: Remove verbose build messages for the python binding Also now it builds it in a well known location: [acme@felicio linux]$ rm -rf ../build/perf/ [acme@felicio linux]$ mkdir ../build/perf [acme@felicio linux]$ make -j2 O=~acme/git/build/perf -C tools/perf/ [acme@felicio linux]$ ls -la ../build/perf/python/ total 152 -rwxrwxr-x 1 acme acme 147957 Feb 1 14:56 perf.so drwxrwxr-x 3 acme acme 17 Feb 1 14:56 temp [acme@felicio linux]$ [root@felicio ~]# strip ~acme/git/build/perf/python/perf.so [root@felicio ~]# ls -la ~acme/git/build/perf/python/perf.so -rwxrwxr-x 1 acme acme 46264 Feb 1 14:58 /home/acme/git/build/perf/python/perf.so [root@felicio ~]# export PYTHONPATH=~acme/git/build/perf/python/ [root@felicio ~]# ~acme/git/linux/tools/perf/python/twatch.py cpu: 0, pid: 7751, tid: 7751 { type: exit, pid: 7751, ppid: 7751, tid: 7751, ptid: 7751, time: 54562393512356} cpu: 0, pid: 13700, tid: 13700 { type: fork, pid: 7756, ppid: 13700, tid: 7756, ptid: 13700, time: 54562393746739} cpu: 1, pid: 7756, tid: 7756 { type: fork, pid: 7757, ppid: 7756, tid: 7757, ptid: 7756, time: 54562394246152} cpu: 1, pid: 7757, tid: 7757 { type: comm, pid: 7757, tid: 7757, comm: awk } cpu: 1, pid: 7757, tid: 7757 { type: exit, pid: 7757, ppid: 7757, tid: 7757, ptid: 7757, time: 54562395456813} Reported-by: Ingo Molnar Cc: Frederic Weisbecker Cc: Ingo Molnar Cc: Mike Galbraith Cc: Paul Mackerras Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Tom Zanussi LKML-Reference: Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/Makefile | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/tools/perf/Makefile b/tools/perf/Makefile index 67a9f4d805de..d1984ee5df69 100644 --- a/tools/perf/Makefile +++ b/tools/perf/Makefile @@ -325,9 +325,9 @@ SCRIPT_SH += perf-archive.sh grep-libs = $(filter -l%,$(1)) strip-libs = $(filter-out -l%,$(1)) -pyrf: $(PYRF_OBJS) - python util/setup.py build --build-base='$(OUTPUT)' - +$(OUTPUT)python/perf.so: $(PYRF_OBJS) + @python util/setup.py --quiet build_ext --build-lib='$(OUTPUT)python' \ + --build-temp='$(OUTPUT)python/temp' # # No Perl scripts right now: # @@ -348,12 +348,14 @@ PROGRAMS += $(EXTRA_PROGRAMS) # PROGRAMS += $(OUTPUT)perf +LANG_BINDINGS = $(OUTPUT)python/perf.so + # List built-in command $C whose implementation cmd_$C() is not in # builtin-$C.o but is linked in as part of some other command. 
# # what 'all' will build and 'install' will install, in perfexecdir -ALL_PROGRAMS = $(PROGRAMS) $(SCRIPTS) pyrf +ALL_PROGRAMS = $(PROGRAMS) $(SCRIPTS) $(LANG_BINDINGS) # what 'all' will build but not install in perfexecdir OTHER_PROGRAMS = $(OUTPUT)perf$X @@ -1298,6 +1300,8 @@ clean: $(RM) $(htmldocs).tar.gz $(manpages).tar.gz $(MAKE) -C Documentation/ clean $(RM) $(OUTPUT)PERF-VERSION-FILE $(OUTPUT)PERF-CFLAGS $(OUTPUT)PERF-BUILD-OPTIONS + @python util/setup.py clean --build-lib='$(OUTPUT)python' \ + --build-temp='$(OUTPUT)python/temp' .PHONY: all install clean strip .PHONY: shell_compatibility_test please_set_SHELL_PATH_to_a_more_modern_shell From 568bb7b8e856b9efb98a3f63259c717adc1b96b8 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Tue, 1 Feb 2011 15:05:00 -0200 Subject: [PATCH 130/706] perf tools: Fix up 'make clean' target It wasn't using $(OUTPUT) to rm *.o and there were some funny looking automake files that never get created but were being deleted anyway. Cc: Frederic Weisbecker Cc: Ingo Molnar Cc: Mike Galbraith Cc: Paul Mackerras Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Tom Zanussi LKML-Reference: Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/Makefile | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/tools/perf/Makefile b/tools/perf/Makefile index d1984ee5df69..85f654927b92 100644 --- a/tools/perf/Makefile +++ b/tools/perf/Makefile @@ -1289,12 +1289,10 @@ distclean: clean # $(RM) configure clean: - $(RM) *.o */*.o */*/*.o */*/*/*.o $(LIB_FILE) + $(RM) $(OUTPUT){*.o,*/*.o,*/*/*.o,*/*/*/*.o,$(LIB_FILE),perf-archive} $(RM) $(ALL_PROGRAMS) $(BUILT_INS) perf$X $(RM) $(TEST_PROGRAMS) $(RM) *.spec *.pyc *.pyo */*.pyc */*.pyo $(OUTPUT)common-cmds.h TAGS tags cscope* $(RM) -r $(PERF_TARNAME) .doc-tmp-dir $(RM) $(PERF_TARNAME).tar.gz perf-core_$(PERF_VERSION)-*.tar.gz $(RM) $(htmldocs).tar.gz $(manpages).tar.gz From 0015e2e101f5fd3256ab8b5a374c0e8806098871 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Tue, 1 Feb 2011 16:18:10 -0200 Subject: [PATCH 131/706] perf stat: Fix up resource release order That was causing a SEGV on selected old distros. Problem introduced in 7e2ed09. Reported-by: Peter Zijlstra Cc: Frederic Weisbecker Cc: Mike Galbraith Cc: Paul Mackerras Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Tom Zanussi LKML-Reference: Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-stat.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c index e0f95755361b..806a9998fcd5 100644 --- a/tools/perf/builtin-stat.c +++ b/tools/perf/builtin-stat.c @@ -748,8 +748,8 @@ int cmd_stat(int argc, const char **argv, const char *prefix __used) out_free_fd: list_for_each_entry(pos, &evsel_list->entries, node) perf_evsel__free_stat_priv(pos); - perf_evlist__delete(evsel_list); -out: perf_evlist__delete_maps(evsel_list); +out: + perf_evlist__delete(evsel_list); return status; } From 978f626c4e5b9524d1898788d8e34d86dfa00795 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Tue, 1 Feb 2011 16:40:51 -0200 Subject: [PATCH 132/706] perf tools: Don't try to build python bindings if Python.h not available Just leverage the test done for python support in 'perf script', emitting a warning about losing those features if python-dev[el] is not installed.
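The test in question is a try-cc feature probe: the Makefile compiles a tiny embedding program against the Python headers and libraries, and only enables the binding when that succeeds. A representative probe source (illustrative; the exact SOURCE_PYTHON_EMBED snippet in the tree may differ slightly):

    #include <Python.h>

    /* If this compiles and links, both Python.h and libpython are usable. */
    int main(void)
    {
    	Py_Initialize();
    	return 0;
    }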
Reported-by: Peter Zijlstra Cc: Frederic Weisbecker Cc: Mike Galbraith Cc: Paul Mackerras Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Tom Zanussi LKML-Reference: Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/Makefile | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/tools/perf/Makefile b/tools/perf/Makefile index 85f654927b92..4c9499cb4398 100644 --- a/tools/perf/Makefile +++ b/tools/perf/Makefile @@ -348,14 +348,14 @@ PROGRAMS += $(EXTRA_PROGRAMS) # PROGRAMS += $(OUTPUT)perf -LANG_BINDINGS = $(OUTPUT)python/perf.so +LANG_BINDINGS = # List built-in command $C whose implementation cmd_$C() is not in # builtin-$C.o but is linked in as part of some other command. # # what 'all' will build and 'install' will install, in perfexecdir -ALL_PROGRAMS = $(PROGRAMS) $(SCRIPTS) $(LANG_BINDINGS) +ALL_PROGRAMS = $(PROGRAMS) $(SCRIPTS) # what 'all' will build but not install in perfexecdir OTHER_PROGRAMS = $(OUTPUT)perf$X @@ -664,12 +664,14 @@ else PYTHON_EMBED_CCOPTS = `python-config --cflags 2>/dev/null` FLAGS_PYTHON_EMBED=$(PYTHON_EMBED_CCOPTS) $(PYTHON_EMBED_LDOPTS) ifneq ($(call try-cc,$(SOURCE_PYTHON_EMBED),$(FLAGS_PYTHON_EMBED)),y) + msg := $(warning No Python.h found, install python-dev[el] to have python support in 'perf script' and to build the python bindings) BASIC_CFLAGS += -DNO_LIBPYTHON else ALL_LDFLAGS += $(PYTHON_EMBED_LDFLAGS) EXTLIBS += $(PYTHON_EMBED_LIBADD) LIB_OBJS += $(OUTPUT)util/scripting-engines/trace-event-python.o LIB_OBJS += $(OUTPUT)scripts/python/Perf-Trace-Util/Context.o + LANG_BINDINGS += $(OUTPUT)python/perf.so endif endif @@ -956,7 +958,7 @@ export TAR INSTALL DESTDIR SHELL_PATH SHELL = $(SHELL_PATH) -all:: shell_compatibility_test $(ALL_PROGRAMS) $(BUILT_INS) $(OTHER_PROGRAMS) $(OUTPUT)PERF-BUILD-OPTIONS +all:: shell_compatibility_test $(ALL_PROGRAMS) $(LANG_BINDINGS) $(BUILT_INS) $(OTHER_PROGRAMS) $(OUTPUT)PERF-BUILD-OPTIONS ifneq (,$X) $(foreach p,$(patsubst %$X,%,$(filter %$X,$(ALL_PROGRAMS) $(BUILT_INS) perf$X)), test '$p' -ef '$p$X' || $(RM) '$p';) endif From cdb0861c85c03fe80f4da033aab69df949579dc6 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Tue, 1 Feb 2011 10:51:23 -0800 Subject: [PATCH 133/706] perf top: Fix TUI compilation MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit > + slsmg_write_nstring(width >= syme->map->dso->long_name_len ? > + syme->map->dso->long_name : > + syme->map->dso->short_name, width); The macros need to be updated for that call; otherwise the build fails: util/ui/browsers/top.c: In function ‘perf_top_browser__write’: util/ui/browsers/top.c:60:2: error: cast to pointer from integer of different size util/ui/browsers/top.c:60:2: error: comparison between pointer and integer util/ui/browsers/top.c:60:2: error: passing argument 1 of ‘SLsmg_write_nstring’ discards qualifiers from pointer target type /usr/include/slang.h:1728:16: note: expected ‘char *’ but argument is of type ‘const char *’ make: *** [util/ui/browsers/top.o] Error 1 Cc: Frederic Weisbecker Cc: H.
Peter Anvin Cc: Ingo Molnar Cc: Mike Galbraith Cc: Paul Mackerras Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Thomas Gleixner Cc: Tom Zanussi LKML-Reference: <4D48562B.20006@kernel.org> Signed-off-by: Yinghai Lu Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/ui/libslang.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tools/perf/util/ui/libslang.h b/tools/perf/util/ui/libslang.h index 5623da8e8080..2b63e1c9b181 100644 --- a/tools/perf/util/ui/libslang.h +++ b/tools/perf/util/ui/libslang.h @@ -13,11 +13,11 @@ #if SLANG_VERSION < 20104 #define slsmg_printf(msg, args...) \ - SLsmg_printf((char *)msg, ##args) + SLsmg_printf((char *)(msg), ##args) #define slsmg_write_nstring(msg, len) \ - SLsmg_write_nstring((char *)msg, len) + SLsmg_write_nstring((char *)(msg), len) #define sltt_set_color(obj, name, fg, bg) \ - SLtt_set_color(obj,(char *)name, (char *)fg, (char *)bg) + SLtt_set_color(obj,(char *)(name), (char *)(fg), (char *)(bg)) #else #define slsmg_printf SLsmg_printf #define slsmg_write_nstring SLsmg_write_nstring From 1e6d767924c74929c0cfe839ae8f37bcee9e544e Mon Sep 17 00:00:00 2001 From: Richard Cochran Date: Tue, 1 Feb 2011 13:50:58 +0000 Subject: [PATCH 134/706] time: Correct the *settime* parameters Both settimeofday() and clock_settime() promise with a 'const' attribute not to alter the arguments passed in. This patch adds the missing 'const' attribute into the various kernel functions implementing these calls. Signed-off-by: Richard Cochran Acked-by: John Stultz LKML-Reference: <20110201134417.545698637@linutronix.de> Signed-off-by: Thomas Gleixner --- drivers/char/mmtimer.c | 2 +- include/linux/posix-timers.h | 5 +++-- include/linux/security.h | 9 +++++---- include/linux/time.h | 5 +++-- kernel/posix-timers.c | 4 ++-- kernel/time.c | 2 +- kernel/time/timekeeping.c | 2 +- security/commoncap.c | 2 +- security/security.c | 2 +- 9 files changed, 18 insertions(+), 15 deletions(-) diff --git a/drivers/char/mmtimer.c b/drivers/char/mmtimer.c index e6d75627c6c8..ecd0082502ef 100644 --- a/drivers/char/mmtimer.c +++ b/drivers/char/mmtimer.c @@ -487,7 +487,7 @@ static int sgi_clock_get(clockid_t clockid, struct timespec *tp) return 0; }; -static int sgi_clock_set(clockid_t clockid, struct timespec *tp) +static int sgi_clock_set(const clockid_t clockid, const struct timespec *tp) { u64 nsec; diff --git a/include/linux/posix-timers.h b/include/linux/posix-timers.h index 3e23844a6990..b2c14cbd47a6 100644 --- a/include/linux/posix-timers.h +++ b/include/linux/posix-timers.h @@ -69,7 +69,8 @@ struct k_itimer { struct k_clock { int res; /* in nanoseconds */ int (*clock_getres) (const clockid_t which_clock, struct timespec *tp); - int (*clock_set) (const clockid_t which_clock, struct timespec * tp); + int (*clock_set) (const clockid_t which_clock, + const struct timespec *tp); int (*clock_get) (const clockid_t which_clock, struct timespec * tp); int (*timer_create) (struct k_itimer *timer); int (*nsleep) (const clockid_t which_clock, int flags, @@ -89,7 +90,7 @@ void register_posix_clock(const clockid_t clock_id, struct k_clock *new_clock); /* error handlers for timer_create, nanosleep and settime */ int do_posix_clock_nonanosleep(const clockid_t, int flags, struct timespec *, struct timespec __user *); -int do_posix_clock_nosettime(const clockid_t, struct timespec *tp); +int do_posix_clock_nosettime(const clockid_t, const struct timespec *tp); /* function to call to trigger timer event */ int posix_timer_event(struct k_itimer *timr, int si_private); diff --git 
a/include/linux/security.h b/include/linux/security.h index c642bb8b8f5a..c096aa6fca60 100644 --- a/include/linux/security.h +++ b/include/linux/security.h @@ -53,7 +53,7 @@ struct audit_krule; */ extern int cap_capable(struct task_struct *tsk, const struct cred *cred, int cap, int audit); -extern int cap_settime(struct timespec *ts, struct timezone *tz); +extern int cap_settime(const struct timespec *ts, const struct timezone *tz); extern int cap_ptrace_access_check(struct task_struct *child, unsigned int mode); extern int cap_ptrace_traceme(struct task_struct *parent); extern int cap_capget(struct task_struct *target, kernel_cap_t *effective, kernel_cap_t *inheritable, kernel_cap_t *permitted); @@ -1387,7 +1387,7 @@ struct security_operations { int (*quotactl) (int cmds, int type, int id, struct super_block *sb); int (*quota_on) (struct dentry *dentry); int (*syslog) (int type); - int (*settime) (struct timespec *ts, struct timezone *tz); + int (*settime) (const struct timespec *ts, const struct timezone *tz); int (*vm_enough_memory) (struct mm_struct *mm, long pages); int (*bprm_set_creds) (struct linux_binprm *bprm); @@ -1669,7 +1669,7 @@ int security_sysctl(struct ctl_table *table, int op); int security_quotactl(int cmds, int type, int id, struct super_block *sb); int security_quota_on(struct dentry *dentry); int security_syslog(int type); -int security_settime(struct timespec *ts, struct timezone *tz); +int security_settime(const struct timespec *ts, const struct timezone *tz); int security_vm_enough_memory(long pages); int security_vm_enough_memory_mm(struct mm_struct *mm, long pages); int security_vm_enough_memory_kern(long pages); @@ -1904,7 +1904,8 @@ static inline int security_syslog(int type) return 0; } -static inline int security_settime(struct timespec *ts, struct timezone *tz) +static inline int security_settime(const struct timespec *ts, + const struct timezone *tz) { return cap_settime(ts, tz); } diff --git a/include/linux/time.h b/include/linux/time.h index 38c5206c2673..7c44e7778033 100644 --- a/include/linux/time.h +++ b/include/linux/time.h @@ -145,8 +145,9 @@ static inline u32 arch_gettimeoffset(void) { return 0; } #endif extern void do_gettimeofday(struct timeval *tv); -extern int do_settimeofday(struct timespec *tv); -extern int do_sys_settimeofday(struct timespec *tv, struct timezone *tz); +extern int do_settimeofday(const struct timespec *tv); +extern int do_sys_settimeofday(const struct timespec *tv, + const struct timezone *tz); #define do_posix_clock_monotonic_gettime(ts) ktime_get_ts(ts) extern long do_utimes(int dfd, const char __user *filename, struct timespec *times, int flags); struct itimerval; diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c index 93bd2eb2bc53..21b7ca205f38 100644 --- a/kernel/posix-timers.c +++ b/kernel/posix-timers.c @@ -192,7 +192,7 @@ static int common_clock_get(clockid_t which_clock, struct timespec *tp) } static inline int common_clock_set(const clockid_t which_clock, - struct timespec *tp) + const struct timespec *tp) { return do_sys_settimeofday(tp, NULL); } @@ -928,7 +928,7 @@ void exit_itimers(struct signal_struct *sig) } /* Not available / possible... 
functions */ -int do_posix_clock_nosettime(const clockid_t clockid, struct timespec *tp) +int do_posix_clock_nosettime(const clockid_t clockid, const struct timespec *tp) { return -EINVAL; } diff --git a/kernel/time.c b/kernel/time.c index a31b51220ac6..5cb80533d8b5 100644 --- a/kernel/time.c +++ b/kernel/time.c @@ -150,7 +150,7 @@ static inline void warp_clock(void) * various programs will get confused when the clock gets warped. */ -int do_sys_settimeofday(struct timespec *tv, struct timezone *tz) +int do_sys_settimeofday(const struct timespec *tv, const struct timezone *tz) { static int firsttime = 1; int error = 0; diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 02c13a313d15..4f9f65b91323 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -353,7 +353,7 @@ EXPORT_SYMBOL(do_gettimeofday); * * Sets the time of day to the new time and update NTP and notify hrtimers */ -int do_settimeofday(struct timespec *tv) +int do_settimeofday(const struct timespec *tv) { struct timespec ts_delta; unsigned long flags; diff --git a/security/commoncap.c b/security/commoncap.c index 64c2ed9c9015..dbfdaed4cc66 100644 --- a/security/commoncap.c +++ b/security/commoncap.c @@ -93,7 +93,7 @@ int cap_capable(struct task_struct *tsk, const struct cred *cred, int cap, * Determine whether the current process may set the system clock and timezone * information, returning 0 if permission granted, -ve if denied. */ -int cap_settime(struct timespec *ts, struct timezone *tz) +int cap_settime(const struct timespec *ts, const struct timezone *tz) { if (!capable(CAP_SYS_TIME)) return -EPERM; diff --git a/security/security.c b/security/security.c index 739e40362f44..b995428f1c96 100644 --- a/security/security.c +++ b/security/security.c @@ -202,7 +202,7 @@ int security_syslog(int type) return security_ops->syslog(type); } -int security_settime(struct timespec *ts, struct timezone *tz) +int security_settime(const struct timespec *ts, const struct timezone *tz) { return security_ops->settime(ts, tz); } From 65da528d7cc94966cf24d2a1e0837b689159b543 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 1 Feb 2011 13:51:01 +0000 Subject: [PATCH 135/706] posix-timers: Define nanosleep not supported error separate Define the conditional nanosleep not supported error value outside of do_posix_clock_nonanosleep(). Preparatory patch for further cleanups. Signed-off-by: Thomas Gleixner Acked-by: John Stultz Tested-by: Richard Cochran LKML-Reference: <20110201134417.643486574@linutronix.de> --- kernel/posix-timers.c | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c index 21b7ca205f38..89bff3766d7d 100644 --- a/kernel/posix-timers.c +++ b/kernel/posix-timers.c @@ -81,6 +81,14 @@ static DEFINE_SPINLOCK(idr_lock); #error "SIGEV_THREAD_ID must not share bit with other SIGEV values!" #endif +/* + * parisc wants ENOTSUP instead of EOPNOTSUPP + */ +#ifndef ENOTSUP +# define ENANOSLEEP_NOTSUP EOPNOTSUPP +#else +# define ENANOSLEEP_NOTSUP ENOTSUP +#endif /* * The timer ID is turned into a timer address by idr_find(). @@ -937,11 +945,7 @@ EXPORT_SYMBOL_GPL(do_posix_clock_nosettime); int do_posix_clock_nonanosleep(const clockid_t clock, int flags, struct timespec *t, struct timespec __user *r) { -#ifndef ENOTSUP - return -EOPNOTSUPP; /* aka ENOTSUP in userland for POSIX */ -#else /* parisc does define it separately. 
*/ - return -ENOTSUP; -#endif + return -ENANOSLEEP_NOTSUP; } EXPORT_SYMBOL_GPL(do_posix_clock_nonanosleep); From 2fd1f04089cb657c5d6c484b280ec4d3398aa157 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 1 Feb 2011 13:51:03 +0000 Subject: [PATCH 136/706] posix-timers: Cleanup struct initializers Cosmetic. No functional change Signed-off-by: Thomas Gleixner Acked-by: John Stultz Tested-by: Richard Cochran LKML-Reference: <20110201134417.745627057@linutronix.de> --- drivers/char/mmtimer.c | 14 +++++++------- kernel/posix-cpu-timers.c | 24 ++++++++++++------------ kernel/posix-timers.c | 38 +++++++++++++++++++------------------- 3 files changed, 38 insertions(+), 38 deletions(-) diff --git a/drivers/char/mmtimer.c b/drivers/char/mmtimer.c index ecd0082502ef..fd51cd8ee063 100644 --- a/drivers/char/mmtimer.c +++ b/drivers/char/mmtimer.c @@ -765,13 +765,13 @@ static int sgi_timer_set(struct k_itimer *timr, int flags, static struct k_clock sgi_clock = { .res = 0, - .clock_set = sgi_clock_set, - .clock_get = sgi_clock_get, - .timer_create = sgi_timer_create, - .nsleep = do_posix_clock_nonanosleep, - .timer_set = sgi_timer_set, - .timer_del = sgi_timer_del, - .timer_get = sgi_timer_get + .clock_set = sgi_clock_set, + .clock_get = sgi_clock_get, + .timer_create = sgi_timer_create, + .nsleep = do_posix_clock_nonanosleep, + .timer_set = sgi_timer_set, + .timer_del = sgi_timer_del, + .timer_get = sgi_timer_get }; /** diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c index 05bb7173850e..11b91dc0992b 100644 --- a/kernel/posix-cpu-timers.c +++ b/kernel/posix-cpu-timers.c @@ -1607,20 +1607,20 @@ static long thread_cpu_nsleep_restart(struct restart_block *restart_block) static __init int init_posix_cpu_timers(void) { struct k_clock process = { - .clock_getres = process_cpu_clock_getres, - .clock_get = process_cpu_clock_get, - .clock_set = do_posix_clock_nosettime, - .timer_create = process_cpu_timer_create, - .nsleep = process_cpu_nsleep, - .nsleep_restart = process_cpu_nsleep_restart, + .clock_getres = process_cpu_clock_getres, + .clock_get = process_cpu_clock_get, + .clock_set = do_posix_clock_nosettime, + .timer_create = process_cpu_timer_create, + .nsleep = process_cpu_nsleep, + .nsleep_restart = process_cpu_nsleep_restart, }; struct k_clock thread = { - .clock_getres = thread_cpu_clock_getres, - .clock_get = thread_cpu_clock_get, - .clock_set = do_posix_clock_nosettime, - .timer_create = thread_cpu_timer_create, - .nsleep = thread_cpu_nsleep, - .nsleep_restart = thread_cpu_nsleep_restart, + .clock_getres = thread_cpu_clock_getres, + .clock_get = thread_cpu_clock_get, + .clock_set = do_posix_clock_nosettime, + .timer_create = thread_cpu_timer_create, + .nsleep = thread_cpu_nsleep, + .nsleep_restart = thread_cpu_nsleep_restart, }; struct timespec ts; diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c index 89bff3766d7d..e7d26afd8ee5 100644 --- a/kernel/posix-timers.c +++ b/kernel/posix-timers.c @@ -281,33 +281,33 @@ static int posix_get_coarse_res(const clockid_t which_clock, struct timespec *tp static __init int init_posix_timers(void) { struct k_clock clock_realtime = { - .clock_getres = hrtimer_get_res, + .clock_getres = hrtimer_get_res, }; struct k_clock clock_monotonic = { - .clock_getres = hrtimer_get_res, - .clock_get = posix_ktime_get_ts, - .clock_set = do_posix_clock_nosettime, + .clock_getres = hrtimer_get_res, + .clock_get = posix_ktime_get_ts, + .clock_set = do_posix_clock_nosettime, }; struct k_clock clock_monotonic_raw = { - .clock_getres = hrtimer_get_res, 
- .clock_get = posix_get_monotonic_raw, - .clock_set = do_posix_clock_nosettime, - .timer_create = no_timer_create, - .nsleep = no_nsleep, + .clock_getres = hrtimer_get_res, + .clock_get = posix_get_monotonic_raw, + .clock_set = do_posix_clock_nosettime, + .timer_create = no_timer_create, + .nsleep = no_nsleep, }; struct k_clock clock_realtime_coarse = { - .clock_getres = posix_get_coarse_res, - .clock_get = posix_get_realtime_coarse, - .clock_set = do_posix_clock_nosettime, - .timer_create = no_timer_create, - .nsleep = no_nsleep, + .clock_getres = posix_get_coarse_res, + .clock_get = posix_get_realtime_coarse, + .clock_set = do_posix_clock_nosettime, + .timer_create = no_timer_create, + .nsleep = no_nsleep, }; struct k_clock clock_monotonic_coarse = { - .clock_getres = posix_get_coarse_res, - .clock_get = posix_get_monotonic_coarse, - .clock_set = do_posix_clock_nosettime, - .timer_create = no_timer_create, - .nsleep = no_nsleep, + .clock_getres = posix_get_coarse_res, + .clock_get = posix_get_monotonic_coarse, + .clock_set = do_posix_clock_nosettime, + .timer_create = no_timer_create, + .nsleep = no_nsleep, }; register_posix_clock(CLOCK_REALTIME, &clock_realtime); From 1976945eeaab5fa461735a6225a82c3cf1e65d62 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 1 Feb 2011 13:51:06 +0000 Subject: [PATCH 137/706] posix-timers: Introduce clock_posix_cpu The CLOCK_DISPATCH() macro is horrible magic. We call common functions if a function pointer is not set. That's just backwards. To support dynamic file descriptor based clocks we need to clean up that dispatch logic. Create a k_clock struct clock_posix_cpu which has all the posix-cpu-timer functions filled in. After the cleanup the functions can be made static. Signed-off-by: Thomas Gleixner Acked-by: John Stultz Tested-by: Richard Cochran LKML-Reference: <20110201134417.841974553@linutronix.de> --- include/linux/posix-timers.h | 2 ++ kernel/posix-cpu-timers.c | 12 ++++++++++++ 2 files changed, 14 insertions(+) diff --git a/include/linux/posix-timers.h b/include/linux/posix-timers.h index b2c14cbd47a6..1330ff331526 100644 --- a/include/linux/posix-timers.h +++ b/include/linux/posix-timers.h @@ -85,6 +85,8 @@ struct k_clock { struct itimerspec * cur_setting); }; +extern struct k_clock clock_posix_cpu; + void register_posix_clock(const clockid_t clock_id, struct k_clock *new_clock); /* error handlers for timer_create, nanosleep and settime */ diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c index 11b91dc0992b..816cd49a5ad9 100644 --- a/kernel/posix-cpu-timers.c +++ b/kernel/posix-cpu-timers.c @@ -1604,6 +1604,18 @@ static long thread_cpu_nsleep_restart(struct restart_block *restart_block) return -EINVAL; } +struct k_clock clock_posix_cpu = { + .clock_getres = posix_cpu_clock_getres, + .clock_set = posix_cpu_clock_set, + .clock_get = posix_cpu_clock_get, + .timer_create = posix_cpu_timer_create, + .nsleep = posix_cpu_nsleep, + .nsleep_restart = posix_cpu_nsleep_restart, + .timer_set = posix_cpu_timer_set, + .timer_del = posix_cpu_timer_del, + .timer_get = posix_cpu_timer_get, +}; + static __init int init_posix_cpu_timers(void) { struct k_clock process = { From cc785ac22b17ed53e8ff5c1501e422be6d10be3c Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 1 Feb 2011 13:51:09 +0000 Subject: [PATCH 138/706] posix-timers: Introduce clockid_to_kclock() New function to find the kclock for a given clockid. Returns a pointer to clock_posix_cpu if clockid < 0.
If clockid >= MAX_CLOCKS or if the clock_getres pointer is not set it returns NULL. For valid clocks it returns a pointer to the matching posix_clocks[] entry. Signed-off-by: Thomas Gleixner Cc: John Stultz Acked-by: Richard Cochran LKML-Reference: <20110201134417.938447839@linutronix.de> --- kernel/posix-timers.c | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c index e7d26afd8ee5..14b0a70ffb1e 100644 --- a/kernel/posix-timers.c +++ b/kernel/posix-timers.c @@ -531,6 +531,16 @@ static void release_posix_timer(struct k_itimer *tmr, int it_id_set) kmem_cache_free(posix_timers_cache, tmr); } +static struct k_clock *clockid_to_kclock(const clockid_t id) +{ + if (id < 0) + return &clock_posix_cpu; + + if (id >= MAX_CLOCKS || !posix_clocks[id].clock_getres) + return NULL; + return &posix_clocks[id]; +} + /* Create a POSIX.1b interval timer. */ SYSCALL_DEFINE3(timer_create, const clockid_t, which_clock, From a5cd2880106cb2c79b3fe24f1c53dadba6a542a0 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 1 Feb 2011 13:51:11 +0000 Subject: [PATCH 139/706] posix-timers: Convert clock_nanosleep to clockid_to_kclock() Use the new kclock decoding function in clock_nanosleep and clean up all kclocks which use the default functions. Signed-off-by: Thomas Gleixner Acked-by: John Stultz Tested-by: Richard Cochran LKML-Reference: <20110201134418.034175556@linutronix.de> --- drivers/char/mmtimer.c | 1 - include/linux/posix-timers.h | 2 -- kernel/posix-timers.c | 26 +++++++------------------- 3 files changed, 7 insertions(+), 22 deletions(-) diff --git a/drivers/char/mmtimer.c b/drivers/char/mmtimer.c index fd51cd8ee063..262d10453cb8 100644 --- a/drivers/char/mmtimer.c +++ b/drivers/char/mmtimer.c @@ -768,7 +768,6 @@ static struct k_clock sgi_clock = { .clock_set = sgi_clock_set, .clock_get = sgi_clock_get, .timer_create = sgi_timer_create, - .nsleep = do_posix_clock_nonanosleep, .timer_set = sgi_timer_set, .timer_del = sgi_timer_del, .timer_get = sgi_timer_get diff --git a/include/linux/posix-timers.h b/include/linux/posix-timers.h index 1330ff331526..cd6da067bce1 100644 --- a/include/linux/posix-timers.h +++ b/include/linux/posix-timers.h @@ -90,8 +90,6 @@ extern struct k_clock clock_posix_cpu; void register_posix_clock(const clockid_t clock_id, struct k_clock *new_clock); /* error handlers for timer_create, nanosleep and settime */ -int do_posix_clock_nonanosleep(const clockid_t, int flags, struct timespec *, - struct timespec __user *); int do_posix_clock_nosettime(const clockid_t, const struct timespec *tp); /* function to call to trigger timer event */ diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c index 14b0a70ffb1e..ee69b216d5c3 100644 --- a/kernel/posix-timers.c +++ b/kernel/posix-timers.c @@ -216,12 +216,6 @@ static int no_timer_create(struct k_itimer *new_timer) return -EOPNOTSUPP; } -static int no_nsleep(const clockid_t which_clock, int flags, - struct timespec *tsave, struct timespec __user *rmtp) -{ - return -EOPNOTSUPP; -} - /* * Return nonzero if we know a priori this clockid_t value is bogus.
*/ @@ -282,32 +276,31 @@ static __init int init_posix_timers(void) { struct k_clock clock_realtime = { .clock_getres = hrtimer_get_res, + .nsleep = common_nsleep, }; struct k_clock clock_monotonic = { .clock_getres = hrtimer_get_res, .clock_get = posix_ktime_get_ts, .clock_set = do_posix_clock_nosettime, + .nsleep = common_nsleep, }; struct k_clock clock_monotonic_raw = { .clock_getres = hrtimer_get_res, .clock_get = posix_get_monotonic_raw, .clock_set = do_posix_clock_nosettime, .timer_create = no_timer_create, - .nsleep = no_nsleep, }; struct k_clock clock_realtime_coarse = { .clock_getres = posix_get_coarse_res, .clock_get = posix_get_realtime_coarse, .clock_set = do_posix_clock_nosettime, .timer_create = no_timer_create, - .nsleep = no_nsleep, }; struct k_clock clock_monotonic_coarse = { .clock_getres = posix_get_coarse_res, .clock_get = posix_get_monotonic_coarse, .clock_set = do_posix_clock_nosettime, .timer_create = no_timer_create, - .nsleep = no_nsleep, }; register_posix_clock(CLOCK_REALTIME, &clock_realtime); @@ -952,13 +945,6 @@ int do_posix_clock_nosettime(const clockid_t clockid, const struct timespec *tp) } EXPORT_SYMBOL_GPL(do_posix_clock_nosettime); -int do_posix_clock_nonanosleep(const clockid_t clock, int flags, - struct timespec *t, struct timespec __user *r) -{ - return -ENANOSLEEP_NOTSUP; -} -EXPORT_SYMBOL_GPL(do_posix_clock_nonanosleep); - SYSCALL_DEFINE2(clock_settime, const clockid_t, which_clock, const struct timespec __user *, tp) { @@ -1023,10 +1009,13 @@ SYSCALL_DEFINE4(clock_nanosleep, const clockid_t, which_clock, int, flags, const struct timespec __user *, rqtp, struct timespec __user *, rmtp) { + struct k_clock *kc = clockid_to_kclock(which_clock); struct timespec t; - if (invalid_clockid(which_clock)) + if (!kc) return -EINVAL; + if (!kc->nsleep) + return -ENANOSLEEP_NOTSUP; if (copy_from_user(&t, rqtp, sizeof (struct timespec))) return -EFAULT; @@ -1034,8 +1023,7 @@ SYSCALL_DEFINE4(clock_nanosleep, const clockid_t, which_clock, int, flags, if (!timespec_valid(&t)) return -EINVAL; - return CLOCK_DISPATCH(which_clock, nsleep, - (which_clock, flags, &t, rmtp)); + return kc->nsleep(which_clock, flags, &t, rmtp); } /* From 59bd5bc24aa69f6c62da1e242c16f09f667def96 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 1 Feb 2011 13:51:17 +0000 Subject: [PATCH 140/706] posix-timers: Convert clock_nanosleep_restart to clockid_to_kclock() Use the new kclock decoding function in clock_nanosleep_restart. 
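For context, the restart machinery hangs off struct restart_block in the thread info; the nanosleep paths park their state there so an interrupted sleep can be resumed via the fn pointer. A condensed view of the two conventions involved (field names as in the diffs of this series; other union members omitted):

    struct restart_block {
    	long (*fn)(struct restart_block *);	/* e.g. clock_nanosleep_restart */
    	union {
    		/* legacy slots, removed later in this series */
    		struct {
    			unsigned long arg0, arg1, arg2, arg3;
    		};
    		/* typed replacement used by the nanosleep paths */
    		struct {
    			clockid_t index;		/* which clock to resume on */
    			struct timespec __user *rmtp;	/* where to report remaining time */
    			u64 expires;			/* absolute expiry in ns */
    		} nanosleep;
    	};
    };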
Signed-off-by: Thomas Gleixner Acked-by: John Stultz Tested-by: Richard Cochran LKML-Reference: <20110201134418.131263211@linutronix.de> --- kernel/posix-timers.c | 20 ++++++++------------ 1 file changed, 8 insertions(+), 12 deletions(-) diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c index ee69b216d5c3..4dd86d15bbd0 100644 --- a/kernel/posix-timers.c +++ b/kernel/posix-timers.c @@ -277,12 +277,14 @@ static __init int init_posix_timers(void) struct k_clock clock_realtime = { .clock_getres = hrtimer_get_res, .nsleep = common_nsleep, + .nsleep_restart = hrtimer_nanosleep_restart, }; struct k_clock clock_monotonic = { .clock_getres = hrtimer_get_res, .clock_get = posix_ktime_get_ts, .clock_set = do_posix_clock_nosettime, .nsleep = common_nsleep, + .nsleep_restart = hrtimer_nanosleep_restart, }; struct k_clock clock_monotonic_raw = { .clock_getres = hrtimer_get_res, @@ -1026,23 +1028,17 @@ SYSCALL_DEFINE4(clock_nanosleep, const clockid_t, which_clock, int, flags, return kc->nsleep(which_clock, flags, &t, rmtp); } -/* - * nanosleep_restart for monotonic and realtime clocks - */ -static int common_nsleep_restart(struct restart_block *restart_block) -{ - return hrtimer_nanosleep_restart(restart_block); -} - /* * This will restart clock_nanosleep. This is required only by * compat_clock_nanosleep_restart for now. */ -long -clock_nanosleep_restart(struct restart_block *restart_block) +long clock_nanosleep_restart(struct restart_block *restart_block) { clockid_t which_clock = restart_block->arg0; + struct k_clock *kc = clockid_to_kclock(which_clock); - return CLOCK_DISPATCH(which_clock, nsleep_restart, - (restart_block)); + if (WARN_ON_ONCE(!kc || !kc->nsleep_restart)) + return -EINVAL; + + return kc->nsleep_restart(restart_block); } From 3751f9f29bcbc19bd10e92254a273486f150c245 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 1 Feb 2011 13:51:20 +0000 Subject: [PATCH 141/706] posix-timers: Cleanup restart_block usage posix timers still use the legacy arg0-arg3 members of restart_block. Use restart_block.nanosleep instead Signed-off-by: Thomas Gleixner Acked-by: John Stultz Tested-by: Richard Cochran LKML-Reference: <20110201134418.232288779@linutronix.de> --- kernel/posix-cpu-timers.c | 38 +++++++++++++++----------------------- kernel/posix-timers.c | 2 +- 2 files changed, 16 insertions(+), 24 deletions(-) diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c index 816cd49a5ad9..9e617b00afa9 100644 --- a/kernel/posix-cpu-timers.c +++ b/kernel/posix-cpu-timers.c @@ -1485,7 +1485,7 @@ int posix_cpu_nsleep(const clockid_t which_clock, int flags, struct timespec *rqtp, struct timespec __user *rmtp) { struct restart_block *restart_block = - ¤t_thread_info()->restart_block; + ¤t_thread_info()->restart_block; struct itimerspec it; int error; @@ -1501,50 +1501,42 @@ int posix_cpu_nsleep(const clockid_t which_clock, int flags, if (error == -ERESTART_RESTARTBLOCK) { - if (flags & TIMER_ABSTIME) + if (flags & TIMER_ABSTIME) return -ERESTARTNOHAND; /* - * Report back to the user the time still remaining. - */ - if (rmtp != NULL && copy_to_user(rmtp, &it.it_value, sizeof *rmtp)) + * Report back to the user the time still remaining. 
+ */ + if (rmtp && copy_to_user(rmtp, &it.it_value, sizeof *rmtp)) return -EFAULT; restart_block->fn = posix_cpu_nsleep_restart; - restart_block->arg0 = which_clock; - restart_block->arg1 = (unsigned long) rmtp; - restart_block->arg2 = rqtp->tv_sec; - restart_block->arg3 = rqtp->tv_nsec; + restart_block->nanosleep.index = which_clock; + restart_block->nanosleep.rmtp = rmtp; + restart_block->nanosleep.expires = timespec_to_ns(rqtp); } return error; } long posix_cpu_nsleep_restart(struct restart_block *restart_block) { - clockid_t which_clock = restart_block->arg0; - struct timespec __user *rmtp; + clockid_t which_clock = restart_block->nanosleep.index; struct timespec t; struct itimerspec it; int error; - rmtp = (struct timespec __user *) restart_block->arg1; - t.tv_sec = restart_block->arg2; - t.tv_nsec = restart_block->arg3; + t = ns_to_timespec(restart_block->nanosleep.expires); - restart_block->fn = do_no_restart_syscall; error = do_cpu_nanosleep(which_clock, TIMER_ABSTIME, &t, &it); if (error == -ERESTART_RESTARTBLOCK) { + struct timespec __user *rmtp = restart_block->nanosleep.rmtp; /* - * Report back to the user the time still remaining. - */ - if (rmtp != NULL && copy_to_user(rmtp, &it.it_value, sizeof *rmtp)) + * Report back to the user the time still remaining. + */ + if (rmtp && copy_to_user(rmtp, &it.it_value, sizeof *rmtp)) return -EFAULT; - restart_block->fn = posix_cpu_nsleep_restart; - restart_block->arg0 = which_clock; - restart_block->arg1 = (unsigned long) rmtp; - restart_block->arg2 = t.tv_sec; - restart_block->arg3 = t.tv_nsec; + restart_block->nanosleep.expires = timespec_to_ns(&t); } return error; diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c index 4dd86d15bbd0..4762986814c8 100644 --- a/kernel/posix-timers.c +++ b/kernel/posix-timers.c @@ -1034,7 +1034,7 @@ SYSCALL_DEFINE4(clock_nanosleep, const clockid_t, which_clock, int, flags, */ long clock_nanosleep_restart(struct restart_block *restart_block) { - clockid_t which_clock = restart_block->arg0; + clockid_t which_clock = restart_block->nanosleep.index; struct k_clock *kc = clockid_to_kclock(which_clock); if (WARN_ON_ONCE(!kc || !kc->nsleep_restart)) From d608c18203a969e5d14572a9861c646d0bb66872 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 1 Feb 2011 13:51:43 +0000 Subject: [PATCH 142/706] thread_info: Remove legacy arg0-3 from restart_block posix timers were the last users of the legacy arg0-3 members of restart_block. Remove the cruft. Signed-off-by: Thomas Gleixner Acked-by: John Stultz Tested-by: Richard Cochran LKML-Reference: <20110201134418.326209775@linutronix.de> --- include/linux/thread_info.h | 3 --- 1 file changed, 3 deletions(-) diff --git a/include/linux/thread_info.h b/include/linux/thread_info.h index c90696544176..20fc303947d3 100644 --- a/include/linux/thread_info.h +++ b/include/linux/thread_info.h @@ -18,9 +18,6 @@ struct compat_timespec; struct restart_block { long (*fn)(struct restart_block *); union { - struct { - unsigned long arg0, arg1, arg2, arg3; - }; /* For futex_wait and futex_wait_requeue_pi */ struct { u32 __user *uaddr; From 79c9da0d0539fb341a1b48a2a5a23d974726de90 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 1 Feb 2011 13:51:45 +0000 Subject: [PATCH 143/706] posix-cpu-timers: Remove the stub nanosleep functions CLOCK_THREAD_CPUTIME_ID implements stub functions for nanosleep and nanosleep_restart, which return -EINVAL. That return value is wrong. The correct return value is -ENOTSUP. 
Remove the stubs and let the new dispatch code return the correct error code. Signed-off-by: Thomas Gleixner Acked-by: John Stultz Tested-by: Richard Cochran LKML-Reference: <20110201134418.422446502@linutronix.de> --- kernel/posix-cpu-timers.c | 11 ----------- 1 file changed, 11 deletions(-) diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c index 9e617b00afa9..8dc4cd7faf89 100644 --- a/kernel/posix-cpu-timers.c +++ b/kernel/posix-cpu-timers.c @@ -1586,15 +1586,6 @@ static int thread_cpu_timer_create(struct k_itimer *timer) timer->it_clock = THREAD_CLOCK; return posix_cpu_timer_create(timer); } -static int thread_cpu_nsleep(const clockid_t which_clock, int flags, - struct timespec *rqtp, struct timespec __user *rmtp) -{ - return -EINVAL; -} -static long thread_cpu_nsleep_restart(struct restart_block *restart_block) -{ - return -EINVAL; -} struct k_clock clock_posix_cpu = { .clock_getres = posix_cpu_clock_getres, @@ -1623,8 +1614,6 @@ static __init int init_posix_cpu_timers(void) .clock_get = thread_cpu_clock_get, .clock_set = do_posix_clock_nosettime, .timer_create = thread_cpu_timer_create, - .nsleep = thread_cpu_nsleep, - .nsleep_restart = thread_cpu_nsleep_restart, }; struct timespec ts; From 26f9a4796af330173d790c8d2b5e2efcc489e755 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 1 Feb 2011 13:51:48 +0000 Subject: [PATCH 144/706] posix-timers: Convert clock_settime to clockid_to_kclock() Use the new kclock decoding function in clock_settime and cleanup all kclocks which use the default functions. Rename the misnomed common_clock_set() to posix_clock_realtime_set(). Signed-off-by: Thomas Gleixner Acked-by: John Stultz Tested-by: Richard Cochran LKML-Reference: <20110201134418.518851246@linutronix.de> --- include/linux/posix-timers.h | 3 --- kernel/posix-cpu-timers.c | 2 -- kernel/posix-timers.c | 31 ++++++++++++------------------- 3 files changed, 12 insertions(+), 24 deletions(-) diff --git a/include/linux/posix-timers.h b/include/linux/posix-timers.h index cd6da067bce1..4aaf0c5c7cea 100644 --- a/include/linux/posix-timers.h +++ b/include/linux/posix-timers.h @@ -89,9 +89,6 @@ extern struct k_clock clock_posix_cpu; void register_posix_clock(const clockid_t clock_id, struct k_clock *new_clock); -/* error handlers for timer_create, nanosleep and settime */ -int do_posix_clock_nosettime(const clockid_t, const struct timespec *tp); - /* function to call to trigger timer event */ int posix_timer_event(struct k_itimer *timr, int si_private); diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c index 8dc4cd7faf89..504fbab10b82 100644 --- a/kernel/posix-cpu-timers.c +++ b/kernel/posix-cpu-timers.c @@ -1604,7 +1604,6 @@ static __init int init_posix_cpu_timers(void) struct k_clock process = { .clock_getres = process_cpu_clock_getres, .clock_get = process_cpu_clock_get, - .clock_set = do_posix_clock_nosettime, .timer_create = process_cpu_timer_create, .nsleep = process_cpu_nsleep, .nsleep_restart = process_cpu_nsleep_restart, @@ -1612,7 +1611,6 @@ static __init int init_posix_cpu_timers(void) struct k_clock thread = { .clock_getres = thread_cpu_clock_getres, .clock_get = thread_cpu_clock_get, - .clock_set = do_posix_clock_nosettime, .timer_create = thread_cpu_timer_create, }; struct timespec ts; diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c index 4762986814c8..49f358c37708 100644 --- a/kernel/posix-timers.c +++ b/kernel/posix-timers.c @@ -199,12 +199,6 @@ static int common_clock_get(clockid_t which_clock, struct timespec *tp) return 0; } 
-static inline int common_clock_set(const clockid_t which_clock, - const struct timespec *tp) -{ - return do_sys_settimeofday(tp, NULL); -} - static int common_timer_create(struct k_itimer *new_timer) { hrtimer_init(&new_timer->it.real.timer, new_timer->it_clock, 0); @@ -232,6 +226,13 @@ static inline int invalid_clockid(const clockid_t which_clock) return 1; } +/* Set clock_realtime */ +static int posix_clock_realtime_set(const clockid_t which_clock, + const struct timespec *tp) +{ + return do_sys_settimeofday(tp, NULL); +} + /* * Get monotonic time for posix timers */ @@ -276,32 +277,29 @@ static __init int init_posix_timers(void) { struct k_clock clock_realtime = { .clock_getres = hrtimer_get_res, + .clock_set = posix_clock_realtime_set, .nsleep = common_nsleep, .nsleep_restart = hrtimer_nanosleep_restart, }; struct k_clock clock_monotonic = { .clock_getres = hrtimer_get_res, .clock_get = posix_ktime_get_ts, - .clock_set = do_posix_clock_nosettime, .nsleep = common_nsleep, .nsleep_restart = hrtimer_nanosleep_restart, }; struct k_clock clock_monotonic_raw = { .clock_getres = hrtimer_get_res, .clock_get = posix_get_monotonic_raw, - .clock_set = do_posix_clock_nosettime, .timer_create = no_timer_create, }; struct k_clock clock_realtime_coarse = { .clock_getres = posix_get_coarse_res, .clock_get = posix_get_realtime_coarse, - .clock_set = do_posix_clock_nosettime, .timer_create = no_timer_create, }; struct k_clock clock_monotonic_coarse = { .clock_getres = posix_get_coarse_res, .clock_get = posix_get_monotonic_coarse, - .clock_set = do_posix_clock_nosettime, .timer_create = no_timer_create, }; @@ -940,24 +938,19 @@ void exit_itimers(struct signal_struct *sig) } } -/* Not available / possible... functions */ -int do_posix_clock_nosettime(const clockid_t clockid, const struct timespec *tp) -{ - return -EINVAL; -} -EXPORT_SYMBOL_GPL(do_posix_clock_nosettime); - SYSCALL_DEFINE2(clock_settime, const clockid_t, which_clock, const struct timespec __user *, tp) { + struct k_clock *kc = clockid_to_kclock(which_clock); struct timespec new_tp; - if (invalid_clockid(which_clock)) + if (!kc || !kc->clock_set) return -EINVAL; + if (copy_from_user(&new_tp, tp, sizeof (*tp))) return -EFAULT; - return CLOCK_DISPATCH(which_clock, clock_set, (which_clock, &new_tp)); + return kc->clock_set(which_clock, &new_tp); } SYSCALL_DEFINE2(clock_gettime, const clockid_t, which_clock, From 42285777631aa0654fbb6442057b3e176445c6c5 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 1 Feb 2011 13:51:50 +0000 Subject: [PATCH 145/706] posix-timers: Convert clock_gettime() to clockid_to_kclock() Use the new kclock decoding mechanism and rename the misnomed common_clock_get() to posix_clock_realtime_get(). 
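For reference, the k_clock hooks configured here back the ordinary user-space syscall; a minimal usage example (user-space C, not from the patch; link with -lrt on older glibc):

    #include <stdio.h>
    #include <time.h>

    int main(void)
    {
    	struct timespec ts;

    	/* Each call lands in the corresponding kc->clock_get() in the kernel. */
    	if (clock_gettime(CLOCK_REALTIME, &ts) == 0)
    		printf("realtime: %ld.%09ld\n", (long)ts.tv_sec, ts.tv_nsec);
    	if (clock_gettime(CLOCK_MONOTONIC_RAW, &ts) == 0)
    		printf("raw mono: %ld.%09ld\n", (long)ts.tv_sec, ts.tv_nsec);

    	/* A bogus positive id now fails the clockid_to_kclock() lookup. */
    	if (clock_gettime((clockid_t)4711, &ts) < 0)
    		perror("bogus clock id");
    	return 0;
    }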
Signed-off-by: Thomas Gleixner Acked-by: John Stultz Tested-by: Richard Cochran LKML-Reference: <20110201134418.611097203@linutronix.de> --- kernel/posix-timers.c | 28 +++++++++++++++------------- 1 file changed, 15 insertions(+), 13 deletions(-) diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c index 49f358c37708..d9e5edfe8a1b 100644 --- a/kernel/posix-timers.c +++ b/kernel/posix-timers.c @@ -190,15 +190,6 @@ static inline int common_clock_getres(const clockid_t which_clock, return 0; } -/* - * Get real time for posix timers - */ -static int common_clock_get(clockid_t which_clock, struct timespec *tp) -{ - ktime_get_real_ts(tp); - return 0; -} - static int common_timer_create(struct k_itimer *new_timer) { hrtimer_init(&new_timer->it.real.timer, new_timer->it_clock, 0); @@ -226,6 +217,13 @@ static inline int invalid_clockid(const clockid_t which_clock) return 1; } +/* Get clock_realtime */ +static int posix_clock_realtime_get(clockid_t which_clock, struct timespec *tp) +{ + ktime_get_real_ts(tp); + return 0; +} + /* Set clock_realtime */ static int posix_clock_realtime_set(const clockid_t which_clock, const struct timespec *tp) @@ -277,6 +275,7 @@ static __init int init_posix_timers(void) { struct k_clock clock_realtime = { .clock_getres = hrtimer_get_res, + .clock_get = posix_clock_realtime_get, .clock_set = posix_clock_realtime_set, .nsleep = common_nsleep, .nsleep_restart = hrtimer_nanosleep_restart, @@ -956,18 +955,21 @@ SYSCALL_DEFINE2(clock_settime, const clockid_t, which_clock, SYSCALL_DEFINE2(clock_gettime, const clockid_t, which_clock, struct timespec __user *,tp) { + struct k_clock *kc = clockid_to_kclock(which_clock); struct timespec kernel_tp; int error; - if (invalid_clockid(which_clock)) + if (!kc) return -EINVAL; - error = CLOCK_DISPATCH(which_clock, clock_get, - (which_clock, &kernel_tp)); + if (!kc->clock_get) + return -EOPNOTSUPP; + + error = kc->clock_get(which_clock, &kernel_tp); + if (!error && copy_to_user(tp, &kernel_tp, sizeof (kernel_tp))) error = -EFAULT; return error; - } SYSCALL_DEFINE2(clock_getres, const clockid_t, which_clock, From 4359ac0ace1a2a267927390ad27f781a2f8e0ab8 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 2 Feb 2011 11:45:23 +0100 Subject: [PATCH 146/706] posix-timers: Make clock_getres and clock_get mandatory Richard said: "I would think that we can require k_clocks to provide the read function. This could be checked and enforced in register_posix_clock()." Add checks for clock_getres and clock_get in the register function. 
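A hypothetical registration that satisfies the new checks (sketch only; example_clock_getres(), example_clock_get() and EXAMPLE_CLOCK_ID are made-up names, not from the patch):

    static int example_clock_getres(const clockid_t which_clock,
    				struct timespec *tp)
    {
    	tp->tv_sec = 0;
    	tp->tv_nsec = 1;	/* pretend 1 ns resolution */
    	return 0;
    }

    static int example_clock_get(const clockid_t which_clock,
    			     struct timespec *tp)
    {
    	ktime_get_ts(tp);	/* back the clock with monotonic time */
    	return 0;
    }

    static struct k_clock example_clock = {
    	.clock_getres	= example_clock_getres,	/* now mandatory */
    	.clock_get	= example_clock_get,	/* now mandatory */
    	/* .clock_set, .timer_create etc. remain optional */
    };

    /* register_posix_clock(EXAMPLE_CLOCK_ID, &example_clock); */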
Suggested-by: Richard Cochran Cc: John Stultz Signed-off-by: Thomas Gleixner --- kernel/posix-timers.c | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c index d9e5edfe8a1b..7f66143d1ce5 100644 --- a/kernel/posix-timers.c +++ b/kernel/posix-timers.c @@ -485,7 +485,18 @@ static struct pid *good_sigevent(sigevent_t * event) void register_posix_clock(const clockid_t clock_id, struct k_clock *new_clock) { if ((unsigned) clock_id >= MAX_CLOCKS) { - printk("POSIX clock register failed for clock_id %d\n", + printk(KERN_WARNING "POSIX clock register failed for clock_id %d\n", + clock_id); + return; + } + + if (!new_clock->clock_get) { + printk(KERN_WARNING "POSIX clock id %d lacks clock_get()\n", + clock_id); + return; + } + if (!new_clock->clock_getres) { + printk(KERN_WARNING "POSIX clock id %d lacks clock_getres()\n", clock_id); return; } @@ -961,8 +972,6 @@ SYSCALL_DEFINE2(clock_gettime, const clockid_t, which_clock, if (!kc) return -EINVAL; - if (!kc->clock_get) - return -EOPNOTSUPP; error = kc->clock_get(which_clock, &kernel_tp); From e5e542eea9075dd008993c2ee80b2cc9f31fc494 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 1 Feb 2011 13:51:53 +0000 Subject: [PATCH 147/706] posix-timers: Convert clock_getres() to clockid_to_kclock() Use the new kclock decoding. Fixup the fallout in mmtimer.c Signed-off-by: Thomas Gleixner Acked-by: John Stultz Tested-by: Richard Cochran LKML-Reference: <20110201134418.709802797@linutronix.de> --- drivers/char/mmtimer.c | 10 ++++++++++ kernel/posix-timers.c | 17 ++++------------- 2 files changed, 14 insertions(+), 13 deletions(-) diff --git a/drivers/char/mmtimer.c b/drivers/char/mmtimer.c index 262d10453cb8..141ffaeb976c 100644 --- a/drivers/char/mmtimer.c +++ b/drivers/char/mmtimer.c @@ -53,6 +53,8 @@ MODULE_LICENSE("GPL"); #define RTC_BITS 55 /* 55 bits for this implementation */ +static struct k_clock sgi_clock; + extern unsigned long sn_rtc_cycles_per_second; #define RTC_COUNTER_ADDR ((long *)LOCAL_MMR_ADDR(SH_RTC)) @@ -763,10 +765,18 @@ static int sgi_timer_set(struct k_itimer *timr, int flags, return err; } +static int sgi_clock_getres(const clockid_t which_clock, struct timespec *tp) +{ + tp->tv_sec = 0; + tp->tv_nsec = sgi_clock.res; + return 0; +} + static struct k_clock sgi_clock = { .res = 0, .clock_set = sgi_clock_set, .clock_get = sgi_clock_get, + .clock_getres = sgi_clock_getres, .timer_create = sgi_timer_create, .timer_set = sgi_timer_set, .timer_del = sgi_timer_del, diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c index 7f66143d1ce5..748497fffd0f 100644 --- a/kernel/posix-timers.c +++ b/kernel/posix-timers.c @@ -182,14 +182,6 @@ static inline void unlock_timer(struct k_itimer *timr, unsigned long flags) * the function pointer CALL in struct k_clock. 
*/ -static inline int common_clock_getres(const clockid_t which_clock, - struct timespec *tp) -{ - tp->tv_sec = 0; - tp->tv_nsec = posix_clocks[which_clock].res; - return 0; -} - static int common_timer_create(struct k_itimer *new_timer) { hrtimer_init(&new_timer->it.real.timer, new_timer->it_clock, 0); @@ -984,18 +976,17 @@ SYSCALL_DEFINE2(clock_gettime, const clockid_t, which_clock, SYSCALL_DEFINE2(clock_getres, const clockid_t, which_clock, struct timespec __user *, tp) { + struct k_clock *kc = clockid_to_kclock(which_clock); struct timespec rtn_tp; int error; - if (invalid_clockid(which_clock)) + if (!kc) return -EINVAL; - error = CLOCK_DISPATCH(which_clock, clock_getres, - (which_clock, &rtn_tp)); + error = kc->clock_getres(which_clock, &rtn_tp); - if (!error && tp && copy_to_user(tp, &rtn_tp, sizeof (rtn_tp))) { + if (!error && tp && copy_to_user(tp, &rtn_tp, sizeof (rtn_tp))) error = -EFAULT; - } return error; } From ebaac757acae0431e2c79c00e09f1debdabbddd7 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 1 Feb 2011 13:51:56 +0000 Subject: [PATCH 148/706] posix-timers: Remove useless res field from k_clock The res member of kclock is only used by mmtimer.c, but even there it contains redundant information. Remove the field and fixup mmtimer. Signed-off-by: Thomas Gleixner Acked-by: John Stultz Tested-by: Richard Cochran LKML-Reference: <20110201134418.808714587@linutronix.de> --- drivers/char/mmtimer.c | 5 ++--- include/linux/posix-timers.h | 1 - kernel/posix-timers.c | 2 -- 3 files changed, 2 insertions(+), 6 deletions(-) diff --git a/drivers/char/mmtimer.c b/drivers/char/mmtimer.c index 141ffaeb976c..ff41eb3eec92 100644 --- a/drivers/char/mmtimer.c +++ b/drivers/char/mmtimer.c @@ -768,12 +768,11 @@ static int sgi_timer_set(struct k_itimer *timr, int flags, static int sgi_clock_getres(const clockid_t which_clock, struct timespec *tp) { tp->tv_sec = 0; - tp->tv_nsec = sgi_clock.res; + tp->tv_nsec = sgi_clock_period; return 0; } static struct k_clock sgi_clock = { - .res = 0, .clock_set = sgi_clock_set, .clock_get = sgi_clock_get, .clock_getres = sgi_clock_getres, @@ -840,7 +839,7 @@ static int __init mmtimer_init(void) (unsigned long) node); } - sgi_clock_period = sgi_clock.res = NSEC_PER_SEC / sn_rtc_cycles_per_second; + sgi_clock_period = NSEC_PER_SEC / sn_rtc_cycles_per_second; register_posix_clock(CLOCK_SGI_CYCLE, &sgi_clock); printk(KERN_INFO "%s: v%s, %ld MHz\n", MMTIMER_DESC, MMTIMER_VERSION, diff --git a/include/linux/posix-timers.h b/include/linux/posix-timers.h index 4aaf0c5c7cea..ef574d177fb6 100644 --- a/include/linux/posix-timers.h +++ b/include/linux/posix-timers.h @@ -67,7 +67,6 @@ struct k_itimer { }; struct k_clock { - int res; /* in nanoseconds */ int (*clock_getres) (const clockid_t which_clock, struct timespec *tp); int (*clock_set) (const clockid_t which_clock, const struct timespec *tp); diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c index 748497fffd0f..f9142a99b5cb 100644 --- a/kernel/posix-timers.c +++ b/kernel/posix-timers.c @@ -204,8 +204,6 @@ static inline int invalid_clockid(const clockid_t which_clock) return 1; if (posix_clocks[which_clock].clock_getres != NULL) return 0; - if (posix_clocks[which_clock].res != 0) - return 0; return 1; } From 838394fbf989973ec7f5a0ad82cb6ff09e5c39aa Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 1 Feb 2011 13:51:58 +0000 Subject: [PATCH 149/706] posix-timers: Convert timer_create() to clockid_to_kclock() Setup timer_create for CLOCK_MONOTONIC and CLOCK_REALTIME kclocks and remove the 
no_timer_create() implementation. Signed-off-by: Thomas Gleixner Acked-by: John Stultz Tested-by: Richard Cochran LKML-Reference: <20110201134418.903604289@linutronix.de> --- kernel/posix-timers.c | 40 +++++++++++++++------------------------- 1 file changed, 15 insertions(+), 25 deletions(-) diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c index f9142a99b5cb..4f71382a4ca8 100644 --- a/kernel/posix-timers.c +++ b/kernel/posix-timers.c @@ -146,6 +146,7 @@ static struct k_clock posix_clocks[MAX_CLOCKS]; */ static int common_nsleep(const clockid_t, int flags, struct timespec *t, struct timespec __user *rmtp); +static int common_timer_create(struct k_itimer *new_timer); static void common_timer_get(struct k_itimer *, struct itimerspec *); static int common_timer_set(struct k_itimer *, int, struct itimerspec *, struct itimerspec *); @@ -174,25 +175,6 @@ static inline void unlock_timer(struct k_itimer *timr, unsigned long flags) (posix_clocks[clock].call != NULL \ ? (*posix_clocks[clock].call) arglist : common_##call arglist)) -/* - * Default clock hook functions when the struct k_clock passed - * to register_posix_clock leaves a function pointer null. - * - * The function common_CALL is the default implementation for - * the function pointer CALL in struct k_clock. - */ - -static int common_timer_create(struct k_itimer *new_timer) -{ - hrtimer_init(&new_timer->it.real.timer, new_timer->it_clock, 0); - return 0; -} - -static int no_timer_create(struct k_itimer *new_timer) -{ - return -EOPNOTSUPP; -} - /* * Return nonzero if we know a priori this clockid_t value is bogus. */ @@ -269,27 +251,26 @@ static __init int init_posix_timers(void) .clock_set = posix_clock_realtime_set, .nsleep = common_nsleep, .nsleep_restart = hrtimer_nanosleep_restart, + .timer_create = common_timer_create, }; struct k_clock clock_monotonic = { .clock_getres = hrtimer_get_res, .clock_get = posix_ktime_get_ts, .nsleep = common_nsleep, .nsleep_restart = hrtimer_nanosleep_restart, + .timer_create = common_timer_create, }; struct k_clock clock_monotonic_raw = { .clock_getres = hrtimer_get_res, .clock_get = posix_get_monotonic_raw, - .timer_create = no_timer_create, }; struct k_clock clock_realtime_coarse = { .clock_getres = posix_get_coarse_res, .clock_get = posix_get_realtime_coarse, - .timer_create = no_timer_create, }; struct k_clock clock_monotonic_coarse = { .clock_getres = posix_get_coarse_res, .clock_get = posix_get_monotonic_coarse, - .timer_create = no_timer_create, }; register_posix_clock(CLOCK_REALTIME, &clock_realtime); @@ -534,19 +515,28 @@ static struct k_clock *clockid_to_kclock(const clockid_t id) return &posix_clocks[id]; } +static int common_timer_create(struct k_itimer *new_timer) +{ + hrtimer_init(&new_timer->it.real.timer, new_timer->it_clock, 0); + return 0; +} + /* Create a POSIX.1b interval timer. 
*/ SYSCALL_DEFINE3(timer_create, const clockid_t, which_clock, struct sigevent __user *, timer_event_spec, timer_t __user *, created_timer_id) { + struct k_clock *kc = clockid_to_kclock(which_clock); struct k_itimer *new_timer; int error, new_timer_id; sigevent_t event; int it_id_set = IT_ID_NOT_SET; - if (invalid_clockid(which_clock)) + if (!kc) return -EINVAL; + if (!kc->timer_create) + return -EOPNOTSUPP; new_timer = alloc_posix_timer(); if (unlikely(!new_timer)) @@ -608,7 +598,7 @@ SYSCALL_DEFINE3(timer_create, const clockid_t, which_clock, goto out; } - error = CLOCK_DISPATCH(which_clock, timer_create, (new_timer)); + error = kc->timer_create(new_timer); if (error) goto out; @@ -618,7 +608,7 @@ SYSCALL_DEFINE3(timer_create, const clockid_t, which_clock, spin_unlock_irq(&current->sighand->siglock); return 0; - /* + /* * In the case of the timer belonging to another task, after * the task is unlocked, the timer is owned by the other task * and may cease to exist at any time. Don't use or modify From 27722df16ef143017db55ac7baac1703a68017ff Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 1 Feb 2011 13:52:01 +0000 Subject: [PATCH 150/706] posix-timers: Convert timer_settime() to clockid_to_kclock() Set the common function for CLOCK_MONOTONIC and CLOCK_REALTIME kclocks and use the new decoding function. Signed-off-by: Thomas Gleixner Acked-by: John Stultz Tested-by: Richard Cochran LKML-Reference: <20110201134419.001863714@linutronix.de> --- kernel/posix-timers.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c index 4f71382a4ca8..a4dbfe71c5a5 100644 --- a/kernel/posix-timers.c +++ b/kernel/posix-timers.c @@ -252,6 +252,7 @@ static __init int init_posix_timers(void) .nsleep = common_nsleep, .nsleep_restart = hrtimer_nanosleep_restart, .timer_create = common_timer_create, + .timer_set = common_timer_set, }; struct k_clock clock_monotonic = { .clock_getres = hrtimer_get_res, @@ -259,6 +260,7 @@ static __init int init_posix_timers(void) .nsleep = common_nsleep, .nsleep_restart = hrtimer_nanosleep_restart, .timer_create = common_timer_create, + .timer_set = common_timer_set, }; struct k_clock clock_monotonic_raw = { .clock_getres = hrtimer_get_res, @@ -814,6 +816,7 @@ SYSCALL_DEFINE4(timer_settime, timer_t, timer_id, int, flags, int error = 0; unsigned long flag; struct itimerspec *rtn = old_setting ? &old_spec : NULL; + struct k_clock *kc; if (!new_setting) return -EINVAL; @@ -829,8 +832,11 @@ retry: if (!timr) return -EINVAL; - error = CLOCK_DISPATCH(timr->it_clock, timer_set, - (timr, flags, &new_spec, rtn)); + kc = clockid_to_kclock(timr->it_clock); + if (WARN_ON_ONCE(!kc || !kc->timer_set)) + error = -EINVAL; + else + error = kc->timer_set(timr, flags, &new_spec, rtn); unlock_timer(timr, flag); if (error == TIMER_RETRY) { From a7319fa253a549c4c6528fb550ae6e72a9c83811 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 1 Feb 2011 13:52:04 +0000 Subject: [PATCH 151/706] posix-timers: Convert timer_gettime() to clockid_to_kclock() Set the common function for CLOCK_MONOTONIC and CLOCK_REALTIME kclocks and use the new decoding function.
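For illustration, the syscall surface converted in these patches can be exercised from userspace roughly like this (a minimal sketch against the standard POSIX timer API; not part of the patch itself):

#include <signal.h>
#include <stdio.h>
#include <time.h>

int main(void)
{
        struct sigevent sev = { .sigev_notify = SIGEV_NONE };
        struct itimerspec its = { .it_value = { .tv_sec = 5 } }, left;
        timer_t tid;

        /* timer_create() ends up in kc->timer_create() in the kernel */
        if (timer_create(CLOCK_MONOTONIC, &sev, &tid))
                return 1;
        /* timer_settime()/timer_gettime() hit kc->timer_set/timer_get */
        timer_settime(tid, 0, &its, NULL);
        timer_gettime(tid, &left);
        printf("remaining: %ld.%09ld\n", (long)left.it_value.tv_sec,
               left.it_value.tv_nsec);
        /* timer_delete() hits kc->timer_del */
        return timer_delete(tid);
}

(Link with -lrt on older glibc.)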
Signed-off-by: Thomas Gleixner Acked-by: John Stultz Tested-by: Richard Cochran LKML-Reference: <20110201134419.101243181@linutronix.de> --- kernel/posix-timers.c | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c index a4dbfe71c5a5..c1e2636f9e45 100644 --- a/kernel/posix-timers.c +++ b/kernel/posix-timers.c @@ -253,6 +253,7 @@ static __init int init_posix_timers(void) .nsleep_restart = hrtimer_nanosleep_restart, .timer_create = common_timer_create, .timer_set = common_timer_set, + .timer_get = common_timer_get, }; struct k_clock clock_monotonic = { .clock_getres = hrtimer_get_res, @@ -261,6 +262,7 @@ static __init int init_posix_timers(void) .nsleep_restart = hrtimer_nanosleep_restart, .timer_create = common_timer_create, .timer_set = common_timer_set, + .timer_get = common_timer_get, }; struct k_clock clock_monotonic_raw = { .clock_getres = hrtimer_get_res, @@ -712,22 +714,28 @@ common_timer_get(struct k_itimer *timr, struct itimerspec *cur_setting) SYSCALL_DEFINE2(timer_gettime, timer_t, timer_id, struct itimerspec __user *, setting) { - struct k_itimer *timr; struct itimerspec cur_setting; + struct k_itimer *timr; + struct k_clock *kc; unsigned long flags; + int ret = 0; timr = lock_timer(timer_id, &flags); if (!timr) return -EINVAL; - CLOCK_DISPATCH(timr->it_clock, timer_get, (timr, &cur_setting)); + kc = clockid_to_kclock(timr->it_clock); + if (WARN_ON_ONCE(!kc || !kc->timer_get)) + ret = -EINVAL; + else + kc->timer_get(timr, &cur_setting); unlock_timer(timr, flags); - if (copy_to_user(setting, &cur_setting, sizeof (cur_setting))) + if (!ret && copy_to_user(setting, &cur_setting, sizeof (cur_setting))) return -EFAULT; - return 0; + return ret; } /* From 6761c6702e2c647582e1829abe8cf90794f61d9d Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 1 Feb 2011 13:52:07 +0000 Subject: [PATCH 152/706] posix-timers: Convert timer_delete() to clockid_to_kclock() Set the common function for CLOCK_MONOTONIC and CLOCK_REALTIME kclocks and use the new decoding function. 
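The dispatch shape these conversions converge on is easy to model in plain C; the sketch below is illustrative only, with invented names:

#include <errno.h>
#include <stdio.h>

struct k_ops {
        int (*del)(int id);             /* NULL when unsupported */
};

static int common_del(int id)
{
        printf("deleted timer %d\n", id);
        return 0;
}

static struct k_ops table[] = {
        { .del = common_del },          /* clock with timer support */
        { .del = NULL },                /* clock without timer support */
};

static int timer_del(int clock, int id)
{
        struct k_ops *kc = &table[clock];

        /* mirrors the WARN_ON_ONCE(!kc || !kc->timer_del) check */
        if (!kc->del)
                return -EINVAL;
        return kc->del(id);
}

int main(void)
{
        printf("%d\n", timer_del(0, 1));        /* 0 */
        printf("%d\n", timer_del(1, 1));        /* -EINVAL */
        return 0;
}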
Signed-off-by: Thomas Gleixner Acked-by: John Stultz Tested-by: Richard Cochran LKML-Reference: <20110201134419.198999420@linutronix.de> --- kernel/posix-timers.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c index c1e2636f9e45..ade7dec49f96 100644 --- a/kernel/posix-timers.c +++ b/kernel/posix-timers.c @@ -254,6 +254,7 @@ static __init int init_posix_timers(void) .timer_create = common_timer_create, .timer_set = common_timer_set, .timer_get = common_timer_get, + .timer_del = common_timer_del, }; struct k_clock clock_monotonic = { .clock_getres = hrtimer_get_res, @@ -263,6 +264,7 @@ static __init int init_posix_timers(void) .timer_create = common_timer_create, .timer_set = common_timer_set, .timer_get = common_timer_get, + .timer_del = common_timer_del, }; struct k_clock clock_monotonic_raw = { .clock_getres = hrtimer_get_res, @@ -859,7 +861,7 @@ retry: return error; } -static inline int common_timer_del(struct k_itimer *timer) +static int common_timer_del(struct k_itimer *timer) { timer->it.real.interval.tv64 = 0; @@ -870,7 +872,11 @@ static inline int common_timer_del(struct k_itimer *timer) static inline int timer_delete_hook(struct k_itimer *timer) { - return CLOCK_DISPATCH(timer->it_clock, timer_del, (timer)); + struct k_clock *kc = clockid_to_kclock(timer->it_clock); + + if (WARN_ON_ONCE(!kc || !kc->timer_del)) + return -EINVAL; + return kc->timer_del(timer); } /* Delete a POSIX.1b interval timer. */ From 0aa3975f02ce78f27be3076fbfa3d94ae5a659d5 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 1 Feb 2011 13:52:09 +0000 Subject: [PATCH 153/706] posix-timers: Remove CLOCK_DISPATCH leftovers All users gone. Remove the cruft. Huge thanks to Richard Cochran who tackled that maze first. Signed-off-by: Thomas Gleixner Acked-by: John Stultz Tested-by: Richard Cochran LKML-Reference: <20110201134419.294620613@linutronix.de> --- kernel/posix-timers.c | 22 ---------------------- 1 file changed, 22 deletions(-) diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c index ade7dec49f96..ad154dfd7c51 100644 --- a/kernel/posix-timers.c +++ b/kernel/posix-timers.c @@ -167,28 +167,6 @@ static inline void unlock_timer(struct k_itimer *timr, unsigned long flags) spin_unlock_irqrestore(&timr->it_lock, flags); } -/* - * Call the k_clock hook function if non-null, or the default function. - */ -#define CLOCK_DISPATCH(clock, call, arglist) \ - ((clock) < 0 ? posix_cpu_##call arglist : \ - (posix_clocks[clock].call != NULL \ - ? (*posix_clocks[clock].call) arglist : common_##call arglist)) - -/* - * Return nonzero if we know a priori this clockid_t value is bogus. - */ -static inline int invalid_clockid(const clockid_t which_clock) -{ - if (which_clock < 0) /* CPU clock, posix_cpu_* will check it */ - return 0; - if ((unsigned) which_clock >= MAX_CLOCKS) - return 1; - if (posix_clocks[which_clock].clock_getres != NULL) - return 0; - return 1; -} - /* Get clock_realtime */ static int posix_clock_realtime_get(clockid_t which_clock, struct timespec *tp) { From bc2c8ea483d73e95fc88f1fc9e7755180f89b892 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 1 Feb 2011 13:52:12 +0000 Subject: [PATCH 154/706] posix-timers: Make posix-cpu-timers functions static All functions are accessed via clock_posix_cpu now. So make them static. 
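The CPU clocks remain reachable from userspace through the negative clockid encoding, for example (an illustrative sketch, not part of the patch):

#include <pthread.h>
#include <stdio.h>
#include <time.h>

int main(void)
{
        clockid_t cid;
        struct timespec ts;

        /* negative clockids are routed to clock_posix_cpu in the kernel */
        if (pthread_getcpuclockid(pthread_self(), &cid))
                return 1;
        if (clock_gettime(cid, &ts))
                return 1;
        printf("thread cpu time: %ld.%09ld (clockid %d)\n",
               (long)ts.tv_sec, ts.tv_nsec, (int)cid);
        return 0;
}

(Compile with -pthread.)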
Signed-off-by: Thomas Gleixner Acked-by: John Stultz Tested-by: Richard Cochran LKML-Reference: <20110201134419.389755466@linutronix.de> --- include/linux/posix-timers.h | 12 ------------ kernel/posix-cpu-timers.c | 27 +++++++++++++++------------ 2 files changed, 15 insertions(+), 24 deletions(-) diff --git a/include/linux/posix-timers.h b/include/linux/posix-timers.h index ef574d177fb6..8206255a547c 100644 --- a/include/linux/posix-timers.h +++ b/include/linux/posix-timers.h @@ -91,18 +91,6 @@ void register_posix_clock(const clockid_t clock_id, struct k_clock *new_clock); /* function to call to trigger timer event */ int posix_timer_event(struct k_itimer *timr, int si_private); -int posix_cpu_clock_getres(const clockid_t which_clock, struct timespec *ts); -int posix_cpu_clock_get(const clockid_t which_clock, struct timespec *ts); -int posix_cpu_clock_set(const clockid_t which_clock, const struct timespec *ts); -int posix_cpu_timer_create(struct k_itimer *timer); -int posix_cpu_nsleep(const clockid_t which_clock, int flags, - struct timespec *rqtp, struct timespec __user *rmtp); -long posix_cpu_nsleep_restart(struct restart_block *restart_block); -int posix_cpu_timer_set(struct k_itimer *timer, int flags, - struct itimerspec *new, struct itimerspec *old); -int posix_cpu_timer_del(struct k_itimer *timer); -void posix_cpu_timer_get(struct k_itimer *timer, struct itimerspec *itp); - void posix_cpu_timer_schedule(struct k_itimer *timer); void run_posix_cpu_timers(struct task_struct *task); diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c index 504fbab10b82..609e187f43e7 100644 --- a/kernel/posix-cpu-timers.c +++ b/kernel/posix-cpu-timers.c @@ -176,7 +176,8 @@ static inline cputime_t virt_ticks(struct task_struct *p) return p->utime; } -int posix_cpu_clock_getres(const clockid_t which_clock, struct timespec *tp) +static int +posix_cpu_clock_getres(const clockid_t which_clock, struct timespec *tp) { int error = check_clock(which_clock); if (!error) { @@ -194,7 +195,8 @@ int posix_cpu_clock_getres(const clockid_t which_clock, struct timespec *tp) return error; } -int posix_cpu_clock_set(const clockid_t which_clock, const struct timespec *tp) +static int +posix_cpu_clock_set(const clockid_t which_clock, const struct timespec *tp) { /* * You can never reset a CPU clock, but we check for other errors @@ -317,7 +319,7 @@ static int cpu_clock_sample_group(const clockid_t which_clock, } -int posix_cpu_clock_get(const clockid_t which_clock, struct timespec *tp) +static int posix_cpu_clock_get(const clockid_t which_clock, struct timespec *tp) { const pid_t pid = CPUCLOCK_PID(which_clock); int error = -EINVAL; @@ -379,7 +381,7 @@ int posix_cpu_clock_get(const clockid_t which_clock, struct timespec *tp) * This is called from sys_timer_create() and do_cpu_nanosleep() with the * new timer already all-zeros initialized. */ -int posix_cpu_timer_create(struct k_itimer *new_timer) +static int posix_cpu_timer_create(struct k_itimer *new_timer) { int ret = 0; const pid_t pid = CPUCLOCK_PID(new_timer->it_clock); @@ -425,7 +427,7 @@ int posix_cpu_timer_create(struct k_itimer *new_timer) * If we return TIMER_RETRY, it's necessary to release the timer's lock * and try again. (This happens when the timer is in the middle of firing.) 
*/ -int posix_cpu_timer_del(struct k_itimer *timer) +static int posix_cpu_timer_del(struct k_itimer *timer) { struct task_struct *p = timer->it.cpu.task; int ret = 0; @@ -665,8 +667,8 @@ static int cpu_timer_sample_group(const clockid_t which_clock, * If we return TIMER_RETRY, it's necessary to release the timer's lock * and try again. (This happens when the timer is in the middle of firing.) */ -int posix_cpu_timer_set(struct k_itimer *timer, int flags, - struct itimerspec *new, struct itimerspec *old) +static int posix_cpu_timer_set(struct k_itimer *timer, int flags, + struct itimerspec *new, struct itimerspec *old) { struct task_struct *p = timer->it.cpu.task; union cpu_time_count old_expires, new_expires, old_incr, val; @@ -820,7 +822,7 @@ int posix_cpu_timer_set(struct k_itimer *timer, int flags, return ret; } -void posix_cpu_timer_get(struct k_itimer *timer, struct itimerspec *itp) +static void posix_cpu_timer_get(struct k_itimer *timer, struct itimerspec *itp) { union cpu_time_count now; struct task_struct *p = timer->it.cpu.task; @@ -1481,8 +1483,10 @@ static int do_cpu_nanosleep(const clockid_t which_clock, int flags, return error; } -int posix_cpu_nsleep(const clockid_t which_clock, int flags, - struct timespec *rqtp, struct timespec __user *rmtp) +static long posix_cpu_nsleep_restart(struct restart_block *restart_block); + +static int posix_cpu_nsleep(const clockid_t which_clock, int flags, + struct timespec *rqtp, struct timespec __user *rmtp) { struct restart_block *restart_block = &current_thread_info()->restart_block; @@ -1517,7 +1521,7 @@ int posix_cpu_nsleep(const clockid_t which_clock, int flags, return error; } -long posix_cpu_nsleep_restart(struct restart_block *restart_block) +static long posix_cpu_nsleep_restart(struct restart_block *restart_block) { clockid_t which_clock = restart_block->nanosleep.index; struct timespec t; @@ -1542,7 +1546,6 @@ long posix_cpu_nsleep_restart(struct restart_block *restart_block) } - #define PROCESS_CLOCK MAKE_PROCESS_CPUCLOCK(0, CPUCLOCK_SCHED) #define THREAD_CLOCK MAKE_THREAD_CPUCLOCK(0, CPUCLOCK_SCHED) From 0061748dd2400d0bcd4d49d258db5d7b5d106ca0 Mon Sep 17 00:00:00 2001 From: Richard Cochran Date: Tue, 1 Feb 2011 13:52:15 +0000 Subject: [PATCH 155/706] posix-timer: Update comment Pick up the comment cleanup in posix-timers.c from Richard's all-in-one conversion patch. Originally-from: Richard Cochran Signed-off-by: Thomas Gleixner Acked-by: John Stultz LKML-Reference: <20110201134419.487708516@linutronix.de> --- kernel/posix-timers.c | 25 +++++++------------------ 1 file changed, 7 insertions(+), 18 deletions(-) diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c index ad154dfd7c51..a3fdfd4be0ec 100644 --- a/kernel/posix-timers.c +++ b/kernel/posix-timers.c @@ -102,11 +102,7 @@ static DEFINE_SPINLOCK(idr_lock); /* * CLOCKs: The POSIX standard calls for a couple of clocks and allows us * to implement others. This structure defines the various - * clocks and allows the possibility of adding others. We - * provide an interface to add clocks to the table and expect - * the "arch" code to add at least one clock that is high - * resolution. Here we define the standard CLOCK_REALTIME as a - * 1/HZ resolution clock. + * clocks. * * RESOLUTION: Clock resolution is used to round up timer and interval * times, NOT to report clock times, which are reported with as * necessary code is written. The standard says we should say * something about this issue in the documentation...
* - * FUNCTIONS: The CLOCKs structure defines possible functions to handle - * various clock functions. For clocks that use the standard - * system timer code these entries should be NULL. This will - * allow dispatch without the overhead of indirect function - * calls. CLOCKS that depend on other sources (e.g. WWV or GPS) - * must supply functions here, even if the function just returns - * ENOSYS. The standard POSIX timer management code assumes the - * following: 1.) The k_itimer struct (sched.h) is used for the - * timer. 2.) The list, it_lock, it_clock, it_id and it_pid - * fields are not modified by timer code. + * FUNCTIONS: The CLOCKs structure defines possible functions to + * handle various clock functions. * - * At this time all functions EXCEPT clock_nanosleep can be - * redirected by the CLOCKS structure. Clock_nanosleep is in - * there, but the code ignores it. + * The standard POSIX timer management code assumes the + * following: 1.) The k_itimer struct (sched.h) is used for + * the timer. 2.) The list, it_lock, it_clock, it_id and + * it_pid fields are not modified by timer code. * * Permissions: It is assumed that the clock_settime() function defined * for each clock will take care of permission checks. Some From c528f7c6c208f1fae6b4025957173dec045e5f21 Mon Sep 17 00:00:00 2001 From: John Stultz Date: Tue, 1 Feb 2011 13:52:17 +0000 Subject: [PATCH 156/706] time: Introduce timekeeping_inject_offset This adds a kernel-internal timekeeping interface to add or subtract a fixed amount from CLOCK_REALTIME. With it, kernel users and interfaces no longer have to read the time, add an offset, and then call settimeofday(), which adds extra error compared to simply adding the offset in the kernel timekeeping core. Signed-off-by: John Stultz Signed-off-by: Richard Cochran LKML-Reference: <20110201134419.584311693@linutronix.de> Signed-off-by: Thomas Gleixner --- include/linux/time.h | 1 + kernel/time/timekeeping.c | 36 ++++++++++++++++++++++++++++++++++++ 2 files changed, 37 insertions(+) diff --git a/include/linux/time.h b/include/linux/time.h index 7c44e7778033..379b9037b5b4 100644 --- a/include/linux/time.h +++ b/include/linux/time.h @@ -166,6 +166,7 @@ extern struct timespec timespec_trunc(struct timespec t, unsigned gran); extern int timekeeping_valid_for_hres(void); extern u64 timekeeping_max_deferment(void); extern void timekeeping_leap_insert(int leapsecond); +extern int timekeeping_inject_offset(struct timespec *ts); struct tms; extern void do_sys_times(struct tms *); diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 4f9f65b91323..6262c1d18397 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -387,6 +387,42 @@ int do_settimeofday(const struct timespec *tv) EXPORT_SYMBOL(do_settimeofday); + +/** + * timekeeping_inject_offset - Adds or subtracts from the current time. + * @ts: pointer to the timespec variable containing the offset + * + * Adds or subtracts an offset value from the current time.
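+ *
+ * Note that the offset must be a normalized timespec, i.e.
+ * 0 <= tv_nsec < NSEC_PER_SEC, so subtracting 500 msec would be passed
+ * as { .tv_sec = -1, .tv_nsec = 500000000 } (illustrative example).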
+ */ +int timekeeping_inject_offset(struct timespec *ts) +{ + unsigned long flags; + + if ((unsigned long)ts->tv_nsec >= NSEC_PER_SEC) + return -EINVAL; + + write_seqlock_irqsave(&xtime_lock, flags); + + timekeeping_forward_now(); + + xtime = timespec_add(xtime, *ts); + wall_to_monotonic = timespec_sub(wall_to_monotonic, *ts); + + timekeeper.ntp_error = 0; + ntp_clear(); + + update_vsyscall(&xtime, &wall_to_monotonic, timekeeper.clock, + timekeeper.mult); + + write_sequnlock_irqrestore(&xtime_lock, flags); + + /* signal hrtimers about time change */ + clock_was_set(); + + return 0; +} +EXPORT_SYMBOL(timekeeping_inject_offset); + /** * change_clocksource - Swaps clocksources if a new one is available * From 094aa1881fdc1b8889b442eb3511b31f3ec2b762 Mon Sep 17 00:00:00 2001 From: Richard Cochran Date: Tue, 1 Feb 2011 13:52:20 +0000 Subject: [PATCH 157/706] ntp: Add ADJ_SETOFFSET mode bit This patch adds a new mode bit into the timex structure. When set, the bit instructs the kernel to add the given time value to the current time. Signed-off-by: Richard Cochran Acked-by: John Stultz LKML-Reference: <20110201134320.688829863@linutronix.de> Signed-off-by: Thomas Gleixner --- include/linux/timex.h | 3 ++- kernel/time/ntp.c | 11 +++++++++++ 2 files changed, 13 insertions(+), 1 deletion(-) diff --git a/include/linux/timex.h b/include/linux/timex.h index d23999f9499d..aa60fe7b6ed6 100644 --- a/include/linux/timex.h +++ b/include/linux/timex.h @@ -73,7 +73,7 @@ struct timex { long tolerance; /* clock frequency tolerance (ppm) * (read only) */ - struct timeval time; /* (read only) */ + struct timeval time; /* (read only, except for ADJ_SETOFFSET) */ long tick; /* (modified) usecs between clock ticks */ long ppsfreq; /* pps frequency (scaled ppm) (ro) */ @@ -102,6 +102,7 @@ struct timex { #define ADJ_STATUS 0x0010 /* clock status */ #define ADJ_TIMECONST 0x0020 /* pll time constant */ #define ADJ_TAI 0x0080 /* set TAI offset */ +#define ADJ_SETOFFSET 0x0100 /* add 'time' to current time */ #define ADJ_MICRO 0x1000 /* select microsecond resolution */ #define ADJ_NANO 0x2000 /* select nanosecond resolution */ #define ADJ_TICK 0x4000 /* tick value */ diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c index ed8cfdf16983..5ac593267a26 100644 --- a/kernel/time/ntp.c +++ b/kernel/time/ntp.c @@ -648,6 +648,17 @@ int do_adjtimex(struct timex *txc) hrtimer_cancel(&leap_timer); } + if (txc->modes & ADJ_SETOFFSET) { + struct timespec delta; + if ((unsigned long)txc->time.tv_usec >= NSEC_PER_SEC) + return -EINVAL; + delta.tv_sec = txc->time.tv_sec; + delta.tv_nsec = txc->time.tv_usec; + if (!(txc->modes & ADJ_NANO)) + delta.tv_nsec *= 1000; + timekeeping_inject_offset(&delta); + } + getnstimeofday(&ts); write_seqlock_irq(&xtime_lock); From 65f5d80bdf83ec0d7f3887db10153bf3f36ed73c Mon Sep 17 00:00:00 2001 From: Richard Cochran Date: Tue, 1 Feb 2011 13:52:23 +0000 Subject: [PATCH 158/706] time: Split out compat timex accessors Split out the compat timex accessors into separate functions. Preparatory patch for a new syscall. [ tglx: Split that patch from Richard's "posix-timers: Introduce a syscall for clock tuning."
Keeps the changes strictly separate ] Originally-from: Richard Cochran Acked-by: John Stultz Signed-off-by: Thomas Gleixner LKML-Reference: <20110201134419.772343089@linutronix.de> --- kernel/compat.c | 113 ++++++++++++++++++++++++++++-------------------- 1 file changed, 65 insertions(+), 48 deletions(-) diff --git a/kernel/compat.c b/kernel/compat.c index c9e2ec0b34a8..449e853cf41d 100644 --- a/kernel/compat.c +++ b/kernel/compat.c @@ -52,6 +52,64 @@ static int compat_put_timeval(struct compat_timeval __user *o, put_user(i->tv_usec, &o->tv_usec)) ? -EFAULT : 0; } +static int compat_get_timex(struct timex *txc, struct compat_timex __user *utp) +{ + memset(txc, 0, sizeof(struct timex)); + + if (!access_ok(VERIFY_READ, utp, sizeof(struct compat_timex)) || + __get_user(txc->modes, &utp->modes) || + __get_user(txc->offset, &utp->offset) || + __get_user(txc->freq, &utp->freq) || + __get_user(txc->maxerror, &utp->maxerror) || + __get_user(txc->esterror, &utp->esterror) || + __get_user(txc->status, &utp->status) || + __get_user(txc->constant, &utp->constant) || + __get_user(txc->precision, &utp->precision) || + __get_user(txc->tolerance, &utp->tolerance) || + __get_user(txc->time.tv_sec, &utp->time.tv_sec) || + __get_user(txc->time.tv_usec, &utp->time.tv_usec) || + __get_user(txc->tick, &utp->tick) || + __get_user(txc->ppsfreq, &utp->ppsfreq) || + __get_user(txc->jitter, &utp->jitter) || + __get_user(txc->shift, &utp->shift) || + __get_user(txc->stabil, &utp->stabil) || + __get_user(txc->jitcnt, &utp->jitcnt) || + __get_user(txc->calcnt, &utp->calcnt) || + __get_user(txc->errcnt, &utp->errcnt) || + __get_user(txc->stbcnt, &utp->stbcnt)) + return -EFAULT; + + return 0; +} + +static int compat_put_timex(struct compat_timex __user *utp, struct timex *txc) +{ + if (!access_ok(VERIFY_WRITE, utp, sizeof(struct compat_timex)) || + __put_user(txc->modes, &utp->modes) || + __put_user(txc->offset, &utp->offset) || + __put_user(txc->freq, &utp->freq) || + __put_user(txc->maxerror, &utp->maxerror) || + __put_user(txc->esterror, &utp->esterror) || + __put_user(txc->status, &utp->status) || + __put_user(txc->constant, &utp->constant) || + __put_user(txc->precision, &utp->precision) || + __put_user(txc->tolerance, &utp->tolerance) || + __put_user(txc->time.tv_sec, &utp->time.tv_sec) || + __put_user(txc->time.tv_usec, &utp->time.tv_usec) || + __put_user(txc->tick, &utp->tick) || + __put_user(txc->ppsfreq, &utp->ppsfreq) || + __put_user(txc->jitter, &utp->jitter) || + __put_user(txc->shift, &utp->shift) || + __put_user(txc->stabil, &utp->stabil) || + __put_user(txc->jitcnt, &utp->jitcnt) || + __put_user(txc->calcnt, &utp->calcnt) || + __put_user(txc->errcnt, &utp->errcnt) || + __put_user(txc->stbcnt, &utp->stbcnt) || + __put_user(txc->tai, &utp->tai)) + return -EFAULT; + return 0; +} + asmlinkage long compat_sys_gettimeofday(struct compat_timeval __user *tv, struct timezone __user *tz) { @@ -951,58 +1009,17 @@ asmlinkage long compat_sys_rt_sigsuspend(compat_sigset_t __user *unewset, compat asmlinkage long compat_sys_adjtimex(struct compat_timex __user *utp) { struct timex txc; - int ret; + int err, ret; - memset(&txc, 0, sizeof(struct timex)); - - if (!access_ok(VERIFY_READ, utp, sizeof(struct compat_timex)) || - __get_user(txc.modes, &utp->modes) || - __get_user(txc.offset, &utp->offset) || - __get_user(txc.freq, &utp->freq) || - __get_user(txc.maxerror, &utp->maxerror) || - __get_user(txc.esterror, &utp->esterror) || - __get_user(txc.status, &utp->status) || - __get_user(txc.constant, &utp->constant) || - 
__get_user(txc.precision, &utp->precision) || - __get_user(txc.tolerance, &utp->tolerance) || - __get_user(txc.time.tv_sec, &utp->time.tv_sec) || - __get_user(txc.time.tv_usec, &utp->time.tv_usec) || - __get_user(txc.tick, &utp->tick) || - __get_user(txc.ppsfreq, &utp->ppsfreq) || - __get_user(txc.jitter, &utp->jitter) || - __get_user(txc.shift, &utp->shift) || - __get_user(txc.stabil, &utp->stabil) || - __get_user(txc.jitcnt, &utp->jitcnt) || - __get_user(txc.calcnt, &utp->calcnt) || - __get_user(txc.errcnt, &utp->errcnt) || - __get_user(txc.stbcnt, &utp->stbcnt)) - return -EFAULT; + err = compat_get_timex(&txc, utp); + if (err) + return err; ret = do_adjtimex(&txc); - if (!access_ok(VERIFY_WRITE, utp, sizeof(struct compat_timex)) || - __put_user(txc.modes, &utp->modes) || - __put_user(txc.offset, &utp->offset) || - __put_user(txc.freq, &utp->freq) || - __put_user(txc.maxerror, &utp->maxerror) || - __put_user(txc.esterror, &utp->esterror) || - __put_user(txc.status, &utp->status) || - __put_user(txc.constant, &utp->constant) || - __put_user(txc.precision, &utp->precision) || - __put_user(txc.tolerance, &utp->tolerance) || - __put_user(txc.time.tv_sec, &utp->time.tv_sec) || - __put_user(txc.time.tv_usec, &utp->time.tv_usec) || - __put_user(txc.tick, &utp->tick) || - __put_user(txc.ppsfreq, &utp->ppsfreq) || - __put_user(txc.jitter, &utp->jitter) || - __put_user(txc.shift, &utp->shift) || - __put_user(txc.stabil, &utp->stabil) || - __put_user(txc.jitcnt, &utp->jitcnt) || - __put_user(txc.calcnt, &utp->calcnt) || - __put_user(txc.errcnt, &utp->errcnt) || - __put_user(txc.stbcnt, &utp->stbcnt) || - __put_user(txc.tai, &utp->tai)) - ret = -EFAULT; + err = compat_put_timex(utp, &txc); + if (err) + return err; return ret; } From f1f1d5ebd10ffa4242bce7a90a56a222d6b7bc77 Mon Sep 17 00:00:00 2001 From: Richard Cochran Date: Tue, 1 Feb 2011 13:52:26 +0000 Subject: [PATCH 159/706] posix-timers: Introduce a syscall for clock tuning. A new syscall is introduced that allows tuning of a POSIX clock. The new call, clock_adjtime, takes two parameters, the clock ID and a pointer to a struct timex. Any ADJTIMEX(2) operation may be requested via this system call, but various POSIX clocks may or may not support tuning. [ tglx: Adapted to the posix-timer cleanup series. 
Avoid copy_to_user in the error case ] Signed-off-by: Richard Cochran Acked-by: John Stultz LKML-Reference: <20110201134419.869804645@linutronix.de> Signed-off-by: Thomas Gleixner --- include/linux/posix-timers.h | 2 ++ include/linux/syscalls.h | 2 ++ kernel/compat.c | 23 +++++++++++++++++++++++ kernel/posix-timers.c | 30 ++++++++++++++++++++++++++++++ 4 files changed, 57 insertions(+) diff --git a/include/linux/posix-timers.h b/include/linux/posix-timers.h index 8206255a547c..79a1cea7f6ed 100644 --- a/include/linux/posix-timers.h +++ b/include/linux/posix-timers.h @@ -4,6 +4,7 @@ #include #include #include +#include union cpu_time_count { cputime_t cpu; @@ -71,6 +72,7 @@ struct k_clock { int (*clock_set) (const clockid_t which_clock, const struct timespec *tp); int (*clock_get) (const clockid_t which_clock, struct timespec * tp); + int (*clock_adj) (const clockid_t which_clock, struct timex *tx); int (*timer_create) (struct k_itimer *timer); int (*nsleep) (const clockid_t which_clock, int flags, struct timespec *, struct timespec __user *); diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h index 18cd0684fc4e..bfacab921239 100644 --- a/include/linux/syscalls.h +++ b/include/linux/syscalls.h @@ -311,6 +311,8 @@ asmlinkage long sys_clock_settime(clockid_t which_clock, const struct timespec __user *tp); asmlinkage long sys_clock_gettime(clockid_t which_clock, struct timespec __user *tp); +asmlinkage long sys_clock_adjtime(clockid_t which_clock, + struct timex __user *tx); asmlinkage long sys_clock_getres(clockid_t which_clock, struct timespec __user *tp); asmlinkage long sys_clock_nanosleep(clockid_t which_clock, int flags, diff --git a/kernel/compat.c b/kernel/compat.c index 449e853cf41d..38b1d2c1cbe8 100644 --- a/kernel/compat.c +++ b/kernel/compat.c @@ -675,6 +675,29 @@ long compat_sys_clock_gettime(clockid_t which_clock, return err; } +long compat_sys_clock_adjtime(clockid_t which_clock, + struct compat_timex __user *utp) +{ + struct timex txc; + mm_segment_t oldfs; + int err, ret; + + err = compat_get_timex(&txc, utp); + if (err) + return err; + + oldfs = get_fs(); + set_fs(KERNEL_DS); + ret = sys_clock_adjtime(which_clock, (struct timex __user *) &txc); + set_fs(oldfs); + + err = compat_put_timex(utp, &txc); + if (err) + return err; + + return ret; +} + long compat_sys_clock_getres(clockid_t which_clock, struct compat_timespec __user *tp) { diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c index a3fdfd4be0ec..5a5a4f1c0971 100644 --- a/kernel/posix-timers.c +++ b/kernel/posix-timers.c @@ -170,6 +170,12 @@ static int posix_clock_realtime_set(const clockid_t which_clock, return do_sys_settimeofday(tp, NULL); } +static int posix_clock_realtime_adj(const clockid_t which_clock, + struct timex *t) +{ + return do_adjtimex(t); +} + /* * Get monotonic time for posix timers */ @@ -216,6 +222,7 @@ static __init int init_posix_timers(void) .clock_getres = hrtimer_get_res, .clock_get = posix_clock_realtime_get, .clock_set = posix_clock_realtime_set, + .clock_adj = posix_clock_realtime_adj, .nsleep = common_nsleep, .nsleep_restart = hrtimer_nanosleep_restart, .timer_create = common_timer_create, @@ -948,6 +955,29 @@ SYSCALL_DEFINE2(clock_gettime, const clockid_t, which_clock, return error; } +SYSCALL_DEFINE2(clock_adjtime, const clockid_t, which_clock, + struct timex __user *, utx) +{ + struct k_clock *kc = clockid_to_kclock(which_clock); + struct timex ktx; + int err; + + if (!kc) + return -EINVAL; + if (!kc->clock_adj) + return -EOPNOTSUPP; + + if (copy_from_user(&ktx, utx, 
sizeof(ktx))) + return -EFAULT; + + err = kc->clock_adj(which_clock, &ktx); + + if (!err && copy_to_user(utx, &ktx, sizeof(ktx))) + return -EFAULT; + + return err; +} + SYSCALL_DEFINE2(clock_getres, const clockid_t, which_clock, struct timespec __user *, tp) { From ce26efdefa5e8f22d933df72d7f7482725091d6d Mon Sep 17 00:00:00 2001 From: Richard Cochran Date: Tue, 1 Feb 2011 13:52:30 +0000 Subject: [PATCH 160/706] x86: Add clock_adjtime for x86 This patch adds the clock_adjtime system call to the x86 architecture. Signed-off-by: Richard Cochran Acked-by: John Stultz LKML-Reference: <20110201134419.968905083@linutronix.de> Signed-off-by: Thomas Gleixner --- arch/x86/ia32/ia32entry.S | 1 + arch/x86/include/asm/unistd_32.h | 3 ++- arch/x86/include/asm/unistd_64.h | 2 ++ arch/x86/kernel/syscall_table_32.S | 1 + 4 files changed, 6 insertions(+), 1 deletion(-) diff --git a/arch/x86/ia32/ia32entry.S b/arch/x86/ia32/ia32entry.S index 518bb99c3394..0ed78961b60f 100644 --- a/arch/x86/ia32/ia32entry.S +++ b/arch/x86/ia32/ia32entry.S @@ -851,4 +851,5 @@ ia32_sys_call_table: .quad sys_fanotify_init .quad sys32_fanotify_mark .quad sys_prlimit64 /* 340 */ + .quad compat_sys_clock_adjtime ia32_syscall_end: diff --git a/arch/x86/include/asm/unistd_32.h b/arch/x86/include/asm/unistd_32.h index b766a5e8ba0e..b6f73f1bc7ac 100644 --- a/arch/x86/include/asm/unistd_32.h +++ b/arch/x86/include/asm/unistd_32.h @@ -346,10 +346,11 @@ #define __NR_fanotify_init 338 #define __NR_fanotify_mark 339 #define __NR_prlimit64 340 +#define __NR_clock_adjtime 341 #ifdef __KERNEL__ -#define NR_syscalls 341 +#define NR_syscalls 342 #define __ARCH_WANT_IPC_PARSE_VERSION #define __ARCH_WANT_OLD_READDIR diff --git a/arch/x86/include/asm/unistd_64.h b/arch/x86/include/asm/unistd_64.h index 363e9b8a715b..5ee30858f5d6 100644 --- a/arch/x86/include/asm/unistd_64.h +++ b/arch/x86/include/asm/unistd_64.h @@ -669,6 +669,8 @@ __SYSCALL(__NR_fanotify_init, sys_fanotify_init) __SYSCALL(__NR_fanotify_mark, sys_fanotify_mark) #define __NR_prlimit64 302 __SYSCALL(__NR_prlimit64, sys_prlimit64) +#define __NR_clock_adjtime 303 +__SYSCALL(__NR_clock_adjtime, sys_clock_adjtime) #ifndef __NO_STUBS #define __ARCH_WANT_OLD_READDIR diff --git a/arch/x86/kernel/syscall_table_32.S b/arch/x86/kernel/syscall_table_32.S index b35786dc9b8f..68c7b9aa5001 100644 --- a/arch/x86/kernel/syscall_table_32.S +++ b/arch/x86/kernel/syscall_table_32.S @@ -340,3 +340,4 @@ ENTRY(sys_call_table) .long sys_fanotify_init .long sys_fanotify_mark .long sys_prlimit64 /* 340 */ + .long sys_clock_adjtime From 81e294cba2596f5f10848bbe19d98b344c2a2d5c Mon Sep 17 00:00:00 2001 From: Richard Cochran Date: Tue, 1 Feb 2011 13:52:32 +0000 Subject: [PATCH 161/706] posix-timers: Add support for fd based clocks Extend the negative clockids which are currently used by posix cpu timers to encode the PID with a file descriptor based type which encodes the fd in the upper bits. Originally-from: Richard Cochran Signed-off-by: Thomas Gleixner Acked-by: John Stultz LKML-Reference: <20110201134420.062860200@linutronix.de> --- include/linux/posix-timers.h | 13 +++++++++++++ kernel/posix-timers.c | 2 +- 2 files changed, 14 insertions(+), 1 deletion(-) diff --git a/include/linux/posix-timers.h b/include/linux/posix-timers.h index 79a1cea7f6ed..88b9256169f8 100644 --- a/include/linux/posix-timers.h +++ b/include/linux/posix-timers.h @@ -18,6 +18,17 @@ struct cpu_timer_list { int firing; }; +/* + * Bit fields within a clockid: + * + * The most significant 29 bits hold either a pid or a file descriptor. 
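+ * (For example, FD_TO_CLOCKID(3) evaluates to clockid -29 and
+ * CLOCKID_TO_FD(-29) recovers fd 3; illustrative values only.)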
+ * + * Bit 2 indicates whether a cpu clock refers to a thread or a process. + * + * Bits 1 and 0 give the type: PROF=0, VIRT=1, SCHED=2, or FD=3. + * + * A clockid is invalid if bits 2, 1, and 0 are all set. + */ #define CPUCLOCK_PID(clock) ((pid_t) ~((clock) >> 3)) #define CPUCLOCK_PERTHREAD(clock) \ (((clock) & (clockid_t) CPUCLOCK_PERTHREAD_MASK) != 0) @@ -29,6 +40,8 @@ struct cpu_timer_list { #define CPUCLOCK_VIRT 1 #define CPUCLOCK_SCHED 2 #define CPUCLOCK_MAX 3 +#define CLOCKFD CPUCLOCK_MAX +#define CLOCKFD_MASK (CPUCLOCK_PERTHREAD_MASK|CPUCLOCK_CLOCK_MASK) #define MAKE_PROCESS_CPUCLOCK(pid, clock) \ ((~(clockid_t) (pid) << 3) | (clockid_t) (clock)) diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c index 5a5a4f1c0971..df629d853a81 100644 --- a/kernel/posix-timers.c +++ b/kernel/posix-timers.c @@ -488,7 +488,7 @@ static void release_posix_timer(struct k_itimer *tmr, int it_id_set) static struct k_clock *clockid_to_kclock(const clockid_t id) { if (id < 0) - return &clock_posix_cpu; + return (id & CLOCKFD_MASK) == CLOCKFD ? NULL : &clock_posix_cpu; if (id >= MAX_CLOCKS || !posix_clocks[id].clock_getres) return NULL; From 527087374faa488776a789375a7d6ea74fda6f71 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 2 Feb 2011 12:10:09 +0100 Subject: [PATCH 162/706] posix-timers: Cleanup namespace Rename register_posix_clock() to posix_timers_register_clock(). That's what the function really does. As a side effect this cleans up the posix_clock namespace for the upcoming dynamic posix_clock infrastructure. Signed-off-by: Thomas Gleixner Tested-by: Richard Cochran Cc: John Stultz LKML-Reference: --- drivers/char/mmtimer.c | 2 +- include/linux/posix-timers.h | 2 +- kernel/posix-cpu-timers.c | 4 ++-- kernel/posix-timers.c | 15 ++++++++------- 4 files changed, 12 insertions(+), 11 deletions(-) diff --git a/drivers/char/mmtimer.c b/drivers/char/mmtimer.c index ff41eb3eec92..33dc2298af73 100644 --- a/drivers/char/mmtimer.c +++ b/drivers/char/mmtimer.c @@ -840,7 +840,7 @@ static int __init mmtimer_init(void) } sgi_clock_period = NSEC_PER_SEC / sn_rtc_cycles_per_second; - register_posix_clock(CLOCK_SGI_CYCLE, &sgi_clock); + posix_timers_register_clock(CLOCK_SGI_CYCLE, &sgi_clock); printk(KERN_INFO "%s: v%s, %ld MHz\n", MMTIMER_DESC, MMTIMER_VERSION, sn_rtc_cycles_per_second/(unsigned long)1E6); diff --git a/include/linux/posix-timers.h b/include/linux/posix-timers.h index 88b9256169f8..9d6ffe2c92e5 100644 --- a/include/linux/posix-timers.h +++ b/include/linux/posix-timers.h @@ -101,7 +101,7 @@ struct k_clock { extern struct k_clock clock_posix_cpu; -void register_posix_clock(const clockid_t clock_id, struct k_clock *new_clock); +void posix_timers_register_clock(const clockid_t clock_id, struct k_clock *new_clock); /* function to call to trigger timer event */ int posix_timer_event(struct k_itimer *timr, int si_private); diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c index 609e187f43e7..67fea9d25d55 100644 --- a/kernel/posix-cpu-timers.c +++ b/kernel/posix-cpu-timers.c @@ -1618,8 +1618,8 @@ static __init int init_posix_cpu_timers(void) }; struct timespec ts; - register_posix_clock(CLOCK_PROCESS_CPUTIME_ID, &process); - register_posix_clock(CLOCK_THREAD_CPUTIME_ID, &thread); + posix_timers_register_clock(CLOCK_PROCESS_CPUTIME_ID, &process); + posix_timers_register_clock(CLOCK_THREAD_CPUTIME_ID, &thread); cputime_to_timespec(cputime_one_jiffy, &ts); onecputick = ts.tv_nsec; diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c index df629d853a81..af936fd37140 
100644 --- a/kernel/posix-timers.c +++ b/kernel/posix-timers.c @@ -253,11 +253,11 @@ static __init int init_posix_timers(void) .clock_get = posix_get_monotonic_coarse, }; - register_posix_clock(CLOCK_REALTIME, &clock_realtime); - register_posix_clock(CLOCK_MONOTONIC, &clock_monotonic); - register_posix_clock(CLOCK_MONOTONIC_RAW, &clock_monotonic_raw); - register_posix_clock(CLOCK_REALTIME_COARSE, &clock_realtime_coarse); - register_posix_clock(CLOCK_MONOTONIC_COARSE, &clock_monotonic_coarse); + posix_timers_register_clock(CLOCK_REALTIME, &clock_realtime); + posix_timers_register_clock(CLOCK_MONOTONIC, &clock_monotonic); + posix_timers_register_clock(CLOCK_MONOTONIC_RAW, &clock_monotonic_raw); + posix_timers_register_clock(CLOCK_REALTIME_COARSE, &clock_realtime_coarse); + posix_timers_register_clock(CLOCK_MONOTONIC_COARSE, &clock_monotonic_coarse); posix_timers_cache = kmem_cache_create("posix_timers_cache", sizeof (struct k_itimer), 0, SLAB_PANIC, @@ -433,7 +433,8 @@ static struct pid *good_sigevent(sigevent_t * event) return task_pid(rtn); } -void register_posix_clock(const clockid_t clock_id, struct k_clock *new_clock) +void posix_timers_register_clock(const clockid_t clock_id, + struct k_clock *new_clock) { if ((unsigned) clock_id >= MAX_CLOCKS) { printk(KERN_WARNING "POSIX clock register failed for clock_id %d\n", @@ -454,7 +455,7 @@ void register_posix_clock(const clockid_t clock_id, struct k_clock *new_clock) posix_clocks[clock_id] = *new_clock; } -EXPORT_SYMBOL_GPL(register_posix_clock); +EXPORT_SYMBOL_GPL(posix_timers_register_clock); static struct k_itimer * alloc_posix_timer(void) { From 0606f422b453f76c31ab2b1bd52943ff06a2dcf2 Mon Sep 17 00:00:00 2001 From: Richard Cochran Date: Tue, 1 Feb 2011 13:52:35 +0000 Subject: [PATCH 163/706] posix clocks: Introduce dynamic clocks This patch adds support for adding and removing posix clocks. The clock life cycle is patterned after USB devices. Each clock is represented by a standard character device. In addition, the driver may optionally implement custom character device operations. The posix clock and timer system calls listed below now work with dynamic posix clocks, as well as the traditional static clocks. The following system calls are affected: - clock_adjtime (brand new syscall) - clock_gettime - clock_getres - clock_settime - timer_create - timer_delete - timer_gettime - timer_settime [ tglx: Adapted to the posix-timer cleanup.
Moved clock_posix_dynamic to posix-clock.c and made all referenced functions static ] Signed-off-by: Richard Cochran Acked-by: John Stultz LKML-Reference: <20110201134420.164172635@linutronix.de> Signed-off-by: Thomas Gleixner --- include/linux/posix-clock.h | 150 ++++++++++++ include/linux/posix-timers.h | 6 +- kernel/posix-timers.c | 4 +- kernel/time/Makefile | 3 +- kernel/time/posix-clock.c | 441 +++++++++++++++++++++++++++++++++++ 5 files changed, 601 insertions(+), 3 deletions(-) create mode 100644 include/linux/posix-clock.h create mode 100644 kernel/time/posix-clock.c diff --git a/include/linux/posix-clock.h b/include/linux/posix-clock.h new file mode 100644 index 000000000000..369e19d3750b --- /dev/null +++ b/include/linux/posix-clock.h @@ -0,0 +1,150 @@ +/* + * posix-clock.h - support for dynamic clock devices + * + * Copyright (C) 2010 OMICRON electronics GmbH + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + */ +#ifndef _LINUX_POSIX_CLOCK_H_ +#define _LINUX_POSIX_CLOCK_H_ + +#include +#include +#include +#include + +struct posix_clock; + +/** + * struct posix_clock_operations - functional interface to the clock + * + * Every posix clock is represented by a character device. Drivers may + * optionally offer extended capabilities by implementing the + * character device methods. The character device file operations are + * first handled by the clock device layer, then passed on to the + * driver by calling these functions. 
+ * + * @owner: The clock driver should set this to THIS_MODULE + * @clock_adjtime: Adjust the clock + * @clock_gettime: Read the current time + * @clock_getres: Get the clock resolution + * @clock_settime: Set the current time value + * @timer_create: Create a new timer + * @timer_delete: Remove a previously created timer + * @timer_gettime: Get remaining time and interval of a timer + * @timer_settime: Set a timer's initial expiration and interval + * @fasync: Optional character device fasync method + * @mmap: Optional character device mmap method + * @open: Optional character device open method + * @release: Optional character device release method + * @ioctl: Optional character device ioctl method + * @read: Optional character device read method + * @poll: Optional character device poll method + */ +struct posix_clock_operations { + struct module *owner; + + int (*clock_adjtime)(struct posix_clock *pc, struct timex *tx); + + int (*clock_gettime)(struct posix_clock *pc, struct timespec *ts); + + int (*clock_getres) (struct posix_clock *pc, struct timespec *ts); + + int (*clock_settime)(struct posix_clock *pc, + const struct timespec *ts); + + int (*timer_create) (struct posix_clock *pc, struct k_itimer *kit); + + int (*timer_delete) (struct posix_clock *pc, struct k_itimer *kit); + + void (*timer_gettime)(struct posix_clock *pc, + struct k_itimer *kit, struct itimerspec *tsp); + + int (*timer_settime)(struct posix_clock *pc, + struct k_itimer *kit, int flags, + struct itimerspec *tsp, struct itimerspec *old); + /* + * Optional character device methods: + */ + int (*fasync) (struct posix_clock *pc, + int fd, struct file *file, int on); + + long (*ioctl) (struct posix_clock *pc, + unsigned int cmd, unsigned long arg); + + int (*mmap) (struct posix_clock *pc, + struct vm_area_struct *vma); + + int (*open) (struct posix_clock *pc, fmode_t f_mode); + + uint (*poll) (struct posix_clock *pc, + struct file *file, poll_table *wait); + + int (*release) (struct posix_clock *pc); + + ssize_t (*read) (struct posix_clock *pc, + uint flags, char __user *buf, size_t cnt); +}; + +/** + * struct posix_clock - represents a dynamic posix clock + * + * @ops: Functional interface to the clock + * @cdev: Character device instance for this clock + * @kref: Reference count. + * @mutex: Protects the 'zombie' field from concurrent access. + * @zombie: If 'zombie' is true, then the hardware has disappeared. + * @release: A function to free the structure when the reference count reaches + * zero. May be NULL if structure is statically allocated. + * + * Drivers should embed their struct posix_clock within a private + * structure, obtaining a reference to it during callbacks using + * container_of(). + */ +struct posix_clock { + struct posix_clock_operations ops; + struct cdev cdev; + struct kref kref; + struct mutex mutex; + bool zombie; + void (*release)(struct posix_clock *clk); +}; + +/** + * posix_clock_register() - register a new clock + * @clk: Pointer to the clock. Caller must provide 'ops' and 'release' + * @devid: Allocated device id + * + * A clock driver calls this function to register itself with the + * clock device subsystem. If 'clk' points to dynamically allocated + * memory, then the caller must provide a 'release' function to free + * that memory. + * + * Returns zero on success, non-zero otherwise.
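+ *
+ * Illustrative driver-side usage (hypothetical names, not part of
+ * this patch):
+ *
+ *   clk->ops = my_clock_ops;
+ *   clk->release = my_clock_release;
+ *   err = posix_clock_register(clk, devid);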
+ */ +int posix_clock_register(struct posix_clock *clk, dev_t devid); + +/** + * posix_clock_unregister() - unregister a clock + * @clk: Clock instance previously registered via posix_clock_register() + * + * A clock driver calls this function to remove itself from the clock + * device subsystem. The posix_clock itself will remain (in an + * inactive state) until its reference count drops to zero, at which + * point it will be deallocated with its 'release' method. + */ +void posix_clock_unregister(struct posix_clock *clk); + +#endif diff --git a/include/linux/posix-timers.h b/include/linux/posix-timers.h index 9d6ffe2c92e5..d51243ae0726 100644 --- a/include/linux/posix-timers.h +++ b/include/linux/posix-timers.h @@ -32,7 +32,7 @@ struct cpu_timer_list { #define CPUCLOCK_PID(clock) ((pid_t) ~((clock) >> 3)) #define CPUCLOCK_PERTHREAD(clock) \ (((clock) & (clockid_t) CPUCLOCK_PERTHREAD_MASK) != 0) -#define CPUCLOCK_PID_MASK 7 + #define CPUCLOCK_PERTHREAD_MASK 4 #define CPUCLOCK_WHICH(clock) ((clock) & (clockid_t) CPUCLOCK_CLOCK_MASK) #define CPUCLOCK_CLOCK_MASK 3 @@ -48,6 +48,9 @@ struct cpu_timer_list { #define MAKE_THREAD_CPUCLOCK(tid, clock) \ MAKE_PROCESS_CPUCLOCK((tid), (clock) | CPUCLOCK_PERTHREAD_MASK) +#define FD_TO_CLOCKID(fd) ((~(clockid_t) (fd) << 3) | CLOCKFD) +#define CLOCKID_TO_FD(clk) ((unsigned int) ~((clk) >> 3)) + /* POSIX.1b interval timer structure. */ struct k_itimer { struct list_head list; /* free/ allocate list */ @@ -100,6 +103,7 @@ struct k_clock { }; extern struct k_clock clock_posix_cpu; +extern struct k_clock clock_posix_dynamic; void posix_timers_register_clock(const clockid_t clock_id, struct k_clock *new_clock); diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c index af936fd37140..44fcff131b38 100644 --- a/kernel/posix-timers.c +++ b/kernel/posix-timers.c @@ -41,6 +41,7 @@ #include #include #include +#include #include #include #include @@ -489,7 +490,8 @@ static void release_posix_timer(struct k_itimer *tmr, int it_id_set) static struct k_clock *clockid_to_kclock(const clockid_t id) { if (id < 0) - return (id & CLOCKFD_MASK) == CLOCKFD ? NULL : &clock_posix_cpu; + return (id & CLOCKFD_MASK) == CLOCKFD ? + &clock_posix_dynamic : &clock_posix_cpu; if (id >= MAX_CLOCKS || !posix_clocks[id].clock_getres) return NULL; diff --git a/kernel/time/Makefile b/kernel/time/Makefile index ee266620b06c..b0425991e9ac 100644 --- a/kernel/time/Makefile +++ b/kernel/time/Makefile @@ -1,4 +1,5 @@ -obj-y += timekeeping.o ntp.o clocksource.o jiffies.o timer_list.o timecompare.o timeconv.o +obj-y += timekeeping.o ntp.o clocksource.o jiffies.o timer_list.o timecompare.o +obj-y += timeconv.o posix-clock.o obj-$(CONFIG_GENERIC_CLOCKEVENTS_BUILD) += clockevents.o obj-$(CONFIG_GENERIC_CLOCKEVENTS) += tick-common.o diff --git a/kernel/time/posix-clock.c b/kernel/time/posix-clock.c new file mode 100644 index 000000000000..04498cbf6002 --- /dev/null +++ b/kernel/time/posix-clock.c @@ -0,0 +1,441 @@ +/* + * posix-clock.c - support for dynamic clock devices + * + * Copyright (C) 2010 OMICRON electronics GmbH + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + */ +#include +#include +#include +#include +#include +#include +#include + +static void delete_clock(struct kref *kref); + +/* + * Returns NULL if the posix_clock instance attached to 'fp' is old and stale. + */ +static struct posix_clock *get_posix_clock(struct file *fp) +{ + struct posix_clock *clk = fp->private_data; + + mutex_lock(&clk->mutex); + + if (!clk->zombie) + return clk; + + mutex_unlock(&clk->mutex); + + return NULL; +} + +static void put_posix_clock(struct posix_clock *clk) +{ + mutex_unlock(&clk->mutex); +} + +static ssize_t posix_clock_read(struct file *fp, char __user *buf, + size_t count, loff_t *ppos) +{ + struct posix_clock *clk = get_posix_clock(fp); + int err = -EINVAL; + + if (!clk) + return -ENODEV; + + if (clk->ops.read) + err = clk->ops.read(clk, fp->f_flags, buf, count); + + put_posix_clock(clk); + + return err; +} + +static unsigned int posix_clock_poll(struct file *fp, poll_table *wait) +{ + struct posix_clock *clk = get_posix_clock(fp); + int result = 0; + + if (!clk) + return -ENODEV; + + if (clk->ops.poll) + result = clk->ops.poll(clk, fp, wait); + + put_posix_clock(clk); + + return result; +} + +static int posix_clock_fasync(int fd, struct file *fp, int on) +{ + struct posix_clock *clk = get_posix_clock(fp); + int err = 0; + + if (!clk) + return -ENODEV; + + if (clk->ops.fasync) + err = clk->ops.fasync(clk, fd, fp, on); + + put_posix_clock(clk); + + return err; +} + +static int posix_clock_mmap(struct file *fp, struct vm_area_struct *vma) +{ + struct posix_clock *clk = get_posix_clock(fp); + int err = -ENODEV; + + if (!clk) + return -ENODEV; + + if (clk->ops.mmap) + err = clk->ops.mmap(clk, vma); + + put_posix_clock(clk); + + return err; +} + +static long posix_clock_ioctl(struct file *fp, + unsigned int cmd, unsigned long arg) +{ + struct posix_clock *clk = get_posix_clock(fp); + int err = -ENOTTY; + + if (!clk) + return -ENODEV; + + if (clk->ops.ioctl) + err = clk->ops.ioctl(clk, cmd, arg); + + put_posix_clock(clk); + + return err; +} + +#ifdef CONFIG_COMPAT +static long posix_clock_compat_ioctl(struct file *fp, + unsigned int cmd, unsigned long arg) +{ + struct posix_clock *clk = get_posix_clock(fp); + int err = -ENOTTY; + + if (!clk) + return -ENODEV; + + if (clk->ops.ioctl) + err = clk->ops.ioctl(clk, cmd, arg); + + put_posix_clock(clk); + + return err; +} +#endif + +static int posix_clock_open(struct inode *inode, struct file *fp) +{ + int err; + struct posix_clock *clk = + container_of(inode->i_cdev, struct posix_clock, cdev); + + mutex_lock(&clk->mutex); + + if (clk->zombie) { + err = -ENODEV; + goto out; + } + if (clk->ops.open) + err = clk->ops.open(clk, fp->f_mode); + else + err = 0; + + if (!err) { + kref_get(&clk->kref); + fp->private_data = clk; + } +out: + mutex_unlock(&clk->mutex); + return err; +} + +static int posix_clock_release(struct inode *inode, struct file *fp) +{ + struct posix_clock *clk = fp->private_data; + int err = 0; + + if (clk->ops.release) + err = clk->ops.release(clk); + + kref_put(&clk->kref, delete_clock); + + fp->private_data = NULL; + + return err; +} + +static const struct file_operations posix_clock_file_operations = { + .owner = THIS_MODULE, + .llseek = no_llseek, + .read = posix_clock_read, + .poll = posix_clock_poll, + .unlocked_ioctl = posix_clock_ioctl, + 
.open = posix_clock_open, + .release = posix_clock_release, + .fasync = posix_clock_fasync, + .mmap = posix_clock_mmap, +#ifdef CONFIG_COMPAT + .compat_ioctl = posix_clock_compat_ioctl, +#endif +}; + +int posix_clock_register(struct posix_clock *clk, dev_t devid) +{ + int err; + + kref_init(&clk->kref); + mutex_init(&clk->mutex); + + cdev_init(&clk->cdev, &posix_clock_file_operations); + clk->cdev.owner = clk->ops.owner; + err = cdev_add(&clk->cdev, devid, 1); + if (err) + goto no_cdev; + + return err; +no_cdev: + mutex_destroy(&clk->mutex); + return err; +} +EXPORT_SYMBOL_GPL(posix_clock_register); + +static void delete_clock(struct kref *kref) +{ + struct posix_clock *clk = container_of(kref, struct posix_clock, kref); + mutex_destroy(&clk->mutex); + if (clk->release) + clk->release(clk); +} + +void posix_clock_unregister(struct posix_clock *clk) +{ + cdev_del(&clk->cdev); + + mutex_lock(&clk->mutex); + clk->zombie = true; + mutex_unlock(&clk->mutex); + + kref_put(&clk->kref, delete_clock); +} +EXPORT_SYMBOL_GPL(posix_clock_unregister); + +struct posix_clock_desc { + struct file *fp; + struct posix_clock *clk; +}; + +static int get_clock_desc(const clockid_t id, struct posix_clock_desc *cd) +{ + struct file *fp = fget(CLOCKID_TO_FD(id)); + int err = -EINVAL; + + if (!fp) + return err; + + if (fp->f_op->open != posix_clock_open || !fp->private_data) + goto out; + + cd->fp = fp; + cd->clk = get_posix_clock(fp); + + err = cd->clk ? 0 : -ENODEV; +out: + if (err) + fput(fp); + return err; +} + +static void put_clock_desc(struct posix_clock_desc *cd) +{ + put_posix_clock(cd->clk); + fput(cd->fp); +} + +static int pc_clock_adjtime(clockid_t id, struct timex *tx) +{ + struct posix_clock_desc cd; + int err; + + err = get_clock_desc(id, &cd); + if (err) + return err; + + if (cd.clk->ops.clock_adjtime) + err = cd.clk->ops.clock_adjtime(cd.clk, tx); + else + err = -EOPNOTSUPP; + + put_clock_desc(&cd); + + return err; +} + +static int pc_clock_gettime(clockid_t id, struct timespec *ts) +{ + struct posix_clock_desc cd; + int err; + + err = get_clock_desc(id, &cd); + if (err) + return err; + + if (cd.clk->ops.clock_gettime) + err = cd.clk->ops.clock_gettime(cd.clk, ts); + else + err = -EOPNOTSUPP; + + put_clock_desc(&cd); + + return err; +} + +static int pc_clock_getres(clockid_t id, struct timespec *ts) +{ + struct posix_clock_desc cd; + int err; + + err = get_clock_desc(id, &cd); + if (err) + return err; + + if (cd.clk->ops.clock_getres) + err = cd.clk->ops.clock_getres(cd.clk, ts); + else + err = -EOPNOTSUPP; + + put_clock_desc(&cd); + + return err; +} + +static int pc_clock_settime(clockid_t id, const struct timespec *ts) +{ + struct posix_clock_desc cd; + int err; + + err = get_clock_desc(id, &cd); + if (err) + return err; + + if (cd.clk->ops.clock_settime) + err = cd.clk->ops.clock_settime(cd.clk, ts); + else + err = -EOPNOTSUPP; + + put_clock_desc(&cd); + + return err; +} + +static int pc_timer_create(struct k_itimer *kit) +{ + clockid_t id = kit->it_clock; + struct posix_clock_desc cd; + int err; + + err = get_clock_desc(id, &cd); + if (err) + return err; + + if (cd.clk->ops.timer_create) + err = cd.clk->ops.timer_create(cd.clk, kit); + else + err = -EOPNOTSUPP; + + put_clock_desc(&cd); + + return err; +} + +static int pc_timer_delete(struct k_itimer *kit) +{ + clockid_t id = kit->it_clock; + struct posix_clock_desc cd; + int err; + + err = get_clock_desc(id, &cd); + if (err) + return err; + + if (cd.clk->ops.timer_delete) + err = cd.clk->ops.timer_delete(cd.clk, kit); + else + err = 
-EOPNOTSUPP; + + put_clock_desc(&cd); + + return err; +} + +static void pc_timer_gettime(struct k_itimer *kit, struct itimerspec *ts) +{ + clockid_t id = kit->it_clock; + struct posix_clock_desc cd; + + if (get_clock_desc(id, &cd)) + return; + + if (cd.clk->ops.timer_gettime) + cd.clk->ops.timer_gettime(cd.clk, kit, ts); + + put_clock_desc(&cd); +} + +static int pc_timer_settime(struct k_itimer *kit, int flags, + struct itimerspec *ts, struct itimerspec *old) +{ + clockid_t id = kit->it_clock; + struct posix_clock_desc cd; + int err; + + err = get_clock_desc(id, &cd); + if (err) + return err; + + if (cd.clk->ops.timer_settime) + err = cd.clk->ops.timer_settime(cd.clk, kit, flags, ts, old); + else + err = -EOPNOTSUPP; + + put_clock_desc(&cd); + + return err; +} + +struct k_clock clock_posix_dynamic = { + .clock_getres = pc_clock_getres, + .clock_set = pc_clock_settime, + .clock_get = pc_clock_gettime, + .clock_adj = pc_clock_adjtime, + .timer_create = pc_timer_create, + .timer_set = pc_timer_settime, + .timer_del = pc_timer_delete, + .timer_get = pc_timer_gettime, +}; From fe4b04fa31a6dcf4358aa84cf81e5a7fd079469b Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 2 Feb 2011 13:19:09 +0100 Subject: [PATCH 164/706] perf: Cure task_oncpu_function_call() races Oleg reported that on architectures with __ARCH_WANT_INTERRUPTS_ON_CTXSW the IPI from task_oncpu_function_call() can land before perf_event_task_sched_in() and cause interesting situations for e.g. perf_install_in_context(). This patch reworks the task_oncpu_function_call() interface into a more usable primitive, reworks all its users to be more obvious, and removes the races. While looking at the code I also found a number of races against perf_event_task_sched_out(), which can flip contexts between tasks, so plug those too.
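The retry pattern this introduces can be sketched in plain userspace C (a single-threaded, illustrative analogue of the control flow only; none of these names exist in the kernel):

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

static pthread_mutex_t ctx_lock = PTHREAD_MUTEX_INITIALIZER;
static bool ctx_is_active;

/* Stand-in for task_function_call(): only succeeds while the "task"
 * is still running when the call lands. */
static int remote_call(int (*func)(void *), void *info)
{
        if (!ctx_is_active)
                return -1;              /* -EAGAIN/-ESRCH in the kernel */
        return func(info);
}

static int do_remove(void *info)
{
        printf("removed %s via remote call\n", (const char *)info);
        return 0;
}

static void remove_from_context(void *info)
{
        for (;;) {
                if (!remote_call(do_remove, info))
                        return;         /* remote path won */
                pthread_mutex_lock(&ctx_lock);
                if (ctx_is_active) {
                        /* raced with the task scheduling in: retry */
                        pthread_mutex_unlock(&ctx_lock);
                        continue;
                }
                /* holding the lock keeps the context from going active */
                printf("removed %s under ctx_lock\n", (const char *)info);
                pthread_mutex_unlock(&ctx_lock);
                return;
        }
}

int main(void)
{
        ctx_is_active = true;           /* "task" running: remote path */
        remove_from_context("event A");
        ctx_is_active = false;          /* "task" scheduled out: locked path */
        remove_from_context("event B");
        return 0;
}

(Compile with -pthread.)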
Reported-and-reviewed-by: Oleg Nesterov Signed-off-by: Peter Zijlstra LKML-Reference: Signed-off-by: Ingo Molnar --- include/linux/sched.h | 7 -- kernel/perf_event.c | 260 +++++++++++++++++++++++++++--------------- kernel/sched.c | 29 +---- 3 files changed, 172 insertions(+), 124 deletions(-) diff --git a/include/linux/sched.h b/include/linux/sched.h index d747f948b34e..0b40ee3f6d7a 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -2578,13 +2578,6 @@ static inline void inc_syscw(struct task_struct *tsk) #define TASK_SIZE_OF(tsk) TASK_SIZE #endif -/* - * Call the function if the target task is executing on a CPU right now: - */ -extern void task_oncpu_function_call(struct task_struct *p, - void (*func) (void *info), void *info); - - #ifdef CONFIG_MM_OWNER extern void mm_update_next_owner(struct mm_struct *mm); extern void mm_init_owner(struct mm_struct *mm, struct task_struct *p); diff --git a/kernel/perf_event.c b/kernel/perf_event.c index 126a302c481c..7d3faa25e136 100644 --- a/kernel/perf_event.c +++ b/kernel/perf_event.c @@ -38,6 +38,79 @@ #include +struct remote_function_call { + struct task_struct *p; + int (*func)(void *info); + void *info; + int ret; +}; + +static void remote_function(void *data) +{ + struct remote_function_call *tfc = data; + struct task_struct *p = tfc->p; + + if (p) { + tfc->ret = -EAGAIN; + if (task_cpu(p) != smp_processor_id() || !task_curr(p)) + return; + } + + tfc->ret = tfc->func(tfc->info); +} + +/** + * task_function_call - call a function on the cpu on which a task runs + * @p: the task to evaluate + * @func: the function to be called + * @info: the function call argument + * + * Calls the function @func when the task is currently running. This might + * be on the current CPU, which just calls the function directly + * + * returns: @func return value, or + * -ESRCH - when the process isn't running + * -EAGAIN - when the process moved away + */ +static int +task_function_call(struct task_struct *p, int (*func) (void *info), void *info) +{ + struct remote_function_call data = { + .p = p, + .func = func, + .info = info, + .ret = -ESRCH, /* No such (running) process */ + }; + + if (task_curr(p)) + smp_call_function_single(task_cpu(p), remote_function, &data, 1); + + return data.ret; +} + +/** + * cpu_function_call - call a function on the cpu + * @func: the function to be called + * @info: the function call argument + * + * Calls the function @func on the remote cpu. + * + * returns: @func return value or -ENXIO when the cpu is offline + */ +static int cpu_function_call(int cpu, int (*func) (void *info), void *info) +{ + struct remote_function_call data = { + .p = NULL, + .func = func, + .info = info, + .ret = -ENXIO, /* No such CPU */ + }; + + smp_call_function_single(cpu, remote_function, &data, 1); + + return data.ret; +} + enum event_type_t { EVENT_FLEXIBLE = 0x1, EVENT_PINNED = 0x2, @@ -254,7 +327,6 @@ static void perf_unpin_context(struct perf_event_context *ctx) raw_spin_lock_irqsave(&ctx->lock, flags); --ctx->pin_count; raw_spin_unlock_irqrestore(&ctx->lock, flags); - put_ctx(ctx); } /* @@ -618,35 +690,24 @@ __get_cpu_context(struct perf_event_context *ctx) * We disable the event on the hardware level first. After that we * remove it from the context list. 
*/ -static void __perf_event_remove_from_context(void *info) +static int __perf_remove_from_context(void *info) { struct perf_event *event = info; struct perf_event_context *ctx = event->ctx; struct perf_cpu_context *cpuctx = __get_cpu_context(ctx); - /* - * If this is a task context, we need to check whether it is - * the current task context of this cpu. If not it has been - * scheduled out before the smp call arrived. - */ - if (ctx->task && cpuctx->task_ctx != ctx) - return; - raw_spin_lock(&ctx->lock); - event_sched_out(event, cpuctx, ctx); - list_del_event(event, ctx); - raw_spin_unlock(&ctx->lock); + + return 0; } /* * Remove the event from a task's (or a CPU's) list of events. * - * Must be called with ctx->mutex held. - * * CPU events are removed with a smp call. For task events we only * call when the task is on a CPU. * @@ -657,49 +718,48 @@ static void __perf_event_remove_from_context(void *info) * When called from perf_event_exit_task, it's OK because the * context has been detached from its task. */ -static void perf_event_remove_from_context(struct perf_event *event) +static void perf_remove_from_context(struct perf_event *event) { struct perf_event_context *ctx = event->ctx; struct task_struct *task = ctx->task; + lockdep_assert_held(&ctx->mutex); + if (!task) { /* * Per cpu events are removed via an smp call and * the removal is always successful. */ - smp_call_function_single(event->cpu, - __perf_event_remove_from_context, - event, 1); + cpu_function_call(event->cpu, __perf_remove_from_context, event); return; } retry: - task_oncpu_function_call(task, __perf_event_remove_from_context, - event); + if (!task_function_call(task, __perf_remove_from_context, event)) + return; raw_spin_lock_irq(&ctx->lock); /* - * If the context is active we need to retry the smp call. + * If we failed to find a running task, but find the context active now + * that we've acquired the ctx->lock, retry. */ - if (ctx->nr_active && !list_empty(&event->group_entry)) { + if (ctx->is_active) { raw_spin_unlock_irq(&ctx->lock); goto retry; } /* - * The lock prevents that this context is scheduled in so we - * can remove the event safely, if the call above did not - * succeed. + * Since the task isn't running, its safe to remove the event, us + * holding the ctx->lock ensures the task won't get scheduled in. */ - if (!list_empty(&event->group_entry)) - list_del_event(event, ctx); + list_del_event(event, ctx); raw_spin_unlock_irq(&ctx->lock); } /* * Cross CPU call to disable a performance event */ -static void __perf_event_disable(void *info) +static int __perf_event_disable(void *info) { struct perf_event *event = info; struct perf_event_context *ctx = event->ctx; @@ -708,9 +768,12 @@ static void __perf_event_disable(void *info) /* * If this is a per-task event, need to check whether this * event's task is the current task on this cpu. + * + * Can trigger due to concurrent perf_event_context_sched_out() + * flipping contexts around. 
*/ if (ctx->task && cpuctx->task_ctx != ctx) - return; + return -EINVAL; raw_spin_lock(&ctx->lock); @@ -729,6 +792,8 @@ static void __perf_event_disable(void *info) } raw_spin_unlock(&ctx->lock); + + return 0; } /* @@ -753,13 +818,13 @@ void perf_event_disable(struct perf_event *event) /* * Disable the event on the cpu that it's on */ - smp_call_function_single(event->cpu, __perf_event_disable, - event, 1); + cpu_function_call(event->cpu, __perf_event_disable, event); return; } retry: - task_oncpu_function_call(task, __perf_event_disable, event); + if (!task_function_call(task, __perf_event_disable, event)) + return; raw_spin_lock_irq(&ctx->lock); /* @@ -767,6 +832,11 @@ retry: */ if (event->state == PERF_EVENT_STATE_ACTIVE) { raw_spin_unlock_irq(&ctx->lock); + /* + * Reload the task pointer, it might have been changed by + * a concurrent perf_event_context_sched_out(). + */ + task = ctx->task; goto retry; } @@ -778,7 +848,6 @@ retry: update_group_times(event); event->state = PERF_EVENT_STATE_OFF; } - raw_spin_unlock_irq(&ctx->lock); } @@ -928,12 +997,14 @@ static void add_event_to_ctx(struct perf_event *event, event->tstamp_stopped = tstamp; } +static void perf_event_context_sched_in(struct perf_event_context *ctx); + /* * Cross CPU call to install and enable a performance event * * Must be called with ctx->mutex held */ -static void __perf_install_in_context(void *info) +static int __perf_install_in_context(void *info) { struct perf_event *event = info; struct perf_event_context *ctx = event->ctx; @@ -942,17 +1013,12 @@ static void __perf_install_in_context(void *info) int err; /* - * If this is a task context, we need to check whether it is - * the current task context of this cpu. If not it has been - * scheduled out before the smp call arrived. - * Or possibly this is the right context but it isn't - * on this cpu because it had no events. + * In case we're installing a new context to an already running task, + * could also happen before perf_event_task_sched_in() on architectures + * which do context switches with IRQs enabled. */ - if (ctx->task && cpuctx->task_ctx != ctx) { - if (cpuctx->task_ctx || ctx->task != current) - return; - cpuctx->task_ctx = ctx; - } + if (ctx->task && !cpuctx->task_ctx) + perf_event_context_sched_in(ctx); raw_spin_lock(&ctx->lock); ctx->is_active = 1; @@ -997,6 +1063,8 @@ static void __perf_install_in_context(void *info) unlock: raw_spin_unlock(&ctx->lock); + + return 0; } /* @@ -1008,8 +1076,6 @@ unlock: * If the event is attached to a task which is on a CPU we use a smp * call to enable it in the task context. The task might have been * scheduled away, but we check this in the smp call again. - * - * Must be called with ctx->mutex held. */ static void perf_install_in_context(struct perf_event_context *ctx, @@ -1018,6 +1084,8 @@ perf_install_in_context(struct perf_event_context *ctx, { struct task_struct *task = ctx->task; + lockdep_assert_held(&ctx->mutex); + event->ctx = ctx; if (!task) { @@ -1025,31 +1093,29 @@ perf_install_in_context(struct perf_event_context *ctx, * Per cpu events are installed via an smp call and * the install is always successful. */ - smp_call_function_single(cpu, __perf_install_in_context, - event, 1); + cpu_function_call(cpu, __perf_install_in_context, event); return; } retry: - task_oncpu_function_call(task, __perf_install_in_context, - event); + if (!task_function_call(task, __perf_install_in_context, event)) + return; raw_spin_lock_irq(&ctx->lock); /* - * we need to retry the smp call. 
+ * If we failed to find a running task, but find the context active now + * that we've acquired the ctx->lock, retry. */ - if (ctx->is_active && list_empty(&event->group_entry)) { + if (ctx->is_active) { raw_spin_unlock_irq(&ctx->lock); goto retry; } /* - * The lock prevents that this context is scheduled in so we - * can add the event safely, if it the call above did not - * succeed. + * Since the task isn't running, its safe to add the event, us holding + * the ctx->lock ensures the task won't get scheduled in. */ - if (list_empty(&event->group_entry)) - add_event_to_ctx(event, ctx); + add_event_to_ctx(event, ctx); raw_spin_unlock_irq(&ctx->lock); } @@ -1078,7 +1144,7 @@ static void __perf_event_mark_enabled(struct perf_event *event, /* * Cross CPU call to enable a performance event */ -static void __perf_event_enable(void *info) +static int __perf_event_enable(void *info) { struct perf_event *event = info; struct perf_event_context *ctx = event->ctx; @@ -1086,18 +1152,10 @@ static void __perf_event_enable(void *info) struct perf_cpu_context *cpuctx = __get_cpu_context(ctx); int err; - /* - * If this is a per-task event, need to check whether this - * event's task is the current task on this cpu. - */ - if (ctx->task && cpuctx->task_ctx != ctx) { - if (cpuctx->task_ctx || ctx->task != current) - return; - cpuctx->task_ctx = ctx; - } + if (WARN_ON_ONCE(!ctx->is_active)) + return -EINVAL; raw_spin_lock(&ctx->lock); - ctx->is_active = 1; update_context_time(ctx); if (event->state >= PERF_EVENT_STATE_INACTIVE) @@ -1138,6 +1196,8 @@ static void __perf_event_enable(void *info) unlock: raw_spin_unlock(&ctx->lock); + + return 0; } /* @@ -1158,8 +1218,7 @@ void perf_event_enable(struct perf_event *event) /* * Enable the event on the cpu that it's on */ - smp_call_function_single(event->cpu, __perf_event_enable, - event, 1); + cpu_function_call(event->cpu, __perf_event_enable, event); return; } @@ -1178,8 +1237,15 @@ void perf_event_enable(struct perf_event *event) event->state = PERF_EVENT_STATE_OFF; retry: + if (!ctx->is_active) { + __perf_event_mark_enabled(event, ctx); + goto out; + } + raw_spin_unlock_irq(&ctx->lock); - task_oncpu_function_call(task, __perf_event_enable, event); + + if (!task_function_call(task, __perf_event_enable, event)) + return; raw_spin_lock_irq(&ctx->lock); @@ -1187,15 +1253,14 @@ retry: * If the context is active and the event is still off, * we need to retry the cross-call. */ - if (ctx->is_active && event->state == PERF_EVENT_STATE_OFF) + if (ctx->is_active && event->state == PERF_EVENT_STATE_OFF) { + /* + * task could have been flipped by a concurrent + * perf_event_context_sched_out() + */ + task = ctx->task; goto retry; - - /* - * Since we have the lock this context can't be scheduled - * in, so we can change the state safely. 
- */ - if (event->state == PERF_EVENT_STATE_OFF) - __perf_event_mark_enabled(event, ctx); + } out: raw_spin_unlock_irq(&ctx->lock); @@ -1339,8 +1404,8 @@ static void perf_event_sync_stat(struct perf_event_context *ctx, } } -void perf_event_context_sched_out(struct task_struct *task, int ctxn, - struct task_struct *next) +static void perf_event_context_sched_out(struct task_struct *task, int ctxn, + struct task_struct *next) { struct perf_event_context *ctx = task->perf_event_ctxp[ctxn]; struct perf_event_context *next_ctx; @@ -1533,7 +1598,7 @@ static void task_ctx_sched_in(struct perf_event_context *ctx, { struct perf_cpu_context *cpuctx; - cpuctx = __get_cpu_context(ctx); + cpuctx = __get_cpu_context(ctx); if (cpuctx->task_ctx == ctx) return; @@ -1541,7 +1606,7 @@ static void task_ctx_sched_in(struct perf_event_context *ctx, cpuctx->task_ctx = ctx; } -void perf_event_context_sched_in(struct perf_event_context *ctx) +static void perf_event_context_sched_in(struct perf_event_context *ctx) { struct perf_cpu_context *cpuctx; @@ -1627,7 +1692,7 @@ static u64 perf_calculate_period(struct perf_event *event, u64 nsec, u64 count) * Reduce accuracy by one bit such that @a and @b converge * to a similar magnitude. */ -#define REDUCE_FLS(a, b) \ +#define REDUCE_FLS(a, b) \ do { \ if (a##_fls > b##_fls) { \ a >>= 1; \ @@ -2213,6 +2278,9 @@ errout: } +/* + * Returns a matching context with refcount and pincount. + */ static struct perf_event_context * find_get_context(struct pmu *pmu, struct task_struct *task, int cpu) { @@ -2237,6 +2305,7 @@ find_get_context(struct pmu *pmu, struct task_struct *task, int cpu) cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu); ctx = &cpuctx->ctx; get_ctx(ctx); + ++ctx->pin_count; return ctx; } @@ -2250,6 +2319,7 @@ retry: ctx = perf_lock_task_context(task, ctxn, &flags); if (ctx) { unclone_ctx(ctx); + ++ctx->pin_count; raw_spin_unlock_irqrestore(&ctx->lock, flags); } @@ -2271,8 +2341,10 @@ retry: err = -ESRCH; else if (task->perf_event_ctxp[ctxn]) err = -EAGAIN; - else + else { + ++ctx->pin_count; rcu_assign_pointer(task->perf_event_ctxp[ctxn], ctx); + } mutex_unlock(&task->perf_event_mutex); if (unlikely(err)) { @@ -5950,10 +6022,10 @@ SYSCALL_DEFINE5(perf_event_open, struct perf_event_context *gctx = group_leader->ctx; mutex_lock(&gctx->mutex); - perf_event_remove_from_context(group_leader); + perf_remove_from_context(group_leader); list_for_each_entry(sibling, &group_leader->sibling_list, group_entry) { - perf_event_remove_from_context(sibling); + perf_remove_from_context(sibling); put_ctx(gctx); } mutex_unlock(&gctx->mutex); @@ -5976,6 +6048,7 @@ SYSCALL_DEFINE5(perf_event_open, perf_install_in_context(ctx, event, cpu); ++ctx->generation; + perf_unpin_context(ctx); mutex_unlock(&ctx->mutex); event->owner = current; @@ -6001,6 +6074,7 @@ SYSCALL_DEFINE5(perf_event_open, return event_fd; err_context: + perf_unpin_context(ctx); put_ctx(ctx); err_alloc: free_event(event); @@ -6051,6 +6125,7 @@ perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu, mutex_lock(&ctx->mutex); perf_install_in_context(ctx, event, cpu); ++ctx->generation; + perf_unpin_context(ctx); mutex_unlock(&ctx->mutex); return event; @@ -6104,7 +6179,7 @@ __perf_event_exit_task(struct perf_event *child_event, { struct perf_event *parent_event; - perf_event_remove_from_context(child_event); + perf_remove_from_context(child_event); parent_event = child_event->parent; /* @@ -6411,7 +6486,7 @@ inherit_task_group(struct perf_event *event, struct task_struct *parent, return 0; } - child_ctx = 
child->perf_event_ctxp[ctxn]; + child_ctx = child->perf_event_ctxp[ctxn]; if (!child_ctx) { /* * This is executed from the parent task context, so @@ -6526,6 +6601,7 @@ int perf_event_init_context(struct task_struct *child, int ctxn) mutex_unlock(&parent_ctx->mutex); perf_unpin_context(parent_ctx); + put_ctx(parent_ctx); return ret; } @@ -6595,9 +6671,9 @@ static void __perf_event_exit_context(void *__info) perf_pmu_rotate_stop(ctx->pmu); list_for_each_entry_safe(event, tmp, &ctx->pinned_groups, group_entry) - __perf_event_remove_from_context(event); + __perf_remove_from_context(event); list_for_each_entry_safe(event, tmp, &ctx->flexible_groups, group_entry) - __perf_event_remove_from_context(event); + __perf_remove_from_context(event); } static void perf_event_exit_cpu_context(int cpu) diff --git a/kernel/sched.c b/kernel/sched.c index 18d38e4ec7ba..31cb5d5e1aac 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -2265,27 +2265,6 @@ void kick_process(struct task_struct *p) EXPORT_SYMBOL_GPL(kick_process); #endif /* CONFIG_SMP */ -/** - * task_oncpu_function_call - call a function on the cpu on which a task runs - * @p: the task to evaluate - * @func: the function to be called - * @info: the function call argument - * - * Calls the function @func when the task is currently running. This might - * be on the current CPU, which just calls the function directly - */ -void task_oncpu_function_call(struct task_struct *p, - void (*func) (void *info), void *info) -{ - int cpu; - - preempt_disable(); - cpu = task_cpu(p); - if (task_curr(p)) - smp_call_function_single(cpu, func, info, 1); - preempt_enable(); -} - #ifdef CONFIG_SMP /* * ->cpus_allowed is protected by either TASK_WAKING or rq->lock held. @@ -2776,9 +2755,12 @@ static inline void prepare_task_switch(struct rq *rq, struct task_struct *prev, struct task_struct *next) { + sched_info_switch(prev, next); + perf_event_task_sched_out(prev, next); fire_sched_out_preempt_notifiers(prev, next); prepare_lock_switch(rq, next); prepare_arch_switch(next); + trace_sched_switch(prev, next); } /** @@ -2911,7 +2893,7 @@ context_switch(struct rq *rq, struct task_struct *prev, struct mm_struct *mm, *oldmm; prepare_task_switch(rq, prev, next); - trace_sched_switch(prev, next); + mm = next->mm; oldmm = prev->active_mm; /* @@ -3989,9 +3971,6 @@ need_resched_nonpreemptible: rq->skip_clock_update = 0; if (likely(prev != next)) { - sched_info_switch(prev, next); - perf_event_task_sched_out(prev, next); - rq->nr_switches++; rq->curr = next; ++*switch_count; From 725e7580aaf98e9f7b22b8ccfc640ad0c09e2b08 Mon Sep 17 00:00:00 2001 From: Rik van Riel Date: Tue, 1 Feb 2011 09:47:15 -0500 Subject: [PATCH 165/706] sched: Check the right ->nr_running in yield_task_fair() With CONFIG_FAIR_GROUP_SCHED, each task_group has its own cfs_rq. Yielding to a task from another cfs_rq may be worthwhile, since a process calling yield typically cannot use the CPU right now. Therefore, we want to check the per-cpu nr_running, not the cgroup-local one. Signed-off-by: Rik van Riel Signed-off-by: Peter Zijlstra LKML-Reference: <20110201094715.798c4f86@annuminas.surriel.com> Signed-off-by: Ingo Molnar --- kernel/sched_fair.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 55040f3938d8..4de9905228c4 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -1331,7 +1331,7 @@ static void yield_task_fair(struct rq *rq) /* * Are we the only task in the tree?
*/ - if (unlikely(cfs_rq->nr_running == 1)) + if (unlikely(rq->nr_running == 1)) return; clear_buddies(cfs_rq, se); From 2c13c919d9e9a3db9896143a501f83dcbbe1ced4 Mon Sep 17 00:00:00 2001 From: Rik van Riel Date: Tue, 1 Feb 2011 09:48:37 -0500 Subject: [PATCH 166/706] sched: Limit the scope of clear_buddies The clear_buddies function does not seem to play well with the concept of hierarchical runqueues. In the following tree, task groups are represented by 'G', tasks by 'T', next by 'n' and last by 'l':

	      (nl)
	     /    \
	   G(nl)   G
	   /  \     \
	 T(l)  T(n)  T

This situation can arise when a task T(n) is woken up and the previously running task T(l) is marked last. When clear_buddies is called from either T(l) or T(n), the next and last buddies of the group G(nl) will be cleared. This is not the desired result, since we would like to be able to find the other type of buddy in many cases. This is especially a worry when implementing yield_task_fair through the buddy system. The fix is simple: only clear the buddy type that the task itself is indicated to be. As an added bonus, we stop walking up the tree when the buddy has already been cleared or pointed elsewhere. Signed-off-by: Rik van Riel Signed-off-by: Peter Zijlstra LKML-Reference: <20110201094837.6b0962a9@annuminas.surriel.com> Signed-off-by: Ingo Molnar --- kernel/sched_fair.c | 30 +++++++++++++++++++++++------- 1 file changed, 23 insertions(+), 7 deletions(-) diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 4de9905228c4..a785e08202cf 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -995,19 +995,35 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) list_add_leaf_cfs_rq(cfs_rq); } -static void __clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se) +static void __clear_buddies_last(struct sched_entity *se) { - if (!se || cfs_rq->last == se) - cfs_rq->last = NULL; + for_each_sched_entity(se) { + struct cfs_rq *cfs_rq = cfs_rq_of(se); + if (cfs_rq->last == se) + cfs_rq->last = NULL; + else + break; + } +} - if (!se || cfs_rq->next == se) - cfs_rq->next = NULL; +static void __clear_buddies_next(struct sched_entity *se) +{ + for_each_sched_entity(se) { + struct cfs_rq *cfs_rq = cfs_rq_of(se); + if (cfs_rq->next == se) + cfs_rq->next = NULL; + else + break; + } } static void clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se) { - for_each_sched_entity(se) - __clear_buddies(cfs_rq_of(se), se); + if (cfs_rq->last == se) + __clear_buddies_last(se); + + if (cfs_rq->next == se) + __clear_buddies_next(se); } static void From ac53db596cc08ecb8040cfb6f71ae40c6f2041c4 Mon Sep 17 00:00:00 2001 From: Rik van Riel Date: Tue, 1 Feb 2011 09:51:03 -0500 Subject: [PATCH 167/706] sched: Use a buddy to implement yield_task_fair() Use the buddy mechanism to implement yield_task_fair. This allows us to skip to the next-highest-priority se at every level in the CFS tree, unless doing so would introduce gross unfairness in CPU time distribution. We order the buddy selection in pick_next_entity to check yield first, then last, then next. We need next to be able to override yield, because it is possible for the "next" and "yield" task to be different processes in the same sub-tree of the CFS tree. When they are, we need to go into that sub-tree regardless of the "yield" hint, and pick the correct entity once we get to the right level.
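Condensed, the selection order this introduces in pick_next_entity() reads as below (a sketch assembled from the hunks that follow, with the rbtree plumbing and the clear_buddies() call elided; pick_next_entity_sketch() is just an illustrative name):

	static struct sched_entity *pick_next_entity_sketch(struct cfs_rq *cfs_rq)
	{
		struct sched_entity *left = __pick_first_entity(cfs_rq); /* fairest */
		struct sched_entity *se = left;

		/* 1) yield: dodge the skip buddy if the runner-up is not too unfair */
		if (cfs_rq->skip == se) {
			struct sched_entity *second = __pick_next_entity(se);
			if (second && wakeup_preempt_entity(second, left) < 1)
				se = second;
		}

		/* 2) last: prefer handing the CPU back to a preempted task */
		if (cfs_rq->last && wakeup_preempt_entity(cfs_rq->last, left) < 1)
			se = cfs_rq->last;

		/* 3) next: someone really wants this to run, so next overrides last */
		if (cfs_rq->next && wakeup_preempt_entity(cfs_rq->next, left) < 1)
			se = cfs_rq->next;

		return se;
	}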
Signed-off-by: Rik van Riel Signed-off-by: Peter Zijlstra LKML-Reference: <20110201095103.3a79e92a@annuminas.surriel.com> Signed-off-by: Ingo Molnar --- include/linux/sched.h | 2 - kernel/sched.c | 2 +- kernel/sched_debug.c | 2 +- kernel/sched_fair.c | 148 +++++++++++++++++++++++++----------------- kernel/sysctl.c | 7 -- 5 files changed, 90 insertions(+), 71 deletions(-) diff --git a/include/linux/sched.h b/include/linux/sched.h index 0542774914d4..4e9fad271c30 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1942,8 +1942,6 @@ int sched_rt_handler(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos); -extern unsigned int sysctl_sched_compat_yield; - #ifdef CONFIG_SCHED_AUTOGROUP extern unsigned int sysctl_sched_autogroup_enabled; diff --git a/kernel/sched.c b/kernel/sched.c index 477e1bcc63f9..ae5e1a19b9d6 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -324,7 +324,7 @@ struct cfs_rq { * 'curr' points to currently running entity on this cfs_rq. * It is set to NULL otherwise (i.e when none are currently running). */ - struct sched_entity *curr, *next, *last; + struct sched_entity *curr, *next, *last, *skip; unsigned int nr_spread_over; diff --git a/kernel/sched_debug.c b/kernel/sched_debug.c index eb6cb8edd075..7bacd83a4158 100644 --- a/kernel/sched_debug.c +++ b/kernel/sched_debug.c @@ -179,7 +179,7 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) raw_spin_lock_irqsave(&rq->lock, flags); if (cfs_rq->rb_leftmost) - MIN_vruntime = (__pick_next_entity(cfs_rq))->vruntime; + MIN_vruntime = (__pick_first_entity(cfs_rq))->vruntime; last = __pick_last_entity(cfs_rq); if (last) max_vruntime = last->vruntime; diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index a785e08202cf..c0fbeb992833 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -68,14 +68,6 @@ static unsigned int sched_nr_latency = 8; */ unsigned int sysctl_sched_child_runs_first __read_mostly; -/* - * sys_sched_yield() compat mode - * - * This option switches the agressive yield implementation of the - * old scheduler back on. - */ -unsigned int __read_mostly sysctl_sched_compat_yield; - /* * SCHED_OTHER wake-up granularity. 
* (default: 1 msec * (1 + ilog(ncpus)), units: nanoseconds) @@ -419,7 +411,7 @@ static void __dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) rb_erase(&se->run_node, &cfs_rq->tasks_timeline); } -static struct sched_entity *__pick_next_entity(struct cfs_rq *cfs_rq) +static struct sched_entity *__pick_first_entity(struct cfs_rq *cfs_rq) { struct rb_node *left = cfs_rq->rb_leftmost; @@ -429,6 +421,17 @@ static struct sched_entity *__pick_next_entity(struct cfs_rq *cfs_rq) return rb_entry(left, struct sched_entity, run_node); } +static struct sched_entity *__pick_next_entity(struct sched_entity *se) +{ + struct rb_node *next = rb_next(&se->run_node); + + if (!next) + return NULL; + + return rb_entry(next, struct sched_entity, run_node); +} + +#ifdef CONFIG_SCHED_DEBUG static struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq) { struct rb_node *last = rb_last(&cfs_rq->tasks_timeline); @@ -443,7 +446,6 @@ static struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq) * Scheduling class statistics methods: */ -#ifdef CONFIG_SCHED_DEBUG int sched_proc_update_handler(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) @@ -1017,6 +1019,17 @@ static void __clear_buddies_next(struct sched_entity *se) } } +static void __clear_buddies_skip(struct sched_entity *se) +{ + for_each_sched_entity(se) { + struct cfs_rq *cfs_rq = cfs_rq_of(se); + if (cfs_rq->skip == se) + cfs_rq->skip = NULL; + else + break; + } +} + static void clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se) { if (cfs_rq->last == se) @@ -1024,6 +1037,9 @@ static void clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se) if (cfs_rq->next == se) __clear_buddies_next(se); + + if (cfs_rq->skip == se) + __clear_buddies_skip(se); } static void @@ -1099,7 +1115,7 @@ check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr) return; if (cfs_rq->nr_running > 1) { - struct sched_entity *se = __pick_next_entity(cfs_rq); + struct sched_entity *se = __pick_first_entity(cfs_rq); s64 delta = curr->vruntime - se->vruntime; if (delta < 0) @@ -1143,13 +1159,27 @@ set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) static int wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se); +/* + * Pick the next process, keeping these things in mind, in this order: + * 1) keep things fair between processes/task groups + * 2) pick the "next" process, since someone really wants that to run + * 3) pick the "last" process, for cache locality + * 4) do not run the "skip" process, if something else is available + */ static struct sched_entity *pick_next_entity(struct cfs_rq *cfs_rq) { - struct sched_entity *se = __pick_next_entity(cfs_rq); + struct sched_entity *se = __pick_first_entity(cfs_rq); struct sched_entity *left = se; - if (cfs_rq->next && wakeup_preempt_entity(cfs_rq->next, left) < 1) - se = cfs_rq->next; + /* + * Avoid running the skip buddy, if running something else can + * be done without getting too unfair. + */ + if (cfs_rq->skip == se) { + struct sched_entity *second = __pick_next_entity(se); + if (second && wakeup_preempt_entity(second, left) < 1) + se = second; + } /* * Prefer last buddy, try to return the CPU to a preempted task. @@ -1157,6 +1187,12 @@ static struct sched_entity *pick_next_entity(struct cfs_rq *cfs_rq) if (cfs_rq->last && wakeup_preempt_entity(cfs_rq->last, left) < 1) se = cfs_rq->last; + /* + * Someone really wants this to run. If it's not unfair, run it. 
+ */ + if (cfs_rq->next && wakeup_preempt_entity(cfs_rq->next, left) < 1) + se = cfs_rq->next; + clear_buddies(cfs_rq, se); return se; @@ -1333,52 +1369,6 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) hrtick_update(rq); } -/* - * sched_yield() support is very simple - we dequeue and enqueue. - * - * If compat_yield is turned on then we requeue to the end of the tree. - */ -static void yield_task_fair(struct rq *rq) -{ - struct task_struct *curr = rq->curr; - struct cfs_rq *cfs_rq = task_cfs_rq(curr); - struct sched_entity *rightmost, *se = &curr->se; - - /* - * Are we the only task in the tree? - */ - if (unlikely(rq->nr_running == 1)) - return; - - clear_buddies(cfs_rq, se); - - if (likely(!sysctl_sched_compat_yield) && curr->policy != SCHED_BATCH) { - update_rq_clock(rq); - /* - * Update run-time statistics of the 'current'. - */ - update_curr(cfs_rq); - - return; - } - /* - * Find the rightmost entry in the rbtree: - */ - rightmost = __pick_last_entity(cfs_rq); - /* - * Already in the rightmost position? - */ - if (unlikely(!rightmost || entity_before(rightmost, se))) - return; - - /* - * Minimally necessary key value to be last in the tree: - * Upon rescheduling, sched_class::put_prev_task() will place - * 'current' within the tree based on its new key value. - */ - se->vruntime = rightmost->vruntime + 1; -} - #ifdef CONFIG_SMP static void task_waking_fair(struct rq *rq, struct task_struct *p) @@ -1849,6 +1839,14 @@ static void set_next_buddy(struct sched_entity *se) } } +static void set_skip_buddy(struct sched_entity *se) +{ + if (likely(task_of(se)->policy != SCHED_IDLE)) { + for_each_sched_entity(se) + cfs_rq_of(se)->skip = se; + } +} + /* * Preempt the current task with a newly woken task if needed: */ @@ -1947,6 +1945,36 @@ static void put_prev_task_fair(struct rq *rq, struct task_struct *prev) } } +/* + * sched_yield() is very simple + * + * The magic of dealing with the ->skip buddy is in pick_next_entity. + */ +static void yield_task_fair(struct rq *rq) +{ + struct task_struct *curr = rq->curr; + struct cfs_rq *cfs_rq = task_cfs_rq(curr); + struct sched_entity *se = &curr->se; + + /* + * Are we the only task in the tree? + */ + if (unlikely(rq->nr_running == 1)) + return; + + clear_buddies(cfs_rq, se); + + if (curr->policy != SCHED_BATCH) { + update_rq_clock(rq); + /* + * Update run-time statistics of the 'current'. + */ + update_curr(cfs_rq); + } + + set_skip_buddy(se); +} + #ifdef CONFIG_SMP /************************************************** * Fair scheduling class load-balancing methods: diff --git a/kernel/sysctl.c b/kernel/sysctl.c index bc86bb32e126..cbfda7e2416e 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -360,13 +360,6 @@ static struct ctl_table kern_table[] = { .mode = 0644, .proc_handler = sched_rt_handler, }, - { - .procname = "sched_compat_yield", - .data = &sysctl_sched_compat_yield, - .maxlen = sizeof(unsigned int), - .mode = 0644, - .proc_handler = proc_dointvec, - }, #ifdef CONFIG_SCHED_AUTOGROUP { .procname = "sched_autogroup_enabled", From d95f412200652694e63e64bfd49f0ae274a54479 Mon Sep 17 00:00:00 2001 From: Mike Galbraith Date: Tue, 1 Feb 2011 09:50:51 -0500 Subject: [PATCH 168/706] sched: Add yield_to(task, preempt) functionality Currently only implemented for fair class tasks. Add a yield_to_task() method to the fair scheduling class, allowing the caller of yield_to() to accelerate another thread in its thread group or task group.
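A hypothetical caller would look roughly like this (sketch only: struct my_lock and its owner field are invented bookkeeping, yield_to() itself is what this patch adds, and the caller must hold a reference that keeps the target task from going away):

	struct my_lock {
		struct task_struct *owner;	/* invented: tracked by the lock code */
	};

	static void my_boost_lock_holder(struct my_lock *lock)
	{
		struct task_struct *holder = ACCESS_ONCE(lock->owner);

		/*
		 * Hand our timeslice to the lock holder so it can release the
		 * lock sooner; preempt == true also reschedules its CPU.
		 */
		if (!holder || !yield_to(holder, true))
			cpu_relax();	/* no holder, or hint was ignored: spin */
	}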
Implemented via a scheduler hint, using cfs_rq->next to encourage the target to be selected. We can rely on pick_next_entity to keep things fair, so no one can accelerate a thread that has already used its fair share of CPU time. This also means callers should only call yield_to when they really mean it. Calling it too often can result in the scheduler just ignoring the hint. Signed-off-by: Rik van Riel Signed-off-by: Marcelo Tosatti Signed-off-by: Mike Galbraith Signed-off-by: Peter Zijlstra LKML-Reference: <20110201095051.4ddb7738@annuminas.surriel.com> Signed-off-by: Ingo Molnar --- include/linux/sched.h | 2 + kernel/sched.c | 85 +++++++++++++++++++++++++++++++++++++++++++ kernel/sched_fair.c | 20 ++++++++++ 3 files changed, 107 insertions(+) diff --git a/include/linux/sched.h b/include/linux/sched.h index 4e9fad271c30..c88b3bfbd09e 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1058,6 +1058,7 @@ struct sched_class { void (*enqueue_task) (struct rq *rq, struct task_struct *p, int flags); void (*dequeue_task) (struct rq *rq, struct task_struct *p, int flags); void (*yield_task) (struct rq *rq); + bool (*yield_to_task) (struct rq *rq, struct task_struct *p, bool preempt); void (*check_preempt_curr) (struct rq *rq, struct task_struct *p, int flags); @@ -1972,6 +1973,7 @@ static inline int rt_mutex_getprio(struct task_struct *p) # define rt_mutex_adjust_pi(p) do { } while (0) #endif +extern bool yield_to(struct task_struct *p, bool preempt); extern void set_user_nice(struct task_struct *p, long nice); extern int task_prio(const struct task_struct *p); extern int task_nice(const struct task_struct *p); diff --git a/kernel/sched.c b/kernel/sched.c index ae5e1a19b9d6..2effcb71a478 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -1686,6 +1686,39 @@ static void double_rq_unlock(struct rq *rq1, struct rq *rq2) __release(rq2->lock); } +#else /* CONFIG_SMP */ + +/* + * double_rq_lock - safely lock two runqueues + * + * Note this does not disable interrupts like task_rq_lock, + * you need to do so manually before calling. + */ +static void double_rq_lock(struct rq *rq1, struct rq *rq2) + __acquires(rq1->lock) + __acquires(rq2->lock) +{ + BUG_ON(!irqs_disabled()); + BUG_ON(rq1 != rq2); + raw_spin_lock(&rq1->lock); + __acquire(rq2->lock); /* Fake it out ;) */ +} + +/* + * double_rq_unlock - safely unlock two runqueues + * + * Note this does not restore interrupts like task_rq_unlock, + * you need to do so manually after calling. + */ +static void double_rq_unlock(struct rq *rq1, struct rq *rq2) + __releases(rq1->lock) + __releases(rq2->lock) +{ + BUG_ON(rq1 != rq2); + raw_spin_unlock(&rq1->lock); + __release(rq2->lock); +} + #endif static void calc_load_account_idle(struct rq *this_rq); @@ -5448,6 +5481,58 @@ void __sched yield(void) } EXPORT_SYMBOL(yield); +/** + * yield_to - yield the current processor to another thread in + * your thread group, or accelerate that thread toward the + * processor it's on. + * + * It's the caller's job to ensure that the target task struct + * can't go away on us before we can do any checks. + * + * Returns true if we indeed boosted the target task.
+ */ +bool __sched yield_to(struct task_struct *p, bool preempt) +{ + struct task_struct *curr = current; + struct rq *rq, *p_rq; + unsigned long flags; + bool yielded = 0; + + local_irq_save(flags); + rq = this_rq(); + +again: + p_rq = task_rq(p); + double_rq_lock(rq, p_rq); + while (task_rq(p) != p_rq) { + double_rq_unlock(rq, p_rq); + goto again; + } + + if (!curr->sched_class->yield_to_task) + goto out; + + if (curr->sched_class != p->sched_class) + goto out; + + if (task_running(p_rq, p) || p->state) + goto out; + + yielded = curr->sched_class->yield_to_task(rq, p, preempt); + if (yielded) + schedstat_inc(rq, yld_count); + +out: + double_rq_unlock(rq, p_rq); + local_irq_restore(flags); + + if (yielded) + schedule(); + + return yielded; +} +EXPORT_SYMBOL_GPL(yield_to); + /* * This task is about to go to sleep on IO. Increment rq->nr_iowait so * that process accounting knows that this is a task in IO wait state. diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index c0fbeb992833..027024694043 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -1975,6 +1975,25 @@ static void yield_task_fair(struct rq *rq) set_skip_buddy(se); } +static bool yield_to_task_fair(struct rq *rq, struct task_struct *p, bool preempt) +{ + struct sched_entity *se = &p->se; + + if (!se->on_rq) + return false; + + /* Tell the scheduler that we'd really like pse to run next. */ + set_next_buddy(se); + + /* Make p's CPU reschedule; pick_next_entity takes care of fairness. */ + if (preempt) + resched_task(rq->curr); + + yield_task_fair(rq); + + return true; +} + #ifdef CONFIG_SMP /************************************************** * Fair scheduling class load-balancing methods: @@ -4243,6 +4262,7 @@ static const struct sched_class fair_sched_class = { .enqueue_task = enqueue_task_fair, .dequeue_task = dequeue_task_fair, .yield_task = yield_task_fair, + .yield_to_task = yield_to_task_fair, .check_preempt_curr = check_preempt_wakeup, From 04bea68b2f0eeebb089ecc67b618795925268b4a Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Mon, 24 Jan 2011 09:58:55 +0530 Subject: [PATCH 169/706] of/pci: move of_irq_map_pci() into generic code There is a tiny difference between PPC32 and PPC64. Microblaze uses the PPC32 variant. 
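For reference, a platform's PCI interrupt setup would use the consolidated helper roughly as follows (a sketch patterned on the existing powerpc callers of of_irq_map_pci(); my_map_pci_irq() is an invented name):

	static int my_map_pci_irq(struct pci_dev *pdev)
	{
		struct of_irq oirq;
		unsigned int virq;

		/* Walks the OF tree, swizzling through bridges as needed. */
		if (of_irq_map_pci(pdev, &oirq))
			return -ENODEV;

		virq = irq_create_of_mapping(oirq.controller, oirq.specifier,
					     oirq.size);
		if (!virq)
			return -ENODEV;

		pdev->irq = virq;
		return 0;
	}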
Signed-off-by: Sebastian Andrzej Siewior [grant.likely@secretlab.ca: Added comment to #endif, moved documentation block to function implementation, fixed for non ppc and microblaze compiles] Signed-off-by: Grant Likely --- arch/microblaze/include/asm/pci-bridge.h | 12 ++++ arch/microblaze/include/asm/prom.h | 15 ---- arch/microblaze/kernel/prom_parse.c | 77 -------------------- arch/microblaze/pci/pci-common.c | 1 + arch/powerpc/include/asm/pci-bridge.h | 10 +++ arch/powerpc/include/asm/prom.h | 15 ---- arch/powerpc/kernel/pci-common.c | 1 + arch/powerpc/kernel/prom_parse.c | 84 ---------------------- drivers/of/Kconfig | 6 ++ drivers/of/Makefile | 1 + drivers/of/of_pci.c | 91 ++++++++++++++++++++++++ include/linux/of_pci.h | 9 +++ 12 files changed, 131 insertions(+), 191 deletions(-) create mode 100644 drivers/of/of_pci.c create mode 100644 include/linux/of_pci.h diff --git a/arch/microblaze/include/asm/pci-bridge.h b/arch/microblaze/include/asm/pci-bridge.h index 0c68764ab547..10717669e0c2 100644 --- a/arch/microblaze/include/asm/pci-bridge.h +++ b/arch/microblaze/include/asm/pci-bridge.h @@ -104,11 +104,22 @@ struct pci_controller { int global_number; /* PCI domain number */ }; +#ifdef CONFIG_PCI static inline struct pci_controller *pci_bus_to_host(const struct pci_bus *bus) { return bus->sysdata; } +static inline struct device_node *pci_bus_to_OF_node(struct pci_bus *bus) +{ + struct pci_controller *host; + + if (bus->self) + return pci_device_to_OF_node(bus->self); + host = pci_bus_to_host(bus); + return host ? host->dn : NULL; +} + static inline int isa_vaddr_is_ioport(void __iomem *address) { /* No specific ISA handling on ppc32 at this stage, it @@ -116,6 +127,7 @@ static inline int isa_vaddr_is_ioport(void __iomem *address) */ return 0; } +#endif /* CONFIG_PCI */ /* These are used for config access before all the PCI probing has been done. */ diff --git a/arch/microblaze/include/asm/prom.h b/arch/microblaze/include/asm/prom.h index 2e72af078b05..d0890d36ef61 100644 --- a/arch/microblaze/include/asm/prom.h +++ b/arch/microblaze/include/asm/prom.h @@ -64,21 +64,6 @@ extern void kdump_move_device_tree(void); /* CPU OF node matching */ struct device_node *of_get_cpu_node(int cpu, unsigned int *thread); -/** - * of_irq_map_pci - Resolve the interrupt for a PCI device - * @pdev: the device whose interrupt is to be resolved - * @out_irq: structure of_irq filled by this function - * - * This function resolves the PCI interrupt for a given PCI device. If a - * device-node exists for a given pci_dev, it will use normal OF tree - * walking. If not, it will implement standard swizzling and walk up the - * PCI tree until an device-node is found, at which point it will finish - * resolving using the OF tree walking. 
- */ -struct pci_dev; -struct of_irq; -extern int of_irq_map_pci(struct pci_dev *pdev, struct of_irq *out_irq); - #endif /* __ASSEMBLY__ */ #endif /* __KERNEL__ */ diff --git a/arch/microblaze/kernel/prom_parse.c b/arch/microblaze/kernel/prom_parse.c index 9ae24f4b882b..47187cc2cf00 100644 --- a/arch/microblaze/kernel/prom_parse.c +++ b/arch/microblaze/kernel/prom_parse.c @@ -2,88 +2,11 @@ #include #include -#include #include #include #include #include #include -#include - -#ifdef CONFIG_PCI -int of_irq_map_pci(struct pci_dev *pdev, struct of_irq *out_irq) -{ - struct device_node *dn, *ppnode; - struct pci_dev *ppdev; - u32 lspec; - u32 laddr[3]; - u8 pin; - int rc; - - /* Check if we have a device node, if yes, fallback to standard OF - * parsing - */ - dn = pci_device_to_OF_node(pdev); - if (dn) - return of_irq_map_one(dn, 0, out_irq); - - /* Ok, we don't, time to have fun. Let's start by building up an - * interrupt spec. we assume #interrupt-cells is 1, which is standard - * for PCI. If you do different, then don't use that routine. - */ - rc = pci_read_config_byte(pdev, PCI_INTERRUPT_PIN, &pin); - if (rc != 0) - return rc; - /* No pin, exit */ - if (pin == 0) - return -ENODEV; - - /* Now we walk up the PCI tree */ - lspec = pin; - for (;;) { - /* Get the pci_dev of our parent */ - ppdev = pdev->bus->self; - - /* Ouch, it's a host bridge... */ - if (ppdev == NULL) { - struct pci_controller *host; - host = pci_bus_to_host(pdev->bus); - ppnode = host ? host->dn : NULL; - /* No node for host bridge ? give up */ - if (ppnode == NULL) - return -EINVAL; - } else - /* We found a P2P bridge, check if it has a node */ - ppnode = pci_device_to_OF_node(ppdev); - - /* Ok, we have found a parent with a device-node, hand over to - * the OF parsing code. - * We build a unit address from the linux device to be used for - * resolution. Note that we use the linux bus number which may - * not match your firmware bus numbering. - * Fortunately, in most cases, interrupt-map-mask doesn't - * include the bus number as part of the matching. - * You should still be careful about that though if you intend - * to rely on this function (you ship a firmware that doesn't - * create device nodes for all PCI devices). 
- */ - if (ppnode) - break; - - /* We can only get here if we hit a P2P bridge with no node, - * let's do standard swizzling and try again - */ - lspec = pci_swizzle_interrupt_pin(pdev, lspec); - pdev = ppdev; - } - - laddr[0] = (pdev->bus->number << 16) - | (pdev->devfn << 8); - laddr[1] = laddr[2] = 0; - return of_irq_map_raw(ppnode, &lspec, 1, laddr, out_irq); -} -EXPORT_SYMBOL_GPL(of_irq_map_pci); -#endif /* CONFIG_PCI */ void of_parse_dma_window(struct device_node *dn, const void *dma_window_prop, unsigned long *busno, unsigned long *phys, unsigned long *size) diff --git a/arch/microblaze/pci/pci-common.c b/arch/microblaze/pci/pci-common.c index e363615d6798..1e01a1253631 100644 --- a/arch/microblaze/pci/pci-common.c +++ b/arch/microblaze/pci/pci-common.c @@ -29,6 +29,7 @@ #include #include #include +#include #include #include diff --git a/arch/powerpc/include/asm/pci-bridge.h b/arch/powerpc/include/asm/pci-bridge.h index 51e9e6f90d12..edeb80fdd2c3 100644 --- a/arch/powerpc/include/asm/pci-bridge.h +++ b/arch/powerpc/include/asm/pci-bridge.h @@ -171,6 +171,16 @@ static inline struct pci_controller *pci_bus_to_host(const struct pci_bus *bus) return bus->sysdata; } +static inline struct device_node *pci_bus_to_OF_node(struct pci_bus *bus) +{ + struct pci_controller *host; + + if (bus->self) + return pci_device_to_OF_node(bus->self); + host = pci_bus_to_host(bus); + return host ? host->dn : NULL; +} + static inline int isa_vaddr_is_ioport(void __iomem *address) { /* No specific ISA handling on ppc32 at this stage, it diff --git a/arch/powerpc/include/asm/prom.h b/arch/powerpc/include/asm/prom.h index d72757585595..c189aa5fe1f4 100644 --- a/arch/powerpc/include/asm/prom.h +++ b/arch/powerpc/include/asm/prom.h @@ -70,21 +70,6 @@ static inline int of_node_to_nid(struct device_node *device) { return 0; } #endif #define of_node_to_nid of_node_to_nid -/** - * of_irq_map_pci - Resolve the interrupt for a PCI device - * @pdev: the device whose interrupt is to be resolved - * @out_irq: structure of_irq filled by this function - * - * This function resolves the PCI interrupt for a given PCI device. If a - * device-node exists for a given pci_dev, it will use normal OF tree - * walking. If not, it will implement standard swizzling and walk up the - * PCI tree until an device-node is found, at which point it will finish - * resolving using the OF tree walking. 
- */ -struct pci_dev; -struct of_irq; -extern int of_irq_map_pci(struct pci_dev *pdev, struct of_irq *out_irq); - extern void of_instantiate_rtc(void); /* These includes are put at the bottom because they may contain things diff --git a/arch/powerpc/kernel/pci-common.c b/arch/powerpc/kernel/pci-common.c index 10a44e68ef11..eb341be9a4d9 100644 --- a/arch/powerpc/kernel/pci-common.c +++ b/arch/powerpc/kernel/pci-common.c @@ -22,6 +22,7 @@ #include #include #include +#include #include #include #include diff --git a/arch/powerpc/kernel/prom_parse.c b/arch/powerpc/kernel/prom_parse.c index c2b7a07cc3d3..47187cc2cf00 100644 --- a/arch/powerpc/kernel/prom_parse.c +++ b/arch/powerpc/kernel/prom_parse.c @@ -2,95 +2,11 @@ #include #include -#include #include #include #include #include #include -#include - -#ifdef CONFIG_PCI -int of_irq_map_pci(struct pci_dev *pdev, struct of_irq *out_irq) -{ - struct device_node *dn, *ppnode; - struct pci_dev *ppdev; - u32 lspec; - u32 laddr[3]; - u8 pin; - int rc; - - /* Check if we have a device node, if yes, fallback to standard OF - * parsing - */ - dn = pci_device_to_OF_node(pdev); - if (dn) { - rc = of_irq_map_one(dn, 0, out_irq); - if (!rc) - return rc; - } - - /* Ok, we don't, time to have fun. Let's start by building up an - * interrupt spec. we assume #interrupt-cells is 1, which is standard - * for PCI. If you do different, then don't use that routine. - */ - rc = pci_read_config_byte(pdev, PCI_INTERRUPT_PIN, &pin); - if (rc != 0) - return rc; - /* No pin, exit */ - if (pin == 0) - return -ENODEV; - - /* Now we walk up the PCI tree */ - lspec = pin; - for (;;) { - /* Get the pci_dev of our parent */ - ppdev = pdev->bus->self; - - /* Ouch, it's a host bridge... */ - if (ppdev == NULL) { -#ifdef CONFIG_PPC64 - ppnode = pci_bus_to_OF_node(pdev->bus); -#else - struct pci_controller *host; - host = pci_bus_to_host(pdev->bus); - ppnode = host ? host->dn : NULL; -#endif - /* No node for host bridge ? give up */ - if (ppnode == NULL) - return -EINVAL; - } else - /* We found a P2P bridge, check if it has a node */ - ppnode = pci_device_to_OF_node(ppdev); - - /* Ok, we have found a parent with a device-node, hand over to - * the OF parsing code. - * We build a unit address from the linux device to be used for - * resolution. Note that we use the linux bus number which may - * not match your firmware bus numbering. - * Fortunately, in most cases, interrupt-map-mask doesn't include - * the bus number as part of the matching. - * You should still be careful about that though if you intend - * to rely on this function (you ship a firmware that doesn't - * create device nodes for all PCI devices). 
- */ - if (ppnode) - break; - - /* We can only get here if we hit a P2P bridge with no node, - * let's do standard swizzling and try again - */ - lspec = pci_swizzle_interrupt_pin(pdev, lspec); - pdev = ppdev; - } - - laddr[0] = (pdev->bus->number << 16) - | (pdev->devfn << 8); - laddr[1] = laddr[2] = 0; - return of_irq_map_raw(ppnode, &lspec, 1, laddr, out_irq); -} -EXPORT_SYMBOL_GPL(of_irq_map_pci); -#endif /* CONFIG_PCI */ void of_parse_dma_window(struct device_node *dn, const void *dma_window_prop, unsigned long *busno, unsigned long *phys, unsigned long *size) diff --git a/drivers/of/Kconfig b/drivers/of/Kconfig index 3c6e100a3ad0..efabbf9dd607 100644 --- a/drivers/of/Kconfig +++ b/drivers/of/Kconfig @@ -69,4 +69,10 @@ config OF_MDIO help OpenFirmware MDIO bus (Ethernet PHY) accessors +config OF_PCI + def_tristate PCI + depends on PCI && (PPC || MICROBLAZE) + help + OpenFirmware PCI bus accessors + endmenu # OF diff --git a/drivers/of/Makefile b/drivers/of/Makefile index 3ab21a0a4907..f7861ed2f287 100644 --- a/drivers/of/Makefile +++ b/drivers/of/Makefile @@ -9,3 +9,4 @@ obj-$(CONFIG_OF_I2C) += of_i2c.o obj-$(CONFIG_OF_NET) += of_net.o obj-$(CONFIG_OF_SPI) += of_spi.o obj-$(CONFIG_OF_MDIO) += of_mdio.o +obj-$(CONFIG_OF_PCI) += of_pci.o diff --git a/drivers/of/of_pci.c b/drivers/of/of_pci.c new file mode 100644 index 000000000000..314535fa32c1 --- /dev/null +++ b/drivers/of/of_pci.c @@ -0,0 +1,91 @@ +#include +#include +#include + +/** + * of_irq_map_pci - Resolve the interrupt for a PCI device + * @pdev: the device whose interrupt is to be resolved + * @out_irq: structure of_irq filled by this function + * + * This function resolves the PCI interrupt for a given PCI device. If a + * device-node exists for a given pci_dev, it will use normal OF tree + * walking. If not, it will implement standard swizzling and walk up the + * PCI tree until an device-node is found, at which point it will finish + * resolving using the OF tree walking. + */ +int of_irq_map_pci(struct pci_dev *pdev, struct of_irq *out_irq) +{ + struct device_node *dn, *ppnode; + struct pci_dev *ppdev; + u32 lspec; + __be32 lspec_be; + __be32 laddr[3]; + u8 pin; + int rc; + + /* Check if we have a device node, if yes, fallback to standard + * device tree parsing + */ + dn = pci_device_to_OF_node(pdev); + if (dn) { + rc = of_irq_map_one(dn, 0, out_irq); + if (!rc) + return rc; + } + + /* Ok, we don't, time to have fun. Let's start by building up an + * interrupt spec. we assume #interrupt-cells is 1, which is standard + * for PCI. If you do different, then don't use that routine. + */ + rc = pci_read_config_byte(pdev, PCI_INTERRUPT_PIN, &pin); + if (rc != 0) + return rc; + /* No pin, exit */ + if (pin == 0) + return -ENODEV; + + /* Now we walk up the PCI tree */ + lspec = pin; + for (;;) { + /* Get the pci_dev of our parent */ + ppdev = pdev->bus->self; + + /* Ouch, it's a host bridge... */ + if (ppdev == NULL) { + ppnode = pci_bus_to_OF_node(pdev->bus); + + /* No node for host bridge ? give up */ + if (ppnode == NULL) + return -EINVAL; + } else { + /* We found a P2P bridge, check if it has a node */ + ppnode = pci_device_to_OF_node(ppdev); + } + + /* Ok, we have found a parent with a device-node, hand over to + * the OF parsing code. + * We build a unit address from the linux device to be used for + * resolution. Note that we use the linux bus number which may + * not match your firmware bus numbering. + * Fortunately, in most cases, interrupt-map-mask doesn't + * include the bus number as part of the matching. 
+ * You should still be careful about that though if you intend + * to rely on this function (you ship a firmware that doesn't + * create device nodes for all PCI devices). + */ + if (ppnode) + break; + + /* We can only get here if we hit a P2P bridge with no node, + * let's do standard swizzling and try again + */ + lspec = pci_swizzle_interrupt_pin(pdev, lspec); + pdev = ppdev; + } + + lspec_be = cpu_to_be32(lspec); + laddr[0] = cpu_to_be32((pdev->bus->number << 16) | (pdev->devfn << 8)); + laddr[1] = laddr[2] = cpu_to_be32(0); + return of_irq_map_raw(ppnode, &lspec_be, 1, laddr, out_irq); +} +EXPORT_SYMBOL_GPL(of_irq_map_pci); diff --git a/include/linux/of_pci.h b/include/linux/of_pci.h new file mode 100644 index 000000000000..85a27b650d76 --- /dev/null +++ b/include/linux/of_pci.h @@ -0,0 +1,9 @@ +#ifndef __OF_PCI_H +#define __OF_PCI_H + +#include + +struct pci_dev; +struct of_irq; +int of_irq_map_pci(struct pci_dev *pdev, struct of_irq *out_irq); +#endif From 764328d3209dd81b02a55722556b07b6f35e3ca0 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Fri, 4 Feb 2011 07:33:24 -0200 Subject: [PATCH 170/706] perf top: Remove superfluous name_len field From the sym_entry struct, struct symbol already has this field. Cc: Frederic Weisbecker Cc: Ingo Molnar Cc: Mike Galbraith Cc: Paul Mackerras Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Tom Zanussi LKML-Reference: Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-top.c | 3 --- tools/perf/util/top.c | 5 +++-- tools/perf/util/top.h | 1 - 3 files changed, 3 insertions(+), 6 deletions(-) diff --git a/tools/perf/builtin-top.c b/tools/perf/builtin-top.c index 104de9ab314c..154e088588bc 100644 --- a/tools/perf/builtin-top.c +++ b/tools/perf/builtin-top.c @@ -787,9 +787,6 @@ static int symbol_filter(struct map *map, struct symbol *sym) } } - if (!syme->skip) - syme->name_len = strlen(sym->name); - return 0; } diff --git a/tools/perf/util/top.c b/tools/perf/util/top.c index 1d2e2652cd68..70a9c13f4ad5 100644 --- a/tools/perf/util/top.c +++ b/tools/perf/util/top.c @@ -200,6 +200,7 @@ void perf_top__find_widths(struct perf_top *top, struct rb_root *root, for (nd = rb_first(root); nd; nd = rb_next(nd)) { struct sym_entry *syme = rb_entry(nd, struct sym_entry, rb_node); + struct symbol *sym = sym_entry__symbol(syme); if (++printed > top->print_entries || (int)syme->snap_count < top->count_filter) @@ -211,7 +212,7 @@ void perf_top__find_widths(struct perf_top *top, struct rb_root *root, if (syme->map->dso->short_name_len > *dso_short_width) *dso_short_width = syme->map->dso->short_name_len; - if (syme->name_len > *sym_width) - *sym_width = syme->name_len; + if (sym->namelen > *sym_width) + *sym_width = sym->namelen; } } diff --git a/tools/perf/util/top.h b/tools/perf/util/top.h index 611370fa7df8..500950818d2f 100644 --- a/tools/perf/util/top.h +++ b/tools/perf/util/top.h @@ -31,7 +31,6 @@ struct sym_entry { unsigned long snap_count; double weight; int skip; - u16 name_len; u8 origin; struct map *map; struct sym_entry_source *src; From 78f7defedbb4da73b9a07635c357c1afcaa55c8f Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Fri, 4 Feb 2011 09:45:46 -0200 Subject: [PATCH 171/706] perf annotate: Move annotate functions to util/ They will be used by perf top, so that we have just one set of routines to do annotation. Rename "struct sym_priv" to "struct annotation", etc, to clarify this code a bit. 
Rename "struct sym_ext" to "struct source_line", to give it a meaningful name, that clarifies that it is a the result of an addr2line call, that is sorted by percentage one particular source code line appeared in the annotation. And since we're moving things around also rename 'sym_hist->ip' to 'sym_hist->addr' as we want to do data structure annotation at some point. Cc: Frederic Weisbecker Cc: Ingo Molnar Cc: Mike Galbraith Cc: Paul Mackerras Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Tom Zanussi LKML-Reference: Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/Makefile | 2 + tools/perf/builtin-annotate.c | 255 +------------- tools/perf/builtin-report.c | 3 +- tools/perf/util/annotate.c | 467 +++++++++++++++++++++++++ tools/perf/util/annotate.h | 65 ++++ tools/perf/util/hist.c | 219 +----------- tools/perf/util/hist.h | 27 -- tools/perf/util/ui/browsers/annotate.c | 43 ++- 8 files changed, 578 insertions(+), 503 deletions(-) create mode 100644 tools/perf/util/annotate.c create mode 100644 tools/perf/util/annotate.h diff --git a/tools/perf/Makefile b/tools/perf/Makefile index 4c9499cb4398..be3eb1dc9a5a 100644 --- a/tools/perf/Makefile +++ b/tools/perf/Makefile @@ -401,6 +401,7 @@ LIB_H += util/include/dwarf-regs.h LIB_H += util/include/asm/dwarf2.h LIB_H += util/include/asm/cpufeature.h LIB_H += perf.h +LIB_H += util/annotate.h LIB_H += util/cache.h LIB_H += util/callchain.h LIB_H += util/build-id.h @@ -444,6 +445,7 @@ LIB_H += $(ARCH_INCLUDE) LIB_OBJS += $(OUTPUT)util/abspath.o LIB_OBJS += $(OUTPUT)util/alias.o +LIB_OBJS += $(OUTPUT)util/annotate.o LIB_OBJS += $(OUTPUT)util/build-id.o LIB_OBJS += $(OUTPUT)util/config.o LIB_OBJS += $(OUTPUT)util/ctype.o diff --git a/tools/perf/builtin-annotate.c b/tools/perf/builtin-annotate.c index cd9dec46c19f..9072ef44cfcb 100644 --- a/tools/perf/builtin-annotate.c +++ b/tools/perf/builtin-annotate.c @@ -9,6 +9,7 @@ #include "util/util.h" +#include "util/util.h" #include "util/color.h" #include #include "util/cache.h" @@ -18,6 +19,7 @@ #include "perf.h" #include "util/debug.h" +#include "util/annotate.h" #include "util/event.h" #include "util/parse-options.h" #include "util/parse-events.h" @@ -79,245 +81,10 @@ static int process_sample_event(union perf_event *event, return 0; } -static int objdump_line__print(struct objdump_line *self, - struct list_head *head, - struct hist_entry *he, u64 len) -{ - struct symbol *sym = he->ms.sym; - static const char *prev_line; - static const char *prev_color; - - if (self->offset != -1) { - const char *path = NULL; - unsigned int hits = 0; - double percent = 0.0; - const char *color; - struct sym_priv *priv = symbol__priv(sym); - struct sym_ext *sym_ext = priv->ext; - struct sym_hist *h = priv->hist; - s64 offset = self->offset; - struct objdump_line *next = objdump__get_next_ip_line(head, self); - - while (offset < (s64)len && - (next == NULL || offset < next->offset)) { - if (sym_ext) { - if (path == NULL) - path = sym_ext[offset].path; - percent += sym_ext[offset].percent; - } else - hits += h->ip[offset]; - - ++offset; - } - - if (sym_ext == NULL && h->sum) - percent = 100.0 * hits / h->sum; - - color = get_percent_color(percent); - - /* - * Also color the filename and line if needed, with - * the same color than the percentage. 
Don't print it - * twice for close colored ip with the same filename:line - */ - if (path) { - if (!prev_line || strcmp(prev_line, path) - || color != prev_color) { - color_fprintf(stdout, color, " %s", path); - prev_line = path; - prev_color = color; - } - } - - color_fprintf(stdout, color, " %7.2f", percent); - printf(" : "); - color_fprintf(stdout, PERF_COLOR_BLUE, "%s\n", self->line); - } else { - if (!*self->line) - printf(" :\n"); - else - printf(" : %s\n", self->line); - } - - return 0; -} - -static struct rb_root root_sym_ext; - -static void insert_source_line(struct sym_ext *sym_ext) -{ - struct sym_ext *iter; - struct rb_node **p = &root_sym_ext.rb_node; - struct rb_node *parent = NULL; - - while (*p != NULL) { - parent = *p; - iter = rb_entry(parent, struct sym_ext, node); - - if (sym_ext->percent > iter->percent) - p = &(*p)->rb_left; - else - p = &(*p)->rb_right; - } - - rb_link_node(&sym_ext->node, parent, p); - rb_insert_color(&sym_ext->node, &root_sym_ext); -} - -static void free_source_line(struct hist_entry *he, int len) -{ - struct sym_priv *priv = symbol__priv(he->ms.sym); - struct sym_ext *sym_ext = priv->ext; - int i; - - if (!sym_ext) - return; - - for (i = 0; i < len; i++) - free(sym_ext[i].path); - free(sym_ext); - - priv->ext = NULL; - root_sym_ext = RB_ROOT; -} - -/* Get the filename:line for the colored entries */ -static void -get_source_line(struct hist_entry *he, int len, const char *filename) -{ - struct symbol *sym = he->ms.sym; - u64 start; - int i; - char cmd[PATH_MAX * 2]; - struct sym_ext *sym_ext; - struct sym_priv *priv = symbol__priv(sym); - struct sym_hist *h = priv->hist; - - if (!h->sum) - return; - - sym_ext = priv->ext = calloc(len, sizeof(struct sym_ext)); - if (!priv->ext) - return; - - start = he->ms.map->unmap_ip(he->ms.map, sym->start); - - for (i = 0; i < len; i++) { - char *path = NULL; - size_t line_len; - u64 offset; - FILE *fp; - - sym_ext[i].percent = 100.0 * h->ip[i] / h->sum; - if (sym_ext[i].percent <= 0.5) - continue; - - offset = start + i; - sprintf(cmd, "addr2line -e %s %016" PRIx64, filename, offset); - fp = popen(cmd, "r"); - if (!fp) - continue; - - if (getline(&path, &line_len, fp) < 0 || !line_len) - goto next; - - sym_ext[i].path = malloc(sizeof(char) * line_len + 1); - if (!sym_ext[i].path) - goto next; - - strcpy(sym_ext[i].path, path); - insert_source_line(&sym_ext[i]); - - next: - pclose(fp); - } -} - -static void print_summary(const char *filename) -{ - struct sym_ext *sym_ext; - struct rb_node *node; - - printf("\nSorted summary for file %s\n", filename); - printf("----------------------------------------------\n\n"); - - if (RB_EMPTY_ROOT(&root_sym_ext)) { - printf(" Nothing higher than %1.1f%%\n", MIN_GREEN); - return; - } - - node = rb_first(&root_sym_ext); - while (node) { - double percent; - const char *color; - char *path; - - sym_ext = rb_entry(node, struct sym_ext, node); - percent = sym_ext->percent; - color = get_percent_color(percent); - path = sym_ext->path; - - color_fprintf(stdout, color, " %7.2f %s", percent, path); - node = rb_next(node); - } -} - -static void hist_entry__print_hits(struct hist_entry *self) -{ - struct symbol *sym = self->ms.sym; - struct sym_priv *priv = symbol__priv(sym); - struct sym_hist *h = priv->hist; - u64 len = sym->end - sym->start, offset; - - for (offset = 0; offset < len; ++offset) - if (h->ip[offset] != 0) - printf("%*" PRIx64 ": %" PRIu64 "\n", BITS_PER_LONG / 2, - sym->start + offset, h->ip[offset]); - printf("%*s: %" PRIu64 "\n", BITS_PER_LONG / 2, "h->sum", h->sum); 
-} - static int hist_entry__tty_annotate(struct hist_entry *he) { - struct map *map = he->ms.map; - struct dso *dso = map->dso; - struct symbol *sym = he->ms.sym; - const char *filename = dso->long_name, *d_filename; - u64 len; - LIST_HEAD(head); - struct objdump_line *pos, *n; - - if (hist_entry__annotate(he, &head, 0) < 0) - return -1; - - if (full_paths) - d_filename = filename; - else - d_filename = basename(filename); - - len = sym->end - sym->start; - - if (print_line) { - get_source_line(he, len, filename); - print_summary(filename); - } - - printf("\n\n------------------------------------------------\n"); - printf(" Percent | Source code & Disassembly of %s\n", d_filename); - printf("------------------------------------------------\n"); - - if (verbose) - hist_entry__print_hits(he); - - list_for_each_entry_safe(pos, n, &head, node) { - objdump_line__print(pos, &head, he, len); - list_del(&pos->node); - objdump_line__free(pos); - } - - if (print_line) - free_source_line(he, len); - - return 0; + return symbol__tty_annotate(he->ms.sym, he->ms.map, + print_line, full_paths); } static void hists__find_annotations(struct hists *self) @@ -327,13 +94,13 @@ static void hists__find_annotations(struct hists *self) while (nd) { struct hist_entry *he = rb_entry(nd, struct hist_entry, rb_node); - struct sym_priv *priv; + struct annotation *notes; if (he->ms.sym == NULL || he->ms.map->dso->annotate_warned) goto find_next; - priv = symbol__priv(he->ms.sym); - if (priv->hist == NULL) { + notes = symbol__annotation(he->ms.sym); + if (notes->histogram == NULL) { find_next: if (key == KEY_LEFT) nd = rb_prev(nd); @@ -362,11 +129,11 @@ find_next: nd = rb_next(nd); /* * Since we have a hist_entry per IP for the same - * symbol, free he->ms.sym->hist to signal we already + * symbol, free he->ms.sym->histogram to signal we already * processed this symbol. */ - free(priv->hist); - priv->hist = NULL; + free(notes->histogram); + notes->histogram = NULL; } } } @@ -454,7 +221,7 @@ int cmd_annotate(int argc, const char **argv, const char *prefix __used) setup_browser(true); - symbol_conf.priv_size = sizeof(struct sym_priv); + symbol_conf.priv_size = sizeof(struct annotation); symbol_conf.try_vmlinux_path = true; if (symbol__init() < 0) diff --git a/tools/perf/builtin-report.c b/tools/perf/builtin-report.c index 080937c3a656..91e4cdba933b 100644 --- a/tools/perf/builtin-report.c +++ b/tools/perf/builtin-report.c @@ -9,6 +9,7 @@ #include "util/util.h" +#include "util/annotate.h" #include "util/color.h" #include #include "util/cache.h" @@ -508,7 +509,7 @@ int cmd_report(int argc, const char **argv, const char *prefix __used) * implementation. */ if (use_browser > 0) { - symbol_conf.priv_size = sizeof(struct sym_priv); + symbol_conf.priv_size = sizeof(struct annotation); /* * For searching by name on the "Browse map details". * providing it only in verbose mode not to bloat too diff --git a/tools/perf/util/annotate.c b/tools/perf/util/annotate.c new file mode 100644 index 000000000000..9b25575b980c --- /dev/null +++ b/tools/perf/util/annotate.c @@ -0,0 +1,467 @@ +/* + * Copyright (C) 2011, Red Hat Inc, Arnaldo Carvalho de Melo + * + * Parts came from builtin-annotate.c, see those files for further + * copyright notes. + * + * Released under the GPL v2. 
(and only v2, not any later version) + */ + +#include "util.h" +#include "build-id.h" +#include "color.h" +#include "cache.h" +#include "symbol.h" +#include "debug.h" +#include "annotate.h" + +static int symbol__alloc_hist(struct symbol *sym) +{ + struct annotation *notes = symbol__annotation(sym); + const int size = (sizeof(*notes->histogram) + + (sym->end - sym->start) * sizeof(u64)); + + notes->histogram = zalloc(size); + return notes->histogram == NULL ? -1 : 0; +} + +int symbol__inc_addr_samples(struct symbol *sym, struct map *map, u64 addr) +{ + unsigned int sym_size, offset; + struct annotation *notes; + struct sym_hist *h; + + if (!sym || !map) + return 0; + + notes = symbol__annotation(sym); + if (notes->histogram == NULL && symbol__alloc_hist(sym) < 0) + return -ENOMEM; + + sym_size = sym->end - sym->start; + offset = addr - sym->start; + + pr_debug3("%s: addr=%#" PRIx64 "\n", __func__, map->unmap_ip(map, addr)); + + if (offset >= sym_size) + return 0; + + h = notes->histogram; + h->sum++; + h->addr[offset]++; + + pr_debug3("%#" PRIx64 " %s: period++ [addr: %#" PRIx64 ", %#" PRIx64 + "] => %" PRIu64 "\n", sym->start, sym->name, + addr, addr - sym->start, h->addr[offset]); + return 0; +} + +static struct objdump_line *objdump_line__new(s64 offset, char *line, size_t privsize) +{ + struct objdump_line *self = malloc(sizeof(*self) + privsize); + + if (self != NULL) { + self->offset = offset; + self->line = line; + } + + return self; +} + +void objdump_line__free(struct objdump_line *self) +{ + free(self->line); + free(self); +} + +static void objdump__add_line(struct list_head *head, struct objdump_line *line) +{ + list_add_tail(&line->node, head); +} + +struct objdump_line *objdump__get_next_ip_line(struct list_head *head, + struct objdump_line *pos) +{ + list_for_each_entry_continue(pos, head, node) + if (pos->offset >= 0) + return pos; + + return NULL; +} + +static void objdump_line__print(struct objdump_line *oline, + struct list_head *head, + struct symbol *sym, u64 len) +{ + static const char *prev_line; + static const char *prev_color; + + if (oline->offset != -1) { + const char *path = NULL; + unsigned int hits = 0; + double percent = 0.0; + const char *color; + struct annotation *notes = symbol__annotation(sym); + struct source_line *src_line = notes->src_line; + struct sym_hist *h = notes->histogram; + s64 offset = oline->offset; + struct objdump_line *next = objdump__get_next_ip_line(head, oline); + + while (offset < (s64)len && + (next == NULL || offset < next->offset)) { + if (src_line) { + if (path == NULL) + path = src_line[offset].path; + percent += src_line[offset].percent; + } else + hits += h->addr[offset]; + + ++offset; + } + + if (src_line == NULL && h->sum) + percent = 100.0 * hits / h->sum; + + color = get_percent_color(percent); + + /* + * Also color the filename and line if needed, with + * the same color than the percentage. 
Don't print it + * twice for close colored addr with the same filename:line + */ + if (path) { + if (!prev_line || strcmp(prev_line, path) + || color != prev_color) { + color_fprintf(stdout, color, " %s", path); + prev_line = path; + prev_color = color; + } + } + + color_fprintf(stdout, color, " %7.2f", percent); + printf(" : "); + color_fprintf(stdout, PERF_COLOR_BLUE, "%s\n", oline->line); + } else { + if (!*oline->line) + printf(" :\n"); + else + printf(" : %s\n", oline->line); + } +} + +static int symbol__parse_objdump_line(struct symbol *sym, struct map *map, FILE *file, + struct list_head *head, size_t privsize) +{ + struct objdump_line *objdump_line; + char *line = NULL, *tmp, *tmp2, *c; + size_t line_len; + s64 line_ip, offset = -1; + + if (getline(&line, &line_len, file) < 0) + return -1; + + if (!line) + return -1; + + while (line_len != 0 && isspace(line[line_len - 1])) + line[--line_len] = '\0'; + + c = strchr(line, '\n'); + if (c) + *c = 0; + + line_ip = -1; + + /* + * Strip leading spaces: + */ + tmp = line; + while (*tmp) { + if (*tmp != ' ') + break; + tmp++; + } + + if (*tmp) { + /* + * Parse hexa addresses followed by ':' + */ + line_ip = strtoull(tmp, &tmp2, 16); + if (*tmp2 != ':' || tmp == tmp2 || tmp2[1] == '\0') + line_ip = -1; + } + + if (line_ip != -1) { + u64 start = map__rip_2objdump(map, sym->start), + end = map__rip_2objdump(map, sym->end); + + offset = line_ip - start; + if (offset < 0 || (u64)line_ip > end) + offset = -1; + } + + objdump_line = objdump_line__new(offset, line, privsize); + if (objdump_line == NULL) { + free(line); + return -1; + } + objdump__add_line(head, objdump_line); + + return 0; +} + +int symbol__annotate(struct symbol *sym, struct map *map, + struct list_head *head, size_t privsize) +{ + struct dso *dso = map->dso; + char *filename = dso__build_id_filename(dso, NULL, 0); + bool free_filename = true; + char command[PATH_MAX * 2]; + FILE *file; + int err = 0; + u64 len; + char symfs_filename[PATH_MAX]; + + if (filename) { + snprintf(symfs_filename, sizeof(symfs_filename), "%s%s", + symbol_conf.symfs, filename); + } + + if (filename == NULL) { + if (dso->has_build_id) { + pr_err("Can't annotate %s: not enough memory\n", + sym->name); + return -ENOMEM; + } + goto fallback; + } else if (readlink(symfs_filename, command, sizeof(command)) < 0 || + strstr(command, "[kernel.kallsyms]") || + access(symfs_filename, R_OK)) { + free(filename); +fallback: + /* + * If we don't have build-ids or the build-id file isn't in the + * cache, or is just a kallsyms file, well, lets hope that this + * DSO is the same as when 'perf record' ran. 
+ */ + filename = dso->long_name; + snprintf(symfs_filename, sizeof(symfs_filename), "%s%s", + symbol_conf.symfs, filename); + free_filename = false; + } + + if (dso->origin == DSO__ORIG_KERNEL) { + if (dso->annotate_warned) + goto out_free_filename; + err = -ENOENT; + dso->annotate_warned = 1; + pr_err("Can't annotate %s: No vmlinux file was found in the " + "path\n", sym->name); + goto out_free_filename; + } + + pr_debug("%s: filename=%s, sym=%s, start=%#" PRIx64 ", end=%#" PRIx64 "\n", __func__, + filename, sym->name, map->unmap_ip(map, sym->start), + map->unmap_ip(map, sym->end)); + + len = sym->end - sym->start; + + pr_debug("annotating [%p] %30s : [%p] %30s\n", + dso, dso->long_name, sym, sym->name); + + snprintf(command, sizeof(command), + "objdump --start-address=0x%016" PRIx64 + " --stop-address=0x%016" PRIx64 " -dS -C %s|grep -v %s|expand", + map__rip_2objdump(map, sym->start), + map__rip_2objdump(map, sym->end), + symfs_filename, filename); + + pr_debug("Executing: %s\n", command); + + file = popen(command, "r"); + if (!file) + goto out_free_filename; + + while (!feof(file)) + if (symbol__parse_objdump_line(sym, map, file, head, privsize) < 0) + break; + + pclose(file); +out_free_filename: + if (free_filename) + free(filename); + return err; +} + +static void insert_source_line(struct rb_root *root, struct source_line *src_line) +{ + struct source_line *iter; + struct rb_node **p = &root->rb_node; + struct rb_node *parent = NULL; + + while (*p != NULL) { + parent = *p; + iter = rb_entry(parent, struct source_line, node); + + if (src_line->percent > iter->percent) + p = &(*p)->rb_left; + else + p = &(*p)->rb_right; + } + + rb_link_node(&src_line->node, parent, p); + rb_insert_color(&src_line->node, root); +} + +static void symbol__free_source_line(struct symbol *sym, int len) +{ + struct annotation *notes = symbol__annotation(sym); + struct source_line *src_line = notes->src_line; + int i; + + for (i = 0; i < len; i++) + free(src_line[i].path); + + free(src_line); + notes->src_line = NULL; +} + +/* Get the filename:line for the colored entries */ +static int symbol__get_source_line(struct symbol *sym, struct map *map, + struct rb_root *root, int len, + const char *filename) +{ + u64 start; + int i; + char cmd[PATH_MAX * 2]; + struct source_line *src_line; + struct annotation *notes = symbol__annotation(sym); + struct sym_hist *h = notes->histogram; + + if (!h->sum) + return 0; + + src_line = notes->src_line = calloc(len, sizeof(struct source_line)); + if (!notes->src_line) + return -1; + + start = map->unmap_ip(map, sym->start); + + for (i = 0; i < len; i++) { + char *path = NULL; + size_t line_len; + u64 offset; + FILE *fp; + + src_line[i].percent = 100.0 * h->addr[i] / h->sum; + if (src_line[i].percent <= 0.5) + continue; + + offset = start + i; + sprintf(cmd, "addr2line -e %s %016" PRIx64, filename, offset); + fp = popen(cmd, "r"); + if (!fp) + continue; + + if (getline(&path, &line_len, fp) < 0 || !line_len) + goto next; + + src_line[i].path = malloc(sizeof(char) * line_len + 1); + if (!src_line[i].path) + goto next; + + strcpy(src_line[i].path, path); + insert_source_line(root, &src_line[i]); + + next: + pclose(fp); + } + + return 0; +} + +static void print_summary(struct rb_root *root, const char *filename) +{ + struct source_line *src_line; + struct rb_node *node; + + printf("\nSorted summary for file %s\n", filename); + printf("----------------------------------------------\n\n"); + + if (RB_EMPTY_ROOT(root)) { + printf(" Nothing higher than %1.1f%%\n", MIN_GREEN); + 
return; + } + + node = rb_first(root); + while (node) { + double percent; + const char *color; + char *path; + + src_line = rb_entry(node, struct source_line, node); + percent = src_line->percent; + color = get_percent_color(percent); + path = src_line->path; + + color_fprintf(stdout, color, " %7.2f %s", percent, path); + node = rb_next(node); + } +} + +static void symbol__annotate_hits(struct symbol *sym) +{ + struct annotation *notes = symbol__annotation(sym); + struct sym_hist *h = notes->histogram; + u64 len = sym->end - sym->start, offset; + + for (offset = 0; offset < len; ++offset) + if (h->addr[offset] != 0) + printf("%*" PRIx64 ": %" PRIu64 "\n", BITS_PER_LONG / 2, + sym->start + offset, h->addr[offset]); + printf("%*s: %" PRIu64 "\n", BITS_PER_LONG / 2, "h->sum", h->sum); +} + +int symbol__tty_annotate(struct symbol *sym, struct map *map, bool print_lines, + bool full_paths) +{ + struct dso *dso = map->dso; + const char *filename = dso->long_name, *d_filename; + struct rb_root source_line = RB_ROOT; + struct objdump_line *pos, *n; + LIST_HEAD(head); + u64 len; + + if (symbol__annotate(sym, map, &head, 0) < 0) + return -1; + + if (full_paths) + d_filename = filename; + else + d_filename = basename(filename); + + len = sym->end - sym->start; + + if (print_lines) { + symbol__get_source_line(sym, map, &source_line, len, filename); + print_summary(&source_line, filename); + } + + printf("\n\n------------------------------------------------\n"); + printf(" Percent | Source code & Disassembly of %s\n", d_filename); + printf("------------------------------------------------\n"); + + if (verbose) + symbol__annotate_hits(sym); + + list_for_each_entry_safe(pos, n, &head, node) { + objdump_line__print(pos, &head, sym, len); + list_del(&pos->node); + objdump_line__free(pos); + } + + if (print_lines) + symbol__free_source_line(sym, len); + + return 0; +} diff --git a/tools/perf/util/annotate.h b/tools/perf/util/annotate.h new file mode 100644 index 000000000000..6e2fbc205299 --- /dev/null +++ b/tools/perf/util/annotate.h @@ -0,0 +1,65 @@ +#ifndef __PERF_ANNOTATE_H +#define __PERF_ANNOTATE_H + +#include +#include "types.h" +#include "symbol.h" +#include +#include + +struct objdump_line { + struct list_head node; + s64 offset; + char *line; +}; + +void objdump_line__free(struct objdump_line *self); +struct objdump_line *objdump__get_next_ip_line(struct list_head *head, + struct objdump_line *pos); + +struct sym_hist { + u64 sum; + u64 addr[0]; +}; + +struct source_line { + struct rb_node node; + double percent; + char *path; +}; + +struct annotation { + struct sym_hist *histogram; + struct source_line *src_line; +}; + +struct sannotation { + struct annotation annotation; + struct symbol symbol; +}; + +static inline struct annotation *symbol__annotation(struct symbol *sym) +{ + struct sannotation *a = container_of(sym, struct sannotation, symbol); + return &a->annotation; +} + +int symbol__inc_addr_samples(struct symbol *sym, struct map *map, u64 addr); + +int symbol__annotate(struct symbol *sym, struct map *map, + struct list_head *head, size_t privsize); + +int symbol__tty_annotate(struct symbol *sym, struct map *map, + bool print_lines, bool full_paths); + +#ifdef NO_NEWT_SUPPORT +static inline int symbol__tui_annotate(symbol *sym __used, + struct map *map __used) +{ + return 0; +} +#else +int symbol__tui_annotate(struct symbol *sym, struct map *map); +#endif + +#endif /* __PERF_ANNOTATE_H */ diff --git a/tools/perf/util/hist.c b/tools/perf/util/hist.c index 95887804dc8e..6d9c92c3d7cb 100644 
--- a/tools/perf/util/hist.c +++ b/tools/perf/util/hist.c @@ -1,3 +1,4 @@ +#include "annotate.h" #include "util.h" #include "build-id.h" #include "hist.h" @@ -949,225 +950,15 @@ void hists__filter_by_thread(struct hists *self, const struct thread *thread) } } -static int symbol__alloc_hist(struct symbol *self) +int hist_entry__inc_addr_samples(struct hist_entry *he, u64 ip) { - struct sym_priv *priv = symbol__priv(self); - const int size = (sizeof(*priv->hist) + - (self->end - self->start) * sizeof(u64)); - - priv->hist = zalloc(size); - return priv->hist == NULL ? -1 : 0; + return symbol__inc_addr_samples(he->ms.sym, he->ms.map, ip); } -int hist_entry__inc_addr_samples(struct hist_entry *self, u64 ip) -{ - unsigned int sym_size, offset; - struct symbol *sym = self->ms.sym; - struct sym_priv *priv; - struct sym_hist *h; - - if (!sym || !self->ms.map) - return 0; - - priv = symbol__priv(sym); - if (priv->hist == NULL && symbol__alloc_hist(sym) < 0) - return -ENOMEM; - - sym_size = sym->end - sym->start; - offset = ip - sym->start; - - pr_debug3("%s: ip=%#" PRIx64 "\n", __func__, self->ms.map->unmap_ip(self->ms.map, ip)); - - if (offset >= sym_size) - return 0; - - h = priv->hist; - h->sum++; - h->ip[offset]++; - - pr_debug3("%#" PRIx64 " %s: period++ [ip: %#" PRIx64 ", %#" PRIx64 - "] => %" PRIu64 "\n", self->ms.sym->start, self->ms.sym->name, - ip, ip - self->ms.sym->start, h->ip[offset]); - return 0; -} - -static struct objdump_line *objdump_line__new(s64 offset, char *line, size_t privsize) -{ - struct objdump_line *self = malloc(sizeof(*self) + privsize); - - if (self != NULL) { - self->offset = offset; - self->line = line; - } - - return self; -} - -void objdump_line__free(struct objdump_line *self) -{ - free(self->line); - free(self); -} - -static void objdump__add_line(struct list_head *head, struct objdump_line *line) -{ - list_add_tail(&line->node, head); -} - -struct objdump_line *objdump__get_next_ip_line(struct list_head *head, - struct objdump_line *pos) -{ - list_for_each_entry_continue(pos, head, node) - if (pos->offset >= 0) - return pos; - - return NULL; -} - -static int hist_entry__parse_objdump_line(struct hist_entry *self, FILE *file, - struct list_head *head, size_t privsize) -{ - struct symbol *sym = self->ms.sym; - struct objdump_line *objdump_line; - char *line = NULL, *tmp, *tmp2, *c; - size_t line_len; - s64 line_ip, offset = -1; - - if (getline(&line, &line_len, file) < 0) - return -1; - - if (!line) - return -1; - - while (line_len != 0 && isspace(line[line_len - 1])) - line[--line_len] = '\0'; - - c = strchr(line, '\n'); - if (c) - *c = 0; - - line_ip = -1; - - /* - * Strip leading spaces: - */ - tmp = line; - while (*tmp) { - if (*tmp != ' ') - break; - tmp++; - } - - if (*tmp) { - /* - * Parse hexa addresses followed by ':' - */ - line_ip = strtoull(tmp, &tmp2, 16); - if (*tmp2 != ':' || tmp == tmp2 || tmp2[1] == '\0') - line_ip = -1; - } - - if (line_ip != -1) { - u64 start = map__rip_2objdump(self->ms.map, sym->start), - end = map__rip_2objdump(self->ms.map, sym->end); - - offset = line_ip - start; - if (offset < 0 || (u64)line_ip > end) - offset = -1; - } - - objdump_line = objdump_line__new(offset, line, privsize); - if (objdump_line == NULL) { - free(line); - return -1; - } - objdump__add_line(head, objdump_line); - - return 0; -} - -int hist_entry__annotate(struct hist_entry *self, struct list_head *head, +int hist_entry__annotate(struct hist_entry *he, struct list_head *head, size_t privsize) { - struct symbol *sym = self->ms.sym; - struct map *map = 
self->ms.map; - struct dso *dso = map->dso; - char *filename = dso__build_id_filename(dso, NULL, 0); - bool free_filename = true; - char command[PATH_MAX * 2]; - FILE *file; - int err = 0; - u64 len; - char symfs_filename[PATH_MAX]; - - if (filename) { - snprintf(symfs_filename, sizeof(symfs_filename), "%s%s", - symbol_conf.symfs, filename); - } - - if (filename == NULL) { - if (dso->has_build_id) { - pr_err("Can't annotate %s: not enough memory\n", - sym->name); - return -ENOMEM; - } - goto fallback; - } else if (readlink(symfs_filename, command, sizeof(command)) < 0 || - strstr(command, "[kernel.kallsyms]") || - access(symfs_filename, R_OK)) { - free(filename); -fallback: - /* - * If we don't have build-ids or the build-id file isn't in the - * cache, or is just a kallsyms file, well, lets hope that this - * DSO is the same as when 'perf record' ran. - */ - filename = dso->long_name; - snprintf(symfs_filename, sizeof(symfs_filename), "%s%s", - symbol_conf.symfs, filename); - free_filename = false; - } - - if (dso->origin == DSO__ORIG_KERNEL) { - if (dso->annotate_warned) - goto out_free_filename; - err = -ENOENT; - dso->annotate_warned = 1; - pr_err("Can't annotate %s: No vmlinux file was found in the " - "path\n", sym->name); - goto out_free_filename; - } - - pr_debug("%s: filename=%s, sym=%s, start=%#" PRIx64 ", end=%#" PRIx64 "\n", __func__, - filename, sym->name, map->unmap_ip(map, sym->start), - map->unmap_ip(map, sym->end)); - - len = sym->end - sym->start; - - pr_debug("annotating [%p] %30s : [%p] %30s\n", - dso, dso->long_name, sym, sym->name); - - snprintf(command, sizeof(command), - "objdump --start-address=0x%016" PRIx64 " --stop-address=0x%016" PRIx64 " -dS -C %s|grep -v %s|expand", - map__rip_2objdump(map, sym->start), - map__rip_2objdump(map, sym->end), - symfs_filename, filename); - - pr_debug("Executing: %s\n", command); - - file = popen(command, "r"); - if (!file) - goto out_free_filename; - - while (!feof(file)) - if (hist_entry__parse_objdump_line(self, file, head, privsize) < 0) - break; - - pclose(file); -out_free_filename: - if (free_filename) - free(filename); - return err; + return symbol__annotate(he->ms.sym, he->ms.map, head, privsize); } void hists__inc_nr_events(struct hists *self, u32 type) diff --git a/tools/perf/util/hist.h b/tools/perf/util/hist.h index 889559b86492..8a201f755534 100644 --- a/tools/perf/util/hist.h +++ b/tools/perf/util/hist.h @@ -9,33 +9,6 @@ extern struct callchain_param callchain_param; struct hist_entry; struct addr_location; struct symbol; -struct rb_root; - -struct objdump_line { - struct list_head node; - s64 offset; - char *line; -}; - -void objdump_line__free(struct objdump_line *self); -struct objdump_line *objdump__get_next_ip_line(struct list_head *head, - struct objdump_line *pos); - -struct sym_hist { - u64 sum; - u64 ip[0]; -}; - -struct sym_ext { - struct rb_node node; - double percent; - char *path; -}; - -struct sym_priv { - struct sym_hist *hist; - struct sym_ext *ext; -}; /* * The kernel collects the number of events it couldn't send in a stretch and diff --git a/tools/perf/util/ui/browsers/annotate.c b/tools/perf/util/ui/browsers/annotate.c index 82b78f99251b..daa7138d8015 100644 --- a/tools/perf/util/ui/browsers/annotate.c +++ b/tools/perf/util/ui/browsers/annotate.c @@ -1,9 +1,11 @@ #include "../browser.h" #include "../helpline.h" #include "../libslang.h" +#include "../../annotate.h" #include "../../hist.h" #include "../../sort.h" #include "../../symbol.h" +#include "../../annotate.h" static void 
ui__error_window(const char *fmt, ...)
{
@@ -66,24 +68,26 @@ static double objdump_line__calc_percent(struct objdump_line *self,
 if (self->offset != -1) {
 int len = sym->end - sym->start;
 unsigned int hits = 0;
- struct sym_priv *priv = symbol__priv(sym);
- struct sym_ext *sym_ext = priv->ext;
- struct sym_hist *h = priv->hist;
+ struct annotation *notes = symbol__annotation(sym);
+ struct source_line *src_line = notes->src_line;
+ struct sym_hist *h = notes->histogram;
 s64 offset = self->offset;
 struct objdump_line *next = objdump__get_next_ip_line(head, self);
-
 while (offset < (s64)len && (next == NULL || offset < next->offset)) {
- if (sym_ext) {
- percent += sym_ext[offset].percent;
+ if (src_line) {
+ percent += src_line[offset].percent;
 } else
- hits += h->ip[offset];
+ hits += h->addr[offset];

 ++offset;
 }
-
- if (sym_ext == NULL && h->sum)
+ /*
+ * If the percentage wasn't already calculated in
+ * symbol__get_source_line, do it now:
+ */
+ if (src_line == NULL && h->sum)
 percent = 100.0 * hits / h->sum;
 }
@@ -136,10 +140,10 @@ static void annotate_browser__set_top(struct annotate_browser *self,
 static int annotate_browser__run(struct annotate_browser *self)
 {
 struct rb_node *nd;
- struct hist_entry *he = self->b.priv;
+ struct symbol *sym = self->b.priv;
 int key;

- if (ui_browser__show(&self->b, he->ms.sym->name,
+ if (ui_browser__show(&self->b, sym->name,
 "<-, -> or ESC: exit, TAB/shift+TAB: cycle thru samples") < 0)
 return -1;
 /*
@@ -179,7 +183,12 @@ out:
 return key;
 }

-int hist_entry__tui_annotate(struct hist_entry *self)
+int hist_entry__tui_annotate(struct hist_entry *he)
+{
+ return symbol__tui_annotate(he->ms.sym, he->ms.map);
+}
+
+int symbol__tui_annotate(struct symbol *sym, struct map *map)
 {
 struct objdump_line *pos, *n;
 struct objdump_line_rb_node *rbpos;
@@ -190,18 +199,18 @@ int hist_entry__tui_annotate(struct hist_entry *self)
 .refresh = ui_browser__list_head_refresh,
 .seek = ui_browser__list_head_seek,
 .write = annotate_browser__write,
- .priv = self,
+ .priv = sym,
 },
 };
 int ret;

- if (self->ms.sym == NULL)
+ if (sym == NULL)
 return -1;

- if (self->ms.map->dso->annotate_warned)
+ if (map->dso->annotate_warned)
 return -1;

- if (hist_entry__annotate(self, &head, sizeof(*rbpos)) < 0) {
+ if (symbol__annotate(sym, map, &head, sizeof(*rbpos)) < 0) {
 ui__error_window(ui_helpline__last_msg);
 return -1;
 }
@@ -214,7 +223,7 @@ int hist_entry__tui_annotate(struct hist_entry *self)
 browser.b.width = line_len;
 rbpos = objdump_line__rb(pos);
 rbpos->idx = browser.b.nr_entries++;
- rbpos->percent = objdump_line__calc_percent(pos, &head, self->ms.sym);
+ rbpos->percent = objdump_line__calc_percent(pos, &head, sym);
 if (rbpos->percent < 0.01)
 continue;
 objdump__insert_line(&browser.entries, rbpos);
From 2f525d0148ef2734c8a172201e5e1e9167a8a5fd Mon Sep 17 00:00:00 2001
From: Arnaldo Carvalho de Melo
Date: Fri, 4 Feb 2011 13:43:24 -0200
Subject: [PATCH 172/706] perf annotate: Support multiple histograms in annotation

The perf annotate tool continues aggregating everything into just one
histogram, but to support the top model, add support for one histogram
per perf evsel in the evlist. 
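To make the new calling convention concrete, here is a minimal usage sketch, not part of the patch itself, of how a tool monitoring several events drives the per-event histograms. The helper name record_sample and its arguments are illustrative; the three called functions are the ones added or extended below:

/*
 * Usage sketch, assuming the declarations from tools/perf/util/annotate.h
 * as changed by this patch; nr_events would come from the tool's evlist.
 */
static int record_sample(struct symbol *sym, struct map *map,
			 int evidx, int nr_events, u64 addr)
{
	struct annotation *notes = symbol__annotation(sym);

	/* Lazily allocate one sym_hist per event being monitored. */
	if (notes->histograms == NULL &&
	    symbol__alloc_hist(sym, nr_events) < 0)
		return -ENOMEM;

	/* Account this sample to the histogram of event 'evidx'. */
	return symbol__inc_addr_samples(sym, map, evidx, addr);
}

This mirrors what the builtin-annotate.c and builtin-report.c hunks below do: allocate lazily on the first sample for a symbol, then account each sample to the histogram of the event that generated it.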
Cc: Frederic Weisbecker Cc: Ingo Molnar Cc: Mike Galbraith Cc: Paul Mackerras Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Tom Zanussi LKML-Reference: Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-annotate.c | 29 +++++++++---- tools/perf/builtin-report.c | 15 +++++-- tools/perf/util/annotate.c | 57 ++++++++++++-------------- tools/perf/util/annotate.h | 29 ++++++++++--- tools/perf/util/hist.c | 4 +- tools/perf/util/hist.h | 16 ++++---- tools/perf/util/ui/browsers/annotate.c | 12 +++--- tools/perf/util/ui/browsers/hists.c | 9 ++-- 8 files changed, 106 insertions(+), 65 deletions(-) diff --git a/tools/perf/builtin-annotate.c b/tools/perf/builtin-annotate.c index 9072ef44cfcb..f3e44231b10d 100644 --- a/tools/perf/builtin-annotate.c +++ b/tools/perf/builtin-annotate.c @@ -57,7 +57,18 @@ static int hists__add_entry(struct hists *self, struct addr_location *al) if (he == NULL) return -ENOMEM; - return hist_entry__inc_addr_samples(he, al->addr); + if (he->ms.sym != NULL) { + /* + * All aggregated on the first sym_hist. + */ + struct annotation *notes = symbol__annotation(he->ms.sym); + if (notes->histograms == NULL && symbol__alloc_hist(he->ms.sym, 1) < 0) + return -ENOMEM; + + return hist_entry__inc_addr_samples(he, 0, al->addr); + } + + return 0; } static int process_sample_event(union perf_event *event, @@ -81,9 +92,9 @@ static int process_sample_event(union perf_event *event, return 0; } -static int hist_entry__tty_annotate(struct hist_entry *he) +static int hist_entry__tty_annotate(struct hist_entry *he, int evidx) { - return symbol__tty_annotate(he->ms.sym, he->ms.map, + return symbol__tty_annotate(he->ms.sym, he->ms.map, evidx, print_line, full_paths); } @@ -100,7 +111,7 @@ static void hists__find_annotations(struct hists *self) goto find_next; notes = symbol__annotation(he->ms.sym); - if (notes->histogram == NULL) { + if (notes->histograms == NULL) { find_next: if (key == KEY_LEFT) nd = rb_prev(nd); @@ -110,7 +121,8 @@ find_next: } if (use_browser > 0) { - key = hist_entry__tui_annotate(he); + /* For now all is aggregated on the first */ + key = hist_entry__tui_annotate(he, 0); switch (key) { case KEY_RIGHT: next = rb_next(nd); @@ -125,15 +137,16 @@ find_next: if (next != NULL) nd = next; } else { - hist_entry__tty_annotate(he); + /* For now all is aggregated on the first */ + hist_entry__tty_annotate(he, 0); nd = rb_next(nd); /* * Since we have a hist_entry per IP for the same * symbol, free he->ms.sym->histogram to signal we already * processed this symbol. */ - free(notes->histogram); - notes->histogram = NULL; + free(notes->histograms); + notes->histograms = NULL; } } } diff --git a/tools/perf/builtin-report.c b/tools/perf/builtin-report.c index 91e4cdba933b..de06bf55efff 100644 --- a/tools/perf/builtin-report.c +++ b/tools/perf/builtin-report.c @@ -118,8 +118,17 @@ static int perf_session__add_hist_entry(struct perf_session *session, * so we don't allocated the extra space needed because the stdio * code will not use it. */ - if (use_browser > 0) - err = hist_entry__inc_addr_samples(he, al->addr); + if (al->sym != NULL && use_browser > 0) { + /* + * All aggregated on the first sym_hist. 
+ */ + struct annotation *notes = symbol__annotation(he->ms.sym); + if (notes->histograms == NULL && + symbol__alloc_hist(he->ms.sym, 1) < 0) + err = -ENOMEM; + else + err = hist_entry__inc_addr_samples(he, 0, al->addr); + } return err; } @@ -349,7 +358,7 @@ static int __cmd_report(void) } if (use_browser > 0) - hists__tui_browse_tree(&session->hists_tree, help); + hists__tui_browse_tree(&session->hists_tree, help, 0); else hists__tty_browse_tree(&session->hists_tree, help); diff --git a/tools/perf/util/annotate.c b/tools/perf/util/annotate.c index 9b25575b980c..7488fe99502c 100644 --- a/tools/perf/util/annotate.c +++ b/tools/perf/util/annotate.c @@ -15,44 +15,40 @@ #include "debug.h" #include "annotate.h" -static int symbol__alloc_hist(struct symbol *sym) +int symbol__alloc_hist(struct symbol *sym, int nevents) { struct annotation *notes = symbol__annotation(sym); - const int size = (sizeof(*notes->histogram) + - (sym->end - sym->start) * sizeof(u64)); - notes->histogram = zalloc(size); - return notes->histogram == NULL ? -1 : 0; + notes->sizeof_sym_hist = (sizeof(*notes->histograms) + + (sym->end - sym->start) * sizeof(u64)); + notes->histograms = calloc(nevents, notes->sizeof_sym_hist); + return notes->histograms == NULL ? -1 : 0; } -int symbol__inc_addr_samples(struct symbol *sym, struct map *map, u64 addr) +int symbol__inc_addr_samples(struct symbol *sym, struct map *map, + int evidx, u64 addr) { - unsigned int sym_size, offset; + unsigned offset; struct annotation *notes; struct sym_hist *h; - if (!sym || !map) - return 0; - notes = symbol__annotation(sym); - if (notes->histogram == NULL && symbol__alloc_hist(sym) < 0) + if (notes->histograms == NULL) return -ENOMEM; - sym_size = sym->end - sym->start; - offset = addr - sym->start; - pr_debug3("%s: addr=%#" PRIx64 "\n", __func__, map->unmap_ip(map, addr)); - if (offset >= sym_size) + if (addr >= sym->end) return 0; - h = notes->histogram; + offset = addr - sym->start; + h = annotation__histogram(notes, evidx); h->sum++; h->addr[offset]++; pr_debug3("%#" PRIx64 " %s: period++ [addr: %#" PRIx64 ", %#" PRIx64 - "] => %" PRIu64 "\n", sym->start, sym->name, - addr, addr - sym->start, h->addr[offset]); + ", evidx=%d] => %" PRIu64 "\n", sym->start, sym->name, + addr, addr - sym->start, evidx, h->addr[offset]); return 0; } @@ -90,8 +86,8 @@ struct objdump_line *objdump__get_next_ip_line(struct list_head *head, } static void objdump_line__print(struct objdump_line *oline, - struct list_head *head, - struct symbol *sym, u64 len) + struct list_head *head, struct symbol *sym, + int evidx, u64 len) { static const char *prev_line; static const char *prev_color; @@ -103,7 +99,7 @@ static void objdump_line__print(struct objdump_line *oline, const char *color; struct annotation *notes = symbol__annotation(sym); struct source_line *src_line = notes->src_line; - struct sym_hist *h = notes->histogram; + struct sym_hist *h = annotation__histogram(notes, evidx); s64 offset = oline->offset; struct objdump_line *next = objdump__get_next_ip_line(head, oline); @@ -328,7 +324,7 @@ static void symbol__free_source_line(struct symbol *sym, int len) /* Get the filename:line for the colored entries */ static int symbol__get_source_line(struct symbol *sym, struct map *map, - struct rb_root *root, int len, + int evidx, struct rb_root *root, int len, const char *filename) { u64 start; @@ -336,7 +332,7 @@ static int symbol__get_source_line(struct symbol *sym, struct map *map, char cmd[PATH_MAX * 2]; struct source_line *src_line; struct annotation *notes = 
symbol__annotation(sym); - struct sym_hist *h = notes->histogram; + struct sym_hist *h = annotation__histogram(notes, evidx); if (!h->sum) return 0; @@ -409,10 +405,10 @@ static void print_summary(struct rb_root *root, const char *filename) } } -static void symbol__annotate_hits(struct symbol *sym) +static void symbol__annotate_hits(struct symbol *sym, int evidx) { struct annotation *notes = symbol__annotation(sym); - struct sym_hist *h = notes->histogram; + struct sym_hist *h = annotation__histogram(notes, evidx); u64 len = sym->end - sym->start, offset; for (offset = 0; offset < len; ++offset) @@ -422,8 +418,8 @@ static void symbol__annotate_hits(struct symbol *sym) printf("%*s: %" PRIu64 "\n", BITS_PER_LONG / 2, "h->sum", h->sum); } -int symbol__tty_annotate(struct symbol *sym, struct map *map, bool print_lines, - bool full_paths) +int symbol__tty_annotate(struct symbol *sym, struct map *map, int evidx, + bool print_lines, bool full_paths) { struct dso *dso = map->dso; const char *filename = dso->long_name, *d_filename; @@ -443,7 +439,8 @@ int symbol__tty_annotate(struct symbol *sym, struct map *map, bool print_lines, len = sym->end - sym->start; if (print_lines) { - symbol__get_source_line(sym, map, &source_line, len, filename); + symbol__get_source_line(sym, map, evidx, &source_line, + len, filename); print_summary(&source_line, filename); } @@ -452,10 +449,10 @@ int symbol__tty_annotate(struct symbol *sym, struct map *map, bool print_lines, printf("------------------------------------------------\n"); if (verbose) - symbol__annotate_hits(sym); + symbol__annotate_hits(sym, evidx); list_for_each_entry_safe(pos, n, &head, node) { - objdump_line__print(pos, &head, sym, len); + objdump_line__print(pos, &head, sym, evidx, len); list_del(&pos->node); objdump_line__free(pos); } diff --git a/tools/perf/util/annotate.h b/tools/perf/util/annotate.h index 6e2fbc205299..0a5069ca6dd7 100644 --- a/tools/perf/util/annotate.h +++ b/tools/perf/util/annotate.h @@ -28,9 +28,21 @@ struct source_line { char *path; }; +/** struct annotation - symbols with hits have this attached as in sannotation + * + * @histogram: Array of addr hit histograms per event being monitored + * @src_line: If 'print_lines' is specified, per source code line percentages + * + * src_line is allocated, percentages calculated and all sorted by percentage + * when the annotation is about to be presented, so the percentages are for + * one of the entries in the histogram array, i.e. for the event/counter being + * presented. It is deallocated right after symbol__{tui,tty,etc}_annotate + * returns. 
+ */ struct annotation { - struct sym_hist *histogram; struct source_line *src_line; + struct sym_hist *histograms; + int sizeof_sym_hist; }; struct sannotation { @@ -38,28 +50,35 @@ struct sannotation { struct symbol symbol; }; +static inline struct sym_hist *annotation__histogram(struct annotation *notes, int idx) +{ + return ((void *)notes->histograms) + (notes->sizeof_sym_hist * idx); +} + static inline struct annotation *symbol__annotation(struct symbol *sym) { struct sannotation *a = container_of(sym, struct sannotation, symbol); return &a->annotation; } -int symbol__inc_addr_samples(struct symbol *sym, struct map *map, u64 addr); +int symbol__inc_addr_samples(struct symbol *sym, struct map *map, + int evidx, u64 addr); +int symbol__alloc_hist(struct symbol *sym, int nevents); int symbol__annotate(struct symbol *sym, struct map *map, struct list_head *head, size_t privsize); -int symbol__tty_annotate(struct symbol *sym, struct map *map, +int symbol__tty_annotate(struct symbol *sym, struct map *map, int evidx, bool print_lines, bool full_paths); #ifdef NO_NEWT_SUPPORT static inline int symbol__tui_annotate(symbol *sym __used, - struct map *map __used) + struct map *map __used, int evidx __used) { return 0; } #else -int symbol__tui_annotate(struct symbol *sym, struct map *map); +int symbol__tui_annotate(struct symbol *sym, struct map *map, int evidx); #endif #endif /* __PERF_ANNOTATE_H */ diff --git a/tools/perf/util/hist.c b/tools/perf/util/hist.c index 6d9c92c3d7cb..bac5ab684967 100644 --- a/tools/perf/util/hist.c +++ b/tools/perf/util/hist.c @@ -950,9 +950,9 @@ void hists__filter_by_thread(struct hists *self, const struct thread *thread) } } -int hist_entry__inc_addr_samples(struct hist_entry *he, u64 ip) +int hist_entry__inc_addr_samples(struct hist_entry *he, int evidx, u64 ip) { - return symbol__inc_addr_samples(he->ms.sym, he->ms.map, ip); + return symbol__inc_addr_samples(he->ms.sym, he->ms.map, evidx, ip); } int hist_entry__annotate(struct hist_entry *he, struct list_head *head, diff --git a/tools/perf/util/hist.h b/tools/perf/util/hist.h index 8a201f755534..2c6cdae6a764 100644 --- a/tools/perf/util/hist.h +++ b/tools/perf/util/hist.h @@ -77,7 +77,7 @@ size_t hists__fprintf_nr_events(struct hists *self, FILE *fp); size_t hists__fprintf(struct hists *self, struct hists *pair, bool show_displacement, FILE *fp); -int hist_entry__inc_addr_samples(struct hist_entry *self, u64 ip); +int hist_entry__inc_addr_samples(struct hist_entry *self, int evidx, u64 addr); int hist_entry__annotate(struct hist_entry *self, struct list_head *head, size_t privsize); @@ -91,18 +91,20 @@ bool hists__new_col_len(struct hists *self, enum hist_column col, u16 len); #ifdef NO_NEWT_SUPPORT static inline int hists__browse(struct hists *self __used, const char *helpline __used, - const char *ev_name __used) + const char *ev_name __used, int evidx __used) { return 0; } static inline int hists__tui_browse_tree(struct rb_root *self __used, - const char *help __used) + const char *help __used, + int evidx __used) { return 0; } -static inline int hist_entry__tui_annotate(struct hist_entry *self __used) +static inline int hist_entry__tui_annotate(struct hist_entry *self __used, + int evidx __used) { return 0; } @@ -111,13 +113,13 @@ static inline int hist_entry__tui_annotate(struct hist_entry *self __used) #else #include int hists__browse(struct hists *self, const char *helpline, - const char *ev_name); -int hist_entry__tui_annotate(struct hist_entry *self); + const char *ev_name, int evidx); +int 
hist_entry__tui_annotate(struct hist_entry *self, int evidx); #define KEY_LEFT NEWT_KEY_LEFT #define KEY_RIGHT NEWT_KEY_RIGHT -int hists__tui_browse_tree(struct rb_root *self, const char *help); +int hists__tui_browse_tree(struct rb_root *self, const char *help, int evidx); #endif unsigned int hists__sort_list_width(struct hists *self); diff --git a/tools/perf/util/ui/browsers/annotate.c b/tools/perf/util/ui/browsers/annotate.c index daa7138d8015..8d8a16895af7 100644 --- a/tools/perf/util/ui/browsers/annotate.c +++ b/tools/perf/util/ui/browsers/annotate.c @@ -61,7 +61,7 @@ static void annotate_browser__write(struct ui_browser *self, void *entry, int ro static double objdump_line__calc_percent(struct objdump_line *self, struct list_head *head, - struct symbol *sym) + struct symbol *sym, int evidx) { double percent = 0.0; @@ -70,7 +70,7 @@ static double objdump_line__calc_percent(struct objdump_line *self, unsigned int hits = 0; struct annotation *notes = symbol__annotation(sym); struct source_line *src_line = notes->src_line; - struct sym_hist *h = notes->histogram; + struct sym_hist *h = annotation__histogram(notes, evidx); s64 offset = self->offset; struct objdump_line *next = objdump__get_next_ip_line(head, self); @@ -183,12 +183,12 @@ out: return key; } -int hist_entry__tui_annotate(struct hist_entry *he) +int hist_entry__tui_annotate(struct hist_entry *he, int evidx) { - return symbol__tui_annotate(he->ms.sym, he->ms.map); + return symbol__tui_annotate(he->ms.sym, he->ms.map, evidx); } -int symbol__tui_annotate(struct symbol *sym, struct map *map) +int symbol__tui_annotate(struct symbol *sym, struct map *map, int evidx) { struct objdump_line *pos, *n; struct objdump_line_rb_node *rbpos; @@ -223,7 +223,7 @@ int symbol__tui_annotate(struct symbol *sym, struct map *map) browser.b.width = line_len; rbpos = objdump_line__rb(pos); rbpos->idx = browser.b.nr_entries++; - rbpos->percent = objdump_line__calc_percent(pos, &head, sym); + rbpos->percent = objdump_line__calc_percent(pos, &head, sym, evidx); if (rbpos->percent < 0.01) continue; objdump__insert_line(&browser.entries, rbpos); diff --git a/tools/perf/util/ui/browsers/hists.c b/tools/perf/util/ui/browsers/hists.c index 86428239fa65..294b49538522 100644 --- a/tools/perf/util/ui/browsers/hists.c +++ b/tools/perf/util/ui/browsers/hists.c @@ -797,7 +797,8 @@ static int hists__browser_title(struct hists *self, char *bf, size_t size, return printed; } -int hists__browse(struct hists *self, const char *helpline, const char *ev_name) +int hists__browse(struct hists *self, const char *helpline, + const char *ev_name, int evidx) { struct hist_browser *browser = hist_browser__new(self); struct pstack *fstack; @@ -935,7 +936,7 @@ do_annotate: if (he == NULL) continue; - hist_entry__tui_annotate(he); + hist_entry__tui_annotate(he, evidx); } else if (choice == browse_map) map__browse(browser->selection->map); else if (choice == zoom_dso) { @@ -984,7 +985,7 @@ out: return key; } -int hists__tui_browse_tree(struct rb_root *self, const char *help) +int hists__tui_browse_tree(struct rb_root *self, const char *help, int evidx) { struct rb_node *first = rb_first(self), *nd = first, *next; int key = 0; @@ -993,7 +994,7 @@ int hists__tui_browse_tree(struct rb_root *self, const char *help) struct hists *hists = rb_entry(nd, struct hists, rb_node); const char *ev_name = __event_name(hists->type, hists->config); - key = hists__browse(hists, help, ev_name); + key = hists__browse(hists, help, ev_name, evidx); switch (key) { case NEWT_KEY_TAB: next = rb_next(nd); 
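A detail of the patch above that is easy to miss: because struct sym_hist ends in a flexible array member, its real size depends on the symbol's length, so the nevents histograms cannot be indexed as a plain C array. The sketch below restates the byte arithmetic behind annotation__histogram(); the helper name histogram_at and the two-event layout in the comment are illustrative only:

/*
 * Each sym_hist entry is sizeof_sym_hist bytes: a u64 sum followed by
 * one u64 counter per byte of the symbol. For a symbol of N bytes
 * monitored with two events, the buffer holds two variable-sized
 * entries back to back:
 *
 *   [ sum | addr[0..N-1] ][ sum | addr[0..N-1] ]
 *         evidx == 0            evidx == 1
 *
 * Plain notes->histograms[evidx] would stride by sizeof(struct
 * sym_hist) only, landing inside the first entry's addr[] array.
 */
static struct sym_hist *histogram_at(struct annotation *notes, int evidx)
{
	return (struct sym_hist *)((char *)notes->histograms +
				   (size_t)evidx * notes->sizeof_sym_hist);
}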
From d040bd363824f9f0ad6610b91ee6c65f292c066c Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Sat, 5 Feb 2011 15:37:31 -0200 Subject: [PATCH 173/706] perf annotate: Config options for symbol__tty_annotate Max line# that should be printed, minimum percentage filter, just like 'perf top', alas, due to it :-) Cc: Frederic Weisbecker Cc: Ingo Molnar Cc: Mike Galbraith Cc: Paul Mackerras Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Tom Zanussi LKML-Reference: Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-annotate.c | 2 +- tools/perf/util/annotate.c | 14 ++++++++++---- tools/perf/util/annotate.h | 3 ++- 3 files changed, 13 insertions(+), 6 deletions(-) diff --git a/tools/perf/builtin-annotate.c b/tools/perf/builtin-annotate.c index f3e44231b10d..ea6a1165956f 100644 --- a/tools/perf/builtin-annotate.c +++ b/tools/perf/builtin-annotate.c @@ -95,7 +95,7 @@ static int process_sample_event(union perf_event *event, static int hist_entry__tty_annotate(struct hist_entry *he, int evidx) { return symbol__tty_annotate(he->ms.sym, he->ms.map, evidx, - print_line, full_paths); + print_line, full_paths, 0, 0); } static void hists__find_annotations(struct hists *self) diff --git a/tools/perf/util/annotate.c b/tools/perf/util/annotate.c index 7488fe99502c..072bc8d91aa1 100644 --- a/tools/perf/util/annotate.c +++ b/tools/perf/util/annotate.c @@ -87,7 +87,7 @@ struct objdump_line *objdump__get_next_ip_line(struct list_head *head, static void objdump_line__print(struct objdump_line *oline, struct list_head *head, struct symbol *sym, - int evidx, u64 len) + int evidx, u64 len, int min_pcnt) { static const char *prev_line; static const char *prev_color; @@ -118,6 +118,9 @@ static void objdump_line__print(struct objdump_line *oline, if (src_line == NULL && h->sum) percent = 100.0 * hits / h->sum; + if (percent < min_pcnt) + return; + color = get_percent_color(percent); /* @@ -419,13 +422,15 @@ static void symbol__annotate_hits(struct symbol *sym, int evidx) } int symbol__tty_annotate(struct symbol *sym, struct map *map, int evidx, - bool print_lines, bool full_paths) + bool print_lines, bool full_paths, int min_pcnt, + int max_lines) { struct dso *dso = map->dso; const char *filename = dso->long_name, *d_filename; struct rb_root source_line = RB_ROOT; struct objdump_line *pos, *n; LIST_HEAD(head); + int printed = 2; u64 len; if (symbol__annotate(sym, map, &head, 0) < 0) @@ -444,7 +449,6 @@ int symbol__tty_annotate(struct symbol *sym, struct map *map, int evidx, print_summary(&source_line, filename); } - printf("\n\n------------------------------------------------\n"); printf(" Percent | Source code & Disassembly of %s\n", d_filename); printf("------------------------------------------------\n"); @@ -452,9 +456,11 @@ int symbol__tty_annotate(struct symbol *sym, struct map *map, int evidx, symbol__annotate_hits(sym, evidx); list_for_each_entry_safe(pos, n, &head, node) { - objdump_line__print(pos, &head, sym, evidx, len); + objdump_line__print(pos, &head, sym, evidx, len, min_pcnt); list_del(&pos->node); objdump_line__free(pos); + if (max_lines && ++printed >= max_lines) + break; } if (print_lines) diff --git a/tools/perf/util/annotate.h b/tools/perf/util/annotate.h index 0a5069ca6dd7..6b707324e66f 100644 --- a/tools/perf/util/annotate.h +++ b/tools/perf/util/annotate.h @@ -69,7 +69,8 @@ int symbol__annotate(struct symbol *sym, struct map *map, struct list_head *head, size_t privsize); int symbol__tty_annotate(struct symbol *sym, struct map *map, int evidx, - bool print_lines, bool 
full_paths); + bool print_lines, bool full_paths, int min_pcnt, + int max_lines); #ifdef NO_NEWT_SUPPORT static inline int symbol__tui_annotate(symbol *sym __used, From f1e2701de02cff6d988b1dd49960620d5720cb89 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Sat, 5 Feb 2011 18:51:38 -0200 Subject: [PATCH 174/706] perf annotate: Separate objdump parsing from actual screen rendering Because in 'perf top' we'll need to parse just once and then, as samples come, render multiple times with evolving counter values. Cc: Frederic Weisbecker Cc: Ingo Molnar Cc: Mike Galbraith Cc: Paul Mackerras Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Tom Zanussi LKML-Reference: Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/annotate.c | 70 +++++++++++++++++++++++++------------- tools/perf/util/annotate.h | 4 +++ 2 files changed, 51 insertions(+), 23 deletions(-) diff --git a/tools/perf/util/annotate.c b/tools/perf/util/annotate.c index 072bc8d91aa1..10cdbad76058 100644 --- a/tools/perf/util/annotate.c +++ b/tools/perf/util/annotate.c @@ -421,21 +421,16 @@ static void symbol__annotate_hits(struct symbol *sym, int evidx) printf("%*s: %" PRIu64 "\n", BITS_PER_LONG / 2, "h->sum", h->sum); } -int symbol__tty_annotate(struct symbol *sym, struct map *map, int evidx, - bool print_lines, bool full_paths, int min_pcnt, - int max_lines) +void symbol__annotate_printf(struct symbol *sym, struct map *map, + struct list_head *head, int evidx, bool full_paths, + int min_pcnt, int max_lines) { struct dso *dso = map->dso; const char *filename = dso->long_name, *d_filename; - struct rb_root source_line = RB_ROOT; - struct objdump_line *pos, *n; - LIST_HEAD(head); + struct objdump_line *pos; int printed = 2; u64 len; - if (symbol__annotate(sym, map, &head, 0) < 0) - return -1; - if (full_paths) d_filename = filename; else @@ -443,28 +438,57 @@ int symbol__tty_annotate(struct symbol *sym, struct map *map, int evidx, len = sym->end - sym->start; + printf(" Percent | Source code & Disassembly of %s\n", d_filename); + printf("------------------------------------------------\n"); + + if (verbose) + symbol__annotate_hits(sym, evidx); + + list_for_each_entry(pos, head, node) { + objdump_line__print(pos, head, sym, evidx, len, min_pcnt); + if (max_lines && ++printed >= max_lines) + break; + + } +} + +void objdump_line_list__purge(struct list_head *head) +{ + struct objdump_line *pos, *n; + + list_for_each_entry_safe(pos, n, head, node) { + list_del(&pos->node); + objdump_line__free(pos); + } +} + +int symbol__tty_annotate(struct symbol *sym, struct map *map, int evidx, + bool print_lines, bool full_paths, int min_pcnt, + int max_lines) +{ + struct dso *dso = map->dso; + const char *filename = dso->long_name; + struct rb_root source_line = RB_ROOT; + LIST_HEAD(head); + u64 len; + + if (symbol__annotate(sym, map, &head, 0) < 0) + return -1; + + len = sym->end - sym->start; + if (print_lines) { symbol__get_source_line(sym, map, evidx, &source_line, len, filename); print_summary(&source_line, filename); } - printf(" Percent | Source code & Disassembly of %s\n", d_filename); - printf("------------------------------------------------\n"); - - if (verbose) - symbol__annotate_hits(sym, evidx); - - list_for_each_entry_safe(pos, n, &head, node) { - objdump_line__print(pos, &head, sym, evidx, len, min_pcnt); - list_del(&pos->node); - objdump_line__free(pos); - if (max_lines && ++printed >= max_lines) - break; - } - + symbol__annotate_printf(sym, map, &head, evidx, full_paths, + min_pcnt, max_lines); if (print_lines) 
symbol__free_source_line(sym, len); + objdump_line_list__purge(&head); + return 0; } diff --git a/tools/perf/util/annotate.h b/tools/perf/util/annotate.h index 6b707324e66f..53dd92de2615 100644 --- a/tools/perf/util/annotate.h +++ b/tools/perf/util/annotate.h @@ -67,6 +67,10 @@ int symbol__alloc_hist(struct symbol *sym, int nevents); int symbol__annotate(struct symbol *sym, struct map *map, struct list_head *head, size_t privsize); +void symbol__annotate_printf(struct symbol *sym, struct map *map, + struct list_head *head, int evidx, bool full_paths, + int min_pcnt, int max_lines); +void objdump_line_list__purge(struct list_head *head); int symbol__tty_annotate(struct symbol *sym, struct map *map, int evidx, bool print_lines, bool full_paths, int min_pcnt, From 36532461a0f60bb36c5470a0326f7394f19db23c Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Sun, 6 Feb 2011 14:54:44 -0200 Subject: [PATCH 175/706] perf top: Ditch private annotation code, share perf annotate's Next step: Live TUI annotation in perf top, just press enter on a symbol line. Cc: Frederic Weisbecker Cc: Ingo Molnar Cc: Mike Galbraith Cc: Paul Mackerras Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Tom Zanussi LKML-Reference: Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-top.c | 181 +++++++------------------------------ tools/perf/util/annotate.c | 76 ++++++++++++++-- tools/perf/util/annotate.h | 11 ++- tools/perf/util/top.h | 11 +-- 4 files changed, 106 insertions(+), 173 deletions(-) diff --git a/tools/perf/builtin-top.c b/tools/perf/builtin-top.c index 154e088588bc..716118a3b3e4 100644 --- a/tools/perf/builtin-top.c +++ b/tools/perf/builtin-top.c @@ -20,6 +20,7 @@ #include "perf.h" +#include "util/annotate.h" #include "util/cache.h" #include "util/color.h" #include "util/evlist.h" @@ -140,10 +141,7 @@ static int parse_source(struct sym_entry *syme) struct symbol *sym; struct sym_entry_source *source; struct map *map; - FILE *file; - char command[PATH_MAX*2]; - const char *path; - u64 len; + int err = -1; if (!syme) return -1; @@ -162,197 +160,80 @@ static int parse_source(struct sym_entry *syme) if (syme->src == NULL) return -1; pthread_mutex_init(&syme->src->lock, NULL); + INIT_LIST_HEAD(&syme->src->head); } source = syme->src; - if (source->lines) { + if (symbol__annotation(sym)->histograms != NULL) { pthread_mutex_lock(&source->lock); goto out_assign; } - path = map->dso->long_name; - - len = sym->end - sym->start; - - sprintf(command, - "objdump --start-address=%#0*" PRIx64 " --stop-address=%#0*" PRIx64 " -dS %s", - BITS_PER_LONG / 4, map__rip_2objdump(map, sym->start), - BITS_PER_LONG / 4, map__rip_2objdump(map, sym->end), path); - - file = popen(command, "r"); - if (!file) - return -1; pthread_mutex_lock(&source->lock); - source->lines_tail = &source->lines; - while (!feof(file)) { - struct source_line *src; - size_t dummy = 0; - char *c, *sep; - src = malloc(sizeof(struct source_line)); - assert(src != NULL); - memset(src, 0, sizeof(struct source_line)); - - if (getline(&src->line, &dummy, file) < 0) - break; - if (!src->line) - break; - - c = strchr(src->line, '\n'); - if (c) - *c = 0; - - src->next = NULL; - *source->lines_tail = src; - source->lines_tail = &src->next; - - src->eip = strtoull(src->line, &sep, 16); - if (*sep == ':') - src->eip = map__objdump_2ip(map, src->eip); - else /* this line has no ip info (e.g. 
source line) */ - src->eip = 0; + if (symbol__alloc_hist(sym, top.evlist->nr_entries) < 0) { + pr_err("Not enough memory for annotating '%s' symbol!\n", + sym->name); + goto out_unlock; } - pclose(file); + + err = symbol__annotate(sym, syme->map, &source->head, 0); + if (err == 0) { out_assign: sym_filter_entry = syme; + } +out_unlock: pthread_mutex_unlock(&source->lock); - return 0; + return err; } static void __zero_source_counters(struct sym_entry *syme) { - int i; - struct source_line *line; - - line = syme->src->lines; - while (line) { - for (i = 0; i < top.evlist->nr_entries; i++) - line->count[i] = 0; - line = line->next; - } + struct symbol *sym = sym_entry__symbol(syme); + symbol__annotate_zero_histograms(sym); } static void record_precise_ip(struct sym_entry *syme, int counter, u64 ip) { - struct source_line *line; - if (syme != sym_filter_entry) return; if (pthread_mutex_trylock(&syme->src->lock)) return; - if (syme->src == NULL || syme->src->source == NULL) - goto out_unlock; + ip = syme->map->map_ip(syme->map, ip); + symbol__inc_addr_samples(sym_entry__symbol(syme), syme->map, counter, ip); - for (line = syme->src->lines; line; line = line->next) { - /* skip lines without IP info */ - if (line->eip == 0) - continue; - if (line->eip == ip) { - line->count[counter]++; - break; - } - if (line->eip > ip) - break; - } -out_unlock: pthread_mutex_unlock(&syme->src->lock); } -#define PATTERN_LEN (BITS_PER_LONG / 4 + 2) - -static void lookup_sym_source(struct sym_entry *syme) -{ - struct symbol *symbol = sym_entry__symbol(syme); - struct source_line *line; - char pattern[PATTERN_LEN + 1]; - - sprintf(pattern, "%0*" PRIx64 " <", BITS_PER_LONG / 4, - map__rip_2objdump(syme->map, symbol->start)); - - pthread_mutex_lock(&syme->src->lock); - for (line = syme->src->lines; line; line = line->next) { - if (memcmp(line->line, pattern, PATTERN_LEN) == 0) { - syme->src->source = line; - break; - } - } - pthread_mutex_unlock(&syme->src->lock); -} - -static void show_lines(struct source_line *queue, int count, int total) -{ - int i; - struct source_line *line; - - line = queue; - for (i = 0; i < count; i++) { - float pcnt = 100.0*(float)line->count[top.sym_counter]/(float)total; - - printf("%8li %4.1f%%\t%s\n", line->count[top.sym_counter], pcnt, line->line); - line = line->next; - } -} - -#define TRACE_COUNT 3 - static void show_details(struct sym_entry *syme) { struct symbol *symbol; - struct source_line *line; - struct source_line *line_queue = NULL; - int displayed = 0; - int line_queue_count = 0, total = 0, more = 0; + int more; if (!syme) return; - if (!syme->src->source) - lookup_sym_source(syme); - - if (!syme->src->source) + symbol = sym_entry__symbol(syme); + if (!syme->src || symbol__annotation(symbol)->histograms == NULL) return; - symbol = sym_entry__symbol(syme); printf("Showing %s for %s\n", event_name(top.sym_evsel), symbol->name); printf(" Events Pcnt (>=%d%%)\n", sym_pcnt_filter); pthread_mutex_lock(&syme->src->lock); - line = syme->src->source; - while (line) { - total += line->count[top.sym_counter]; - line = line->next; - } - - line = syme->src->source; - while (line) { - float pcnt = 0.0; - - if (!line_queue_count) - line_queue = line; - line_queue_count++; - - if (line->count[top.sym_counter]) - pcnt = 100.0 * line->count[top.sym_counter] / (float)total; - if (pcnt >= (float)sym_pcnt_filter) { - if (displayed <= top.print_entries) - show_lines(line_queue, line_queue_count, total); - else more++; - displayed += line_queue_count; - line_queue_count = 0; - line_queue = NULL; - } 
else if (line_queue_count > TRACE_COUNT) { - line_queue = line_queue->next; - line_queue_count--; - } - - line->count[top.sym_counter] = top.zero ? 0 : line->count[top.sym_counter] * 7 / 8; - line = line->next; - } + more = symbol__annotate_printf(symbol, syme->map, &syme->src->head, + top.sym_evsel->idx, 0, sym_pcnt_filter, + top.print_entries); + if (top.zero) + symbol__annotate_zero_histogram(symbol, top.sym_evsel->idx); + else + symbol__annotate_decay_histogram(symbol, &syme->src->head, + top.sym_evsel->idx); pthread_mutex_unlock(&syme->src->lock); - if (more) + if (more != 0) printf("%d lines not displayed, maybe increase display entries [e]\n", more); } @@ -1172,7 +1053,7 @@ int cmd_top(int argc, const char **argv, const char *prefix __used) top.sym_evsel = list_entry(top.evlist->entries.next, struct perf_evsel, node); - symbol_conf.priv_size = (sizeof(struct sym_entry) + + symbol_conf.priv_size = (sizeof(struct sym_entry) + sizeof(struct annotation) + (top.evlist->nr_entries + 1) * sizeof(unsigned long)); symbol_conf.try_vmlinux_path = (symbol_conf.vmlinux_name == NULL); diff --git a/tools/perf/util/annotate.c b/tools/perf/util/annotate.c index 10cdbad76058..297337649c21 100644 --- a/tools/perf/util/annotate.c +++ b/tools/perf/util/annotate.c @@ -22,9 +22,19 @@ int symbol__alloc_hist(struct symbol *sym, int nevents) notes->sizeof_sym_hist = (sizeof(*notes->histograms) + (sym->end - sym->start) * sizeof(u64)); notes->histograms = calloc(nevents, notes->sizeof_sym_hist); + notes->nr_histograms = nevents; return notes->histograms == NULL ? -1 : 0; } +void symbol__annotate_zero_histograms(struct symbol *sym) +{ + struct annotation *notes = symbol__annotation(sym); + + if (notes->histograms != NULL) + memset(notes->histograms, 0, + notes->nr_histograms * notes->sizeof_sym_hist); +} + int symbol__inc_addr_samples(struct symbol *sym, struct map *map, int evidx, u64 addr) { @@ -85,9 +95,10 @@ struct objdump_line *objdump__get_next_ip_line(struct list_head *head, return NULL; } -static void objdump_line__print(struct objdump_line *oline, - struct list_head *head, struct symbol *sym, - int evidx, u64 len, int min_pcnt) +static int objdump_line__print(struct objdump_line *oline, + struct list_head *head, struct symbol *sym, + int evidx, u64 len, int min_pcnt, + int printed, int max_lines) { static const char *prev_line; static const char *prev_color; @@ -119,7 +130,10 @@ static void objdump_line__print(struct objdump_line *oline, percent = 100.0 * hits / h->sum; if (percent < min_pcnt) - return; + return -1; + + if (printed >= max_lines) + return 1; color = get_percent_color(percent); @@ -140,12 +154,16 @@ static void objdump_line__print(struct objdump_line *oline, color_fprintf(stdout, color, " %7.2f", percent); printf(" : "); color_fprintf(stdout, PERF_COLOR_BLUE, "%s\n", oline->line); - } else { + } else if (printed >= max_lines) + return 1; + else { if (!*oline->line) printf(" :\n"); else printf(" : %s\n", oline->line); } + + return 0; } static int symbol__parse_objdump_line(struct symbol *sym, struct map *map, FILE *file, @@ -421,14 +439,15 @@ static void symbol__annotate_hits(struct symbol *sym, int evidx) printf("%*s: %" PRIu64 "\n", BITS_PER_LONG / 2, "h->sum", h->sum); } -void symbol__annotate_printf(struct symbol *sym, struct map *map, - struct list_head *head, int evidx, bool full_paths, - int min_pcnt, int max_lines) +int symbol__annotate_printf(struct symbol *sym, struct map *map, + struct list_head *head, int evidx, bool full_paths, + int min_pcnt, int max_lines) { struct dso 
*dso = map->dso; const char *filename = dso->long_name, *d_filename; struct objdump_line *pos; int printed = 2; + int more = 0; u64 len; if (full_paths) @@ -445,10 +464,47 @@ void symbol__annotate_printf(struct symbol *sym, struct map *map, symbol__annotate_hits(sym, evidx); list_for_each_entry(pos, head, node) { - objdump_line__print(pos, head, sym, evidx, len, min_pcnt); - if (max_lines && ++printed >= max_lines) + switch (objdump_line__print(pos, head, sym, evidx, len, min_pcnt, + printed, max_lines)) { + case 0: + ++printed; break; + case 1: + /* filtered by max_lines */ + ++more; + break; + case -1: + default: + /* filtered by min_pcnt */ + break; + } + } + return more; +} + +void symbol__annotate_zero_histogram(struct symbol *sym, int evidx) +{ + struct annotation *notes = symbol__annotation(sym); + struct sym_hist *h = annotation__histogram(notes, evidx); + + memset(h, 0, notes->sizeof_sym_hist); +} + +void symbol__annotate_decay_histogram(struct symbol *sym, + struct list_head *head, int evidx) +{ + struct annotation *notes = symbol__annotation(sym); + struct sym_hist *h = annotation__histogram(notes, evidx); + struct objdump_line *pos; + + h->sum = 0; + + list_for_each_entry(pos, head, node) { + if (pos->offset != -1) { + h->addr[pos->offset] = h->addr[pos->offset] * 7 / 8; + h->sum += h->addr[pos->offset]; + } } } diff --git a/tools/perf/util/annotate.h b/tools/perf/util/annotate.h index 53dd92de2615..b1253aadf340 100644 --- a/tools/perf/util/annotate.h +++ b/tools/perf/util/annotate.h @@ -42,6 +42,7 @@ struct source_line { struct annotation { struct source_line *src_line; struct sym_hist *histograms; + int nr_histograms; int sizeof_sym_hist; }; @@ -64,12 +65,16 @@ static inline struct annotation *symbol__annotation(struct symbol *sym) int symbol__inc_addr_samples(struct symbol *sym, struct map *map, int evidx, u64 addr); int symbol__alloc_hist(struct symbol *sym, int nevents); +void symbol__annotate_zero_histograms(struct symbol *sym); int symbol__annotate(struct symbol *sym, struct map *map, struct list_head *head, size_t privsize); -void symbol__annotate_printf(struct symbol *sym, struct map *map, - struct list_head *head, int evidx, bool full_paths, - int min_pcnt, int max_lines); +int symbol__annotate_printf(struct symbol *sym, struct map *map, + struct list_head *head, int evidx, bool full_paths, + int min_pcnt, int max_lines); +void symbol__annotate_zero_histogram(struct symbol *sym, int evidx); +void symbol__annotate_decay_histogram(struct symbol *sym, + struct list_head *head, int evidx); void objdump_line_list__purge(struct list_head *head); int symbol__tty_annotate(struct symbol *sym, struct map *map, int evidx, diff --git a/tools/perf/util/top.h b/tools/perf/util/top.h index 500950818d2f..fe44afb69985 100644 --- a/tools/perf/util/top.h +++ b/tools/perf/util/top.h @@ -11,17 +11,8 @@ struct perf_evlist; struct perf_evsel; -struct source_line { - u64 eip; - unsigned long count[MAX_COUNTERS]; /* FIXME */ - char *line; - struct source_line *next; -}; - struct sym_entry_source { - struct source_line *source; - struct source_line *lines; - struct source_line **lines_tail; + struct list_head head; pthread_mutex_t lock; }; From 9c56dfeb784a586713f467e2028a127a2a58a238 Mon Sep 17 00:00:00 2001 From: Michael Witten Date: Thu, 3 Feb 2011 22:10:55 -0600 Subject: [PATCH 176/706] perf tools: Makefile: Use $(QUIET_GEN) for perf.so So that we get this: CC /home/acme/git/build/perf/bench/mem-memcpy-x86-64-asm.o GEN perf-archive * GEN /home/acme/git/build/perf/python/perf.so CC 
/home/acme/git/build/perf/builtin-annotate.o Instead of silently building the python binding. LKML-Reference: <1296890359-22659-1-git-send-email-mfwitten@gmail.com> Signed-off-by: Michael Witten Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/perf/Makefile b/tools/perf/Makefile index be3eb1dc9a5a..94f73abdc56a 100644 --- a/tools/perf/Makefile +++ b/tools/perf/Makefile @@ -326,7 +326,7 @@ grep-libs = $(filter -l%,$(1)) strip-libs = $(filter-out -l%,$(1)) $(OUTPUT)python/perf.so: $(PYRF_OBJS) - @python util/setup.py --quiet build_ext --build-lib='$(OUTPUT)python' \ + $(QUIET_GEN)python util/setup.py --quiet build_ext --build-lib='$(OUTPUT)python' \ --build-temp='$(OUTPUT)python/temp' # # No Perl scripts right now: From ef4d001d79ac4bab6c2d81e9986a42059f877ec3 Mon Sep 17 00:00:00 2001 From: Denis Kirjanov Date: Sat, 5 Feb 2011 20:39:38 +0000 Subject: [PATCH 177/706] perf top: Use pid_t for target_{pid|tid} Use pid_t data type for target_{pid|tid} vars. Cc: Ingo Molnar LKML-Reference: <20110205203938.GA15328@hera.kernel.org> Signed-off-by: Denis Kirjanov [ committer note: those variables are now in struct perf_top, fixed ] Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/top.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/perf/util/top.h b/tools/perf/util/top.h index fe44afb69985..62e32939f3d1 100644 --- a/tools/perf/util/top.h +++ b/tools/perf/util/top.h @@ -46,8 +46,8 @@ struct perf_top { u64 exact_samples; u64 guest_us_samples, guest_kernel_samples; int print_entries, count_filter, delay_secs; - int display_weighted, freq, rb_entries; - int sym_counter, target_pid, target_tid; + int display_weighted, freq, rb_entries, sym_counter; + pid_t target_pid, target_tid; bool hide_kernel_symbols, hide_user_symbols, zero; const char *cpu_list; struct perf_evsel *sym_evsel; From f50c2169bd054984e976e67e8651d28f3caf6ba3 Mon Sep 17 00:00:00 2001 From: Franck Bui-Huu Date: Thu, 13 Jan 2011 11:18:30 +0100 Subject: [PATCH 178/706] perf probe: Rewrite find_lazy_match_lines() by using getline(3) Acked-by: Masami Hiramatsu Cc: Masami Hiramatsu Cc: lkml LKML-Reference: Signed-off-by: Franck Bui-Huu Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/probe-finder.c | 70 +++++++++++++--------------------- 1 file changed, 27 insertions(+), 43 deletions(-) diff --git a/tools/perf/util/probe-finder.c b/tools/perf/util/probe-finder.c index 69215bff17e9..46addfb3183e 100644 --- a/tools/perf/util/probe-finder.c +++ b/tools/perf/util/probe-finder.c @@ -1234,51 +1234,38 @@ static int find_probe_point_by_line(struct probe_finder *pf) static int find_lazy_match_lines(struct list_head *head, const char *fname, const char *pat) { - char *fbuf, *p1, *p2; - int fd, line, nlines = -1; - struct stat st; + FILE *fp; + char *line = NULL; + size_t line_len; + ssize_t len; + int count = 0, linenum = 1; - fd = open(fname, O_RDONLY); - if (fd < 0) { - pr_warning("Failed to open %s: %s\n", fname, strerror(-fd)); + fp = fopen(fname, "r"); + if (!fp) { + pr_warning("Failed to open %s: %s\n", fname, strerror(errno)); return -errno; } - if (fstat(fd, &st) < 0) { - pr_warning("Failed to get the size of %s: %s\n", - fname, strerror(errno)); - nlines = -errno; - goto out_close; + while ((len = getline(&line, &line_len, fp)) > 0) { + + if (line[len - 1] == '\n') + line[len - 1] = '\0'; + + if (strlazymatch(line, pat)) { + line_list__add_line(head, linenum); + count++; + } + linenum++; } - nlines = -ENOMEM; - 
fbuf = malloc(st.st_size + 2); - if (fbuf == NULL) - goto out_close; - if (read(fd, fbuf, st.st_size) < 0) { - pr_warning("Failed to read %s: %s\n", fname, strerror(errno)); - nlines = -errno; - goto out_free_fbuf; - } - fbuf[st.st_size] = '\n'; /* Dummy line */ - fbuf[st.st_size + 1] = '\0'; - p1 = fbuf; - line = 1; - nlines = 0; - while ((p2 = strchr(p1, '\n')) != NULL) { - *p2 = '\0'; - if (strlazymatch(p1, pat)) { - line_list__add_line(head, line); - nlines++; - } - line++; - p1 = p2 + 1; - } -out_free_fbuf: - free(fbuf); -out_close: - close(fd); - return nlines; + if (ferror(fp)) + count = -errno; + free(line); + fclose(fp); + + if (count == 0) + pr_debug("No matched lines found in %s.\n", fname); + return count; } static int probe_point_lazy_walker(const char *fname, int lineno, @@ -1312,10 +1299,7 @@ static int find_probe_point_lazy(Dwarf_Die *sp_die, struct probe_finder *pf) /* Matching lazy line pattern */ ret = find_lazy_match_lines(&pf->lcache, pf->fname, pf->pev->point.lazy_line); - if (ret == 0) { - pr_debug("No matched lines found in %s.\n", pf->fname); - return 0; - } else if (ret < 0) + if (ret <= 0) return ret; } From 76022db323dd6d7c6958df3d595f7dedf7a14778 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Fri, 4 Feb 2011 21:51:53 +0900 Subject: [PATCH 179/706] tracing/kprobes: Cleanup strict_strtol() using code Since strict_strtol() accepts numbers prefixed with '-', there is no need to negate the value after conversion. Cc: 2nddept-manager@sdl.hitachi.co.jp Cc: Frederic Weisbecker Cc: Ingo Molnar Cc: Srikar Dronamraju Cc: Steven Rostedt LKML-Reference: <20110204125153.9507.49335.stgit@ltc236.sdl.hitachi.co.jp> Signed-off-by: Masami Hiramatsu Signed-off-by: Arnaldo Carvalho de Melo --- kernel/trace/trace_kprobe.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index 2dec9bcde8b4..2088893c049e 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -767,16 +767,15 @@ static int __parse_probe_arg(char *arg, const struct fetch_type *t, } break; case '+': /* deref memory */ + arg++; /* Skip '+', because strict_strtol() rejects it. */ case '-': tmp = strchr(arg, '('); if (!tmp) break; *tmp = '\0'; - ret = strict_strtol(arg + 1, 0, &offset); + ret = strict_strtol(arg, 0, &offset); if (ret) break; - if (arg[0] == '-') - offset = -offset; arg = tmp + 1; tmp = strrchr(arg, ')'); if (tmp) { From e3745369986ddcdaa19f70e2d24e658876b97e84 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Fri, 4 Feb 2011 21:51:59 +0900 Subject: [PATCH 180/706] tracing/kprobes: Support longer (>128 bytes) command Expand the command line buffer of kprobe-tracer to 4096 bytes.
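For scale: a probe definition with several named fetch-args easily exceeds the old 128-byte limit. Below is a minimal user-space sketch of feeding such a command to the kprobe_events file; the probe name, symbol and argument list are made up for the example, and on a kernel with the old limit a single line this long was rejected with -EINVAL:

	#include <fcntl.h>
	#include <stdio.h>
	#include <string.h>
	#include <unistd.h>

	int main(void)
	{
		/* A multi-argument probe definition, well over 128 bytes. */
		const char *cmd =
			"p:myprobe do_sys_open "
			"dfd=%ax filename=+0(%si):string flags=%dx mode=%cx "
			"f0=+0(+0(%di)) f1=+8(+0(%di)) f2=+16(+0(%di)) "
			"f3=+24(+0(%di)) f4=+32(+0(%di)) f5=+40(+0(%di))\n";
		int fd = open("/sys/kernel/debug/tracing/kprobe_events",
			      O_WRONLY | O_APPEND);

		if (fd < 0)
			return 1;
		if (write(fd, cmd, strlen(cmd)) < 0)
			perror("write"); /* fails with the old 128-byte buffer */
		close(fd);
		return 0;
	}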
Reported-by: Arnaldo Carvalho de Melo Cc: 2nddept-manager@sdl.hitachi.co.jp Cc: Frederic Weisbecker Cc: Ingo Molnar Cc: Srikar Dronamraju Cc: Steven Rostedt LKML-Reference: <20110204125159.9507.20895.stgit@ltc236.sdl.hitachi.co.jp> Signed-off-by: Masami Hiramatsu Signed-off-by: Arnaldo Carvalho de Melo --- kernel/trace/trace_kprobe.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index 2088893c049e..c6ed88660856 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -1129,7 +1129,7 @@ static int command_trace_probe(const char *buf) return ret; } -#define WRITE_BUFSIZE 128 +#define WRITE_BUFSIZE 4096 static ssize_t probes_write(struct file *file, const char __user *buffer, size_t count, loff_t *ppos) From 1ff511e35ed87cc2ebade9e678e4a2fe39b6f9c5 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Fri, 4 Feb 2011 21:52:05 +0900 Subject: [PATCH 181/706] tracing/kprobes: Add bitfield type Add a bitfield type for tracing arguments on kprobe-tracer. The syntax of a bitfield type is: b<bit-width>@<bit-offset>/<container-size> e.g. accessing a 2-bit-wide field at a 4-bit offset within 32-bit-wide data, located 4 bytes from the address pointed to by the AX register: +4(%ax):b2@4/32 Since the width of the container data depends on the arch, the container-size is given explicitly at the end. Cc: 2nddept-manager@sdl.hitachi.co.jp Cc: Frederic Weisbecker Cc: Ingo Molnar Cc: Srikar Dronamraju Cc: Steven Rostedt LKML-Reference: <20110204125205.9507.11363.stgit@ltc236.sdl.hitachi.co.jp> Signed-off-by: Masami Hiramatsu Signed-off-by: Arnaldo Carvalho de Melo --- Documentation/trace/kprobetrace.txt | 16 ++++- kernel/trace/trace_kprobe.c | 104 +++++++++++++++++++++++++++- 2 files changed, 118 insertions(+), 2 deletions(-) diff --git a/Documentation/trace/kprobetrace.txt b/Documentation/trace/kprobetrace.txt index 5f77d94598dd..6d27ab8d6e9f 100644 --- a/Documentation/trace/kprobetrace.txt +++ b/Documentation/trace/kprobetrace.txt @@ -42,11 +42,25 @@ Synopsis of kprobe_events +|-offs(FETCHARG) : Fetch memory at FETCHARG +|- offs address.(**) NAME=FETCHARG : Set NAME as the argument name of FETCHARG. FETCHARG:TYPE : Set TYPE as the type of FETCHARG. Currently, basic types - (u8/u16/u32/u64/s8/s16/s32/s64) and string are supported. + (u8/u16/u32/u64/s8/s16/s32/s64), "string" and bitfield + are supported. (*) only for return probe. (**) this is useful for fetching a field of data structures. +Types +----- +Several types are supported for fetch-args. Kprobe tracer will access memory +by given type. Prefix 's' and 'u' means those types are signed and unsigned +respectively. Traced arguments are shown in decimal (signed) or hex (unsigned). +String type is a special type, which fetches a "null-terminated" string from
kernel space. This means it will fail and store NULL if the string container
has been paged out.
+Bitfield is another special type, which takes 3 parameters, bit-width, bit-offset, and container-size (usually 32). The syntax is:
+
+ b<bit-width>@<bit-offset>/<container-size>
+
Per-Probe Event Filtering ------------------------- diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index c6ed88660856..ccdc542022c3 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -353,6 +353,43 @@ static __kprobes void free_deref_fetch_param(struct deref_fetch_param *data) kfree(data); } +/* Bitfield fetch function */ +struct bitfield_fetch_param { + struct fetch_param orig; + unsigned char hi_shift; + unsigned char low_shift; +}; + +#define DEFINE_FETCH_bitfield(type) \ +static __kprobes void FETCH_FUNC_NAME(bitfield, type)(struct pt_regs *regs,\ + void *data, void *dest) \ +{ \ + struct bitfield_fetch_param *bprm = data; \ + type buf = 0; \ + call_fetch(&bprm->orig, regs, &buf); \ + if (buf) { \ + buf <<= bprm->hi_shift; \ + buf >>= bprm->low_shift; \ + } \ + *(type *)dest = buf; \ +} +DEFINE_BASIC_FETCH_FUNCS(bitfield) +#define fetch_bitfield_string NULL +#define fetch_bitfield_string_size NULL + +static __kprobes void +free_bitfield_fetch_param(struct bitfield_fetch_param *data) +{ + /* + * Don't check the bitfield itself, because this must be the + * last fetch function. + */ + if (CHECK_FETCH_FUNCS(deref, data->orig.fn)) + free_deref_fetch_param(data->orig.data); + else if (CHECK_FETCH_FUNCS(symbol, data->orig.fn)) + free_symbol_cache(data->orig.data); + kfree(data); +} /* Default (unsigned long) fetch type */ #define __DEFAULT_FETCH_TYPE(t) u##t #define _DEFAULT_FETCH_TYPE(t) __DEFAULT_FETCH_TYPE(t) @@ -367,6 +404,7 @@ enum { FETCH_MTD_memory, FETCH_MTD_symbol, FETCH_MTD_deref, + FETCH_MTD_bitfield, FETCH_MTD_END, }; @@ -387,6 +425,7 @@ ASSIGN_FETCH_FUNC(retval, ftype), \ ASSIGN_FETCH_FUNC(memory, ftype), \ ASSIGN_FETCH_FUNC(symbol, ftype), \ ASSIGN_FETCH_FUNC(deref, ftype), \ +ASSIGN_FETCH_FUNC(bitfield, ftype), \ } \ } @@ -430,9 +469,33 @@ static const struct fetch_type *find_fetch_type(const char *type) if (!type) type = DEFAULT_FETCH_TYPE_STR; + /* Special case: bitfield */ + if (*type == 'b') { + unsigned long bs; + type = strchr(type, '/'); + if (!type) + goto fail; + type++; + if (strict_strtoul(type, 0, &bs)) + goto fail; + switch (bs) { + case 8: + return find_fetch_type("u8"); + case 16: + return find_fetch_type("u16"); + case 32: + return find_fetch_type("u32"); + case 64: + return find_fetch_type("u64"); + default: + goto fail; + } + } + for (i = 0; i < ARRAY_SIZE(fetch_type_table); i++) if (strcmp(type, fetch_type_table[i].name) == 0) return &fetch_type_table[i]; +fail: return NULL; } @@ -586,7 +649,9 @@ error: static void free_probe_arg(struct probe_arg *arg) { - if (CHECK_FETCH_FUNCS(deref, arg->fetch.fn)) + if (CHECK_FETCH_FUNCS(bitfield, arg->fetch.fn)) + free_bitfield_fetch_param(arg->fetch.data); + else if (CHECK_FETCH_FUNCS(deref, arg->fetch.fn)) free_deref_fetch_param(arg->fetch.data); else if (CHECK_FETCH_FUNCS(symbol, arg->fetch.fn)) free_symbol_cache(arg->fetch.data); @@ -806,6 +871,41 @@ static int __parse_probe_arg(char *arg, const struct fetch_type *t, return ret; } +#define BYTES_TO_BITS(nb) ((BITS_PER_LONG * (nb)) / sizeof(long)) + +/* Bitfield type needs to be parsed into a fetch function */ +static int __parse_bitfield_probe_arg(const char *bf, + const struct fetch_type *t, + struct fetch_param *f) +{ + struct bitfield_fetch_param *bprm; + unsigned long bw, bo; + char *tail; + + if (*bf != 'b') + return 0; + + bprm = kzalloc(sizeof(*bprm), GFP_KERNEL); + if (!bprm) + return -ENOMEM; + bprm->orig = *f; + f->fn = t->fetch[FETCH_MTD_bitfield]; + f->data = (void *)bprm; + + bw
= simple_strtoul(bf + 1, &tail, 0); /* Use simple one */ + if (bw == 0 || *tail != '@') + return -EINVAL; + + bf = tail + 1; + bo = simple_strtoul(bf, &tail, 0); + if (tail == bf || *tail != '/') + return -EINVAL; + + bprm->hi_shift = BYTES_TO_BITS(t->size) - (bw + bo); + bprm->low_shift = bprm->hi_shift + bo; + return (BYTES_TO_BITS(t->size) < (bw + bo)) ? -EINVAL : 0; +} + /* String length checking wrapper */ static int parse_probe_arg(char *arg, struct trace_probe *tp, struct probe_arg *parg, int is_return) @@ -835,6 +935,8 @@ static int parse_probe_arg(char *arg, struct trace_probe *tp, parg->offset = tp->size; tp->size += parg->type->size; ret = __parse_probe_arg(arg, parg->type, &parg->fetch, is_return); + if (ret >= 0) + ret = __parse_bitfield_probe_arg(t, parg->type, &parg->fetch); if (ret >= 0) { parg->fetch_size.fn = get_fetch_size_function(parg->type, parg->fetch.fn); From fb7d0b3cefb80a105f7fd26bbc62e0cbf9192822 Mon Sep 17 00:00:00 2001 From: Kyle McMartin Date: Mon, 24 Jan 2011 11:13:04 -0500 Subject: [PATCH 182/706] perf tool: Fix gcc 4.6.0 issues GCC 4.6.0 in Fedora rawhide turned up some compile errors in tools/perf due to the -Werror=unused-but-set-variable flag. I've gone through and annotated some of the assignments that had side effects (i.e. the return value from a function) with the __used annotation, and in some cases, just removed unused code. In a few cases, we were assigning something useful, but not using it in later parts of the function. kyle@dreadnought:~/src% gcc --version gcc (GCC) 4.6.0 20110122 (Red Hat 4.6.0-0.3) Cc: Ingo Molnar LKML-Reference: <20110124161304.GK27353@bombadil.infradead.org> Signed-off-by: Kyle McMartin [ committer note: Fixed up the annotation fixes, as that code moved recently ] Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/bench/sched-pipe.c | 2 +- tools/perf/builtin-sched.c | 12 +++--------- tools/perf/builtin-top.c | 2 +- tools/perf/util/annotate.c | 3 --- tools/perf/util/header.c | 2 +- .../perf/util/scripting-engines/trace-event-python.c | 3 +-- tools/perf/util/symbol.c | 4 ++-- tools/perf/util/trace-event-parse.c | 2 +- tools/perf/util/ui/browsers/map.c | 2 +- 9 files changed, 11 insertions(+), 21 deletions(-) diff --git a/tools/perf/bench/sched-pipe.c b/tools/perf/bench/sched-pipe.c index d9ab3ce446ac..0c7454f8b8a9 100644 --- a/tools/perf/bench/sched-pipe.c +++ b/tools/perf/bench/sched-pipe.c @@ -55,7 +55,7 @@ int bench_sched_pipe(int argc, const char **argv, * discarding returned value of read(), write() * causes error in building environment for perf */ - int ret, wait_stat; + int __used ret, wait_stat; pid_t pid, retpid; argc = parse_options(argc, argv, options, diff --git a/tools/perf/builtin-sched.c b/tools/perf/builtin-sched.c index ae2621182927..a32f411faeac 100644 --- a/tools/perf/builtin-sched.c +++ b/tools/perf/builtin-sched.c @@ -369,11 +369,6 @@ static void process_sched_event(struct task_desc *this_task __used, struct sched_atom *atom) { int ret = 0; - u64 now; - long long delta; - - now = get_nsecs(); - delta = start_time + atom->timestamp - now; switch (atom->type) { case SCHED_EVENT_RUN: @@ -562,7 +557,7 @@ static void wait_for_tasks(void) static void run_one_test(void) { - u64 T0, T1, delta, avg_delta, fluct, std_dev; + u64 T0, T1, delta, avg_delta, fluct; T0 = get_nsecs(); wait_for_tasks(); @@ -578,7 +573,6 @@ static void run_one_test(void) else fluct = delta - avg_delta; sum_fluct += fluct; - std_dev = sum_fluct / nr_runs / sqrt(nr_runs); if (!run_avg) run_avg = delta; run_avg = (run_avg*9 + delta)/10; @@
-799,7 +793,7 @@ replay_switch_event(struct trace_switch_event *switch_event, u64 timestamp, struct thread *thread __used) { - struct task_desc *prev, *next; + struct task_desc *prev, __used *next; u64 timestamp0; s64 delta; @@ -1404,7 +1398,7 @@ map_switch_event(struct trace_switch_event *switch_event, u64 timestamp, struct thread *thread __used) { - struct thread *sched_out, *sched_in; + struct thread *sched_out __used, *sched_in; int new_shortname; u64 timestamp0; s64 delta; diff --git a/tools/perf/builtin-top.c b/tools/perf/builtin-top.c index 716118a3b3e4..b790673cb0aa 100644 --- a/tools/perf/builtin-top.c +++ b/tools/perf/builtin-top.c @@ -865,7 +865,7 @@ static int __cmd_top(void) { pthread_t thread; struct perf_evsel *first; - int ret; + int ret __used; /* * FIXME: perf_session__new should allow passing a O_MMAP, so that all this * mmap reading, etc is encapsulated in it. Use O_WRONLY for now. diff --git a/tools/perf/util/annotate.c b/tools/perf/util/annotate.c index 297337649c21..1012841835a3 100644 --- a/tools/perf/util/annotate.c +++ b/tools/perf/util/annotate.c @@ -236,7 +236,6 @@ int symbol__annotate(struct symbol *sym, struct map *map, char command[PATH_MAX * 2]; FILE *file; int err = 0; - u64 len; char symfs_filename[PATH_MAX]; if (filename) { @@ -281,8 +280,6 @@ fallback: filename, sym->name, map->unmap_ip(map, sym->start), map->unmap_ip(map, sym->end)); - len = sym->end - sym->start; - pr_debug("annotating [%p] %30s : [%p] %30s\n", dso, dso->long_name, sym, sym->name); diff --git a/tools/perf/util/header.c b/tools/perf/util/header.c index c0de5ec44145..72c124dc5781 100644 --- a/tools/perf/util/header.c +++ b/tools/perf/util/header.c @@ -1145,7 +1145,7 @@ int perf_event__synthesize_tracing_data(int fd, struct perf_evlist *evlist, { union perf_event ev; ssize_t size = 0, aligned_size = 0, padding; - int err = 0; + int err __used = 0; memset(&ev, 0, sizeof(ev)); diff --git a/tools/perf/util/scripting-engines/trace-event-python.c b/tools/perf/util/scripting-engines/trace-event-python.c index c6d99334bdfa..2040b8538527 100644 --- a/tools/perf/util/scripting-engines/trace-event-python.c +++ b/tools/perf/util/scripting-engines/trace-event-python.c @@ -248,8 +248,7 @@ static void python_process_event(int cpu, void *data, context = PyCObject_FromVoidPtr(scripting_context, NULL); PyTuple_SetItem(t, n++, PyString_FromString(handler_name)); - PyTuple_SetItem(t, n++, - PyCObject_FromVoidPtr(scripting_context, NULL)); + PyTuple_SetItem(t, n++, context); if (handler) { PyTuple_SetItem(t, n++, PyInt_FromLong(cpu)); diff --git a/tools/perf/util/symbol.c b/tools/perf/util/symbol.c index 7821d0e6866f..3e193f8e3061 100644 --- a/tools/perf/util/symbol.c +++ b/tools/perf/util/symbol.c @@ -1525,8 +1525,8 @@ int dso__load(struct dso *self, struct map *map, symbol_filter_t filter) symbol_conf.symfs, self->long_name); break; case DSO__ORIG_GUEST_KMODULE: - if (map->groups && map->groups->machine) - root_dir = map->groups->machine->root_dir; + if (map->groups && machine) + root_dir = machine->root_dir; else root_dir = ""; snprintf(name, size, "%s%s%s", symbol_conf.symfs, diff --git a/tools/perf/util/trace-event-parse.c b/tools/perf/util/trace-event-parse.c index 73a02223c629..d8e622dd738a 100644 --- a/tools/perf/util/trace-event-parse.c +++ b/tools/perf/util/trace-event-parse.c @@ -153,7 +153,7 @@ void parse_proc_kallsyms(char *file, unsigned int size __unused) char *next = NULL; char *addr_str; char ch; - int ret; + int ret __used; int i; line = strtok_r(file, "\n", &next); diff --git 
a/tools/perf/util/ui/browsers/map.c b/tools/perf/util/ui/browsers/map.c index e5158369106e..8462bffe20bc 100644 --- a/tools/perf/util/ui/browsers/map.c +++ b/tools/perf/util/ui/browsers/map.c @@ -41,7 +41,7 @@ static int ui_entry__read(const char *title, char *bf, size_t size, int width) out_free_form: newtPopWindow(); newtFormDestroy(form); - return 0; + return err; } struct map_browser { From a2221796256ea7b236cec6bf027c1c1de5b8ccd7 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Mon, 7 Feb 2011 15:32:18 +0100 Subject: [PATCH 183/706] perf annotate: Fix build error A small fix for when NO_NEWT_SUPPORT is defined. Add a missing "struct" to the function prototype. Cc: Frederic Weisbecker Cc: H. Peter Anvin Cc: Ingo Molnar Cc: Mike Galbraith Cc: Paul Mackerras Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Thomas Gleixner Cc: Tom Zanussi LKML-Reference: <20110207143218.GA31197@kryptos.osrc.amd.com> Signed-off-by: Borislav Petkov Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/annotate.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/perf/util/annotate.h b/tools/perf/util/annotate.h index b1253aadf340..bc08b36a713a 100644 --- a/tools/perf/util/annotate.h +++ b/tools/perf/util/annotate.h @@ -82,7 +82,7 @@ int symbol__tty_annotate(struct symbol *sym, struct map *map, int evidx, int max_lines); #ifdef NO_NEWT_SUPPORT -static inline int symbol__tui_annotate(symbol *sym __used, +static inline int symbol__tui_annotate(struct symbol *sym __used, struct map *map __used, int evidx __used) { return 0; From 124bb83cd7de4d851af7595650233fb9e9279d5d Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Fri, 4 Feb 2011 21:52:11 +0900 Subject: [PATCH 184/706] perf probe: Add bitfield member support Add support for accessing bitfield members in probe arguments.
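The spec built here ("b<bit-width>@<bit-offset>/<container-size>", via the "b%d@%d/%zd" format in the diff) is consumed by the kprobe-tracer bitfield fetch function added earlier in this series. As a sanity check on the shift arithmetic, a stand-alone user-space sketch of how such a spec is evaluated (fetch_bitfield32() is an illustrative helper, not perf or kernel code; the bit offset is counted from the least significant bit in this sketch):

	#include <stdint.h>
	#include <stdio.h>

	/*
	 * Extract a bw-bit field at bit offset bo out of a 32-bit
	 * container, using the same double shift as the tracer:
	 * shift left to drop the high bits, then right to drop the
	 * low bits.
	 */
	static uint32_t fetch_bitfield32(uint32_t buf, unsigned int bw,
					 unsigned int bo)
	{
		unsigned int hi_shift = 32 - (bw + bo);
		unsigned int low_shift = hi_shift + bo;

		buf <<= hi_shift;
		buf >>= low_shift;
		return buf;
	}

	int main(void)
	{
		/* "b2@4/32": a 2-bit field at bit offset 4 in a 32-bit word. */
		uint32_t word = 0x30;	/* bits 4 and 5 set */

		printf("field = %u\n",
		       (unsigned int)fetch_bitfield32(word, 2, 4)); /* 3 */
		return 0;
	}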
Suggested-by: Arnaldo Carvalho de Melo Cc: 2nddept-manager@sdl.hitachi.co.jp Cc: Ingo Molnar Cc: Paul Mackerras Cc: Peter Zijlstra Cc: Srikar Dronamraju Cc: Steven Rostedt LKML-Reference: <20110204125211.9507.60265.stgit@ltc236.sdl.hitachi.co.jp> Signed-off-by: Masami Hiramatsu [ committer note: Fixed up '%lu' use for return of BYTES_TO_BITS ('%zd') ] Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/probe-finder.c | 95 ++++++++++++++++++++++++---------- 1 file changed, 68 insertions(+), 27 deletions(-) diff --git a/tools/perf/util/probe-finder.c b/tools/perf/util/probe-finder.c index 46addfb3183e..fe461f6559f1 100644 --- a/tools/perf/util/probe-finder.c +++ b/tools/perf/util/probe-finder.c @@ -33,6 +33,7 @@ #include #include +#include #include "event.h" #include "debug.h" #include "util.h" @@ -333,13 +334,23 @@ static Dwarf_Die *die_get_real_type(Dwarf_Die *vr_die, Dwarf_Die *die_mem) return vr_die; } -static bool die_is_signed_type(Dwarf_Die *tp_die) +static int die_get_attr_udata(Dwarf_Die *tp_die, unsigned int attr_name, + Dwarf_Word *result) { Dwarf_Attribute attr; + + if (dwarf_attr(tp_die, attr_name, &attr) == NULL || + dwarf_formudata(&attr, result) != 0) + return -ENOENT; + + return 0; +} + +static bool die_is_signed_type(Dwarf_Die *tp_die) +{ Dwarf_Word ret; - if (dwarf_attr(tp_die, DW_AT_encoding, &attr) == NULL || - dwarf_formudata(&attr, &ret) != 0) + if (die_get_attr_udata(tp_die, DW_AT_encoding, &ret)) return false; return (ret == DW_ATE_signed_char || ret == DW_ATE_signed || @@ -348,11 +359,29 @@ static bool die_is_signed_type(Dwarf_Die *tp_die) static int die_get_byte_size(Dwarf_Die *tp_die) { - Dwarf_Attribute attr; Dwarf_Word ret; - if (dwarf_attr(tp_die, DW_AT_byte_size, &attr) == NULL || - dwarf_formudata(&attr, &ret) != 0) + if (die_get_attr_udata(tp_die, DW_AT_byte_size, &ret)) + return 0; + + return (int)ret; +} + +static int die_get_bit_size(Dwarf_Die *tp_die) +{ + Dwarf_Word ret; + + if (die_get_attr_udata(tp_die, DW_AT_bit_size, &ret)) + return 0; + + return (int)ret; +} + +static int die_get_bit_offset(Dwarf_Die *tp_die) +{ + Dwarf_Word ret; + + if (die_get_attr_udata(tp_die, DW_AT_bit_offset, &ret)) return 0; return (int)ret; @@ -827,6 +856,8 @@ static_var: return 0; } +#define BYTES_TO_BITS(nb) ((nb) * BITS_PER_LONG / sizeof(long)) + static int convert_variable_type(Dwarf_Die *vr_die, struct probe_trace_arg *tvar, const char *cast) @@ -843,6 +874,14 @@ static int convert_variable_type(Dwarf_Die *vr_die, return (tvar->type == NULL) ? -ENOMEM : 0; } + if (die_get_bit_size(vr_die) != 0) { + /* This is a bitfield */ + ret = snprintf(buf, 16, "b%d@%d/%zd", die_get_bit_size(vr_die), + die_get_bit_offset(vr_die), + BYTES_TO_BITS(die_get_byte_size(vr_die))); + goto formatted; + } + if (die_get_real_type(vr_die, &type) == NULL) { pr_warning("Failed to get a type information of %s.\n", dwarf_diename(vr_die)); @@ -887,29 +926,31 @@ static int convert_variable_type(Dwarf_Die *vr_die, return (tvar->type == NULL) ? -ENOMEM : 0; } - ret = die_get_byte_size(&type) * 8; - if (ret) { - /* Check the bitwidth */ - if (ret > MAX_BASIC_TYPE_BITS) { - pr_info("%s exceeds max-bitwidth." - " Cut down to %d bits.\n", - dwarf_diename(&type), MAX_BASIC_TYPE_BITS); - ret = MAX_BASIC_TYPE_BITS; - } + ret = BYTES_TO_BITS(die_get_byte_size(&type)); + if (!ret) + /* No size ... try to use default type */ + return 0; - ret = snprintf(buf, 16, "%c%d", - die_is_signed_type(&type) ? 
's' : 'u', ret); - if (ret < 0 || ret >= 16) { - if (ret >= 16) - ret = -E2BIG; - pr_warning("Failed to convert variable type: %s\n", - strerror(-ret)); - return ret; - } - tvar->type = strdup(buf); - if (tvar->type == NULL) - return -ENOMEM; + /* Check the bitwidth */ + if (ret > MAX_BASIC_TYPE_BITS) { + pr_info("%s exceeds max-bitwidth. Cut down to %d bits.\n", + dwarf_diename(&type), MAX_BASIC_TYPE_BITS); + ret = MAX_BASIC_TYPE_BITS; } + ret = snprintf(buf, 16, "%c%d", + die_is_signed_type(&type) ? 's' : 'u', ret); + +formatted: + if (ret < 0 || ret >= 16) { + if (ret >= 16) + ret = -E2BIG; + pr_warning("Failed to convert variable type: %s\n", + strerror(-ret)); + return ret; + } + tvar->type = strdup(buf); + if (tvar->type == NULL) + return -ENOMEM; return 0; } From 6d54057d76e25c91165cda0e6e007f1811faa2be Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 27 Jan 2011 22:33:26 -0500 Subject: [PATCH 185/706] tracing/filter: Have no filter return a match The n_preds field of a filter can change at any time, and can even become zero just as the filter is about to be applied to an event. If it is zero on entering the filter, return 1, telling the caller that the event matches and should be traced. Also read the count into a local variable with ACCESS_ONCE() so that it stays consistent within the function. Cc: Tom Zanussi Signed-off-by: Steven Rostedt --- kernel/trace/trace_events_filter.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index 36d40104b17f..7275f0310ed8 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -383,9 +383,14 @@ int filter_match_preds(struct event_filter *filter, void *rec) int match, top = 0, val1 = 0, val2 = 0; int stack[MAX_FILTER_PRED]; struct filter_pred *pred; + int n_preds = ACCESS_ONCE(filter->n_preds); int i; - for (i = 0; i < filter->n_preds; i++) { + /* no filter is considered a match */ + if (!n_preds) + return 1; + + for (i = 0; i < n_preds; i++) { pred = filter->preds[i]; if (!pred->pop_n) { match = pred->fn(pred, rec, val1, val2); From 58d9a597c4275d830a819625e7d437cd6fb23fa5 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 27 Jan 2011 22:37:09 -0500 Subject: [PATCH 186/706] tracing/filter: Move OR and AND logic out of fn() method The OR and AND ops act differently from the other ops, as they are the only ones that take other ops as their arguments. These ops also change the logic of filter_match_preds(). By removing the OR and AND fn's we can also remove the val1 and val2 values that are passed to all the other fn's but left unused.
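With OP_AND and OP_OR handled inline, filter_match_preds() becomes a plain postfix evaluator: leaf predicates push their match result, and each AND/OR pops two values and pushes the combination. A minimal user-space sketch of that loop, with a simplified pred type standing in for struct filter_pred:

	#include <stdio.h>

	enum op { OP_LEAF, OP_AND, OP_OR };

	struct pred {
		enum op op;
		int value;	/* result a leaf predicate's fn() would return */
	};

	/* Evaluate a postfix sequence of preds against one record. */
	static int match_preds(const struct pred *preds, int n_preds)
	{
		int stack[32];	/* stands in for MAX_FILTER_PRED */
		int top = 0, val1, val2, match, i;

		for (i = 0; i < n_preds; i++) {
			if (preds[i].op == OP_LEAF) {
				stack[top++] = preds[i].value;
				continue;
			}
			val1 = stack[--top];
			val2 = stack[--top];
			match = (preds[i].op == OP_AND) ? (val1 && val2)
							: (val1 || val2);
			stack[top++] = match;
		}
		return stack[--top];
	}

	int main(void)
	{
		/* (1 && 0) || 1 in postfix: 1 0 && 1 || */
		struct pred e[] = {
			{ OP_LEAF, 1 }, { OP_LEAF, 0 }, { OP_AND, 0 },
			{ OP_LEAF, 1 }, { OP_OR, 0 },
		};

		printf("match = %d\n", match_preds(e, 5));	/* prints 1 */
		return 0;
	}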
Cc: Tom Zanussi Signed-off-by: Steven Rostedt --- kernel/trace/trace.h | 3 +- kernel/trace/trace_events_filter.c | 51 +++++++++++------------------- 2 files changed, 20 insertions(+), 34 deletions(-) diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 9021f8c0c0c3..1597bc0749c1 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -677,8 +677,7 @@ struct event_subsystem { struct filter_pred; struct regex; -typedef int (*filter_pred_fn_t) (struct filter_pred *pred, void *event, - int val1, int val2); +typedef int (*filter_pred_fn_t) (struct filter_pred *pred, void *event); typedef int (*regex_match_func)(char *str, struct regex *r, int len); diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index 7275f0310ed8..5d719b340a2b 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -124,8 +124,7 @@ struct filter_parse_state { }; #define DEFINE_COMPARISON_PRED(type) \ -static int filter_pred_##type(struct filter_pred *pred, void *event, \ - int val1, int val2) \ +static int filter_pred_##type(struct filter_pred *pred, void *event) \ { \ type *addr = (type *)(event + pred->offset); \ type val = (type)pred->val; \ @@ -152,8 +151,7 @@ static int filter_pred_##type(struct filter_pred *pred, void *event, \ } #define DEFINE_EQUALITY_PRED(size) \ -static int filter_pred_##size(struct filter_pred *pred, void *event, \ - int val1, int val2) \ +static int filter_pred_##size(struct filter_pred *pred, void *event) \ { \ u##size *addr = (u##size *)(event + pred->offset); \ u##size val = (u##size)pred->val; \ @@ -178,23 +176,8 @@ DEFINE_EQUALITY_PRED(32); DEFINE_EQUALITY_PRED(16); DEFINE_EQUALITY_PRED(8); -static int filter_pred_and(struct filter_pred *pred __attribute((unused)), - void *event __attribute((unused)), - int val1, int val2) -{ - return val1 && val2; -} - -static int filter_pred_or(struct filter_pred *pred __attribute((unused)), - void *event __attribute((unused)), - int val1, int val2) -{ - return val1 || val2; -} - /* Filter predicate for fixed sized arrays of characters */ -static int filter_pred_string(struct filter_pred *pred, void *event, - int val1, int val2) +static int filter_pred_string(struct filter_pred *pred, void *event) { char *addr = (char *)(event + pred->offset); int cmp, match; @@ -207,8 +190,7 @@ static int filter_pred_string(struct filter_pred *pred, void *event, } /* Filter predicate for char * pointers */ -static int filter_pred_pchar(struct filter_pred *pred, void *event, - int val1, int val2) +static int filter_pred_pchar(struct filter_pred *pred, void *event) { char **addr = (char **)(event + pred->offset); int cmp, match; @@ -231,8 +213,7 @@ static int filter_pred_pchar(struct filter_pred *pred, void *event, * and add it to the address of the entry, and at last we have * the address of the string. 
*/ -static int filter_pred_strloc(struct filter_pred *pred, void *event, - int val1, int val2) +static int filter_pred_strloc(struct filter_pred *pred, void *event) { u32 str_item = *(u32 *)(event + pred->offset); int str_loc = str_item & 0xffff; @@ -247,8 +228,7 @@ static int filter_pred_strloc(struct filter_pred *pred, void *event, return match; } -static int filter_pred_none(struct filter_pred *pred, void *event, - int val1, int val2) +static int filter_pred_none(struct filter_pred *pred, void *event) { return 0; } @@ -380,7 +360,7 @@ static void filter_build_regex(struct filter_pred *pred) /* return 1 if event matches, 0 otherwise (discard) */ int filter_match_preds(struct event_filter *filter, void *rec) { - int match, top = 0, val1 = 0, val2 = 0; + int match = -1, top = 0, val1 = 0, val2 = 0; int stack[MAX_FILTER_PRED]; struct filter_pred *pred; int n_preds = ACCESS_ONCE(filter->n_preds); @@ -393,7 +373,7 @@ int filter_match_preds(struct event_filter *filter, void *rec) for (i = 0; i < n_preds; i++) { pred = filter->preds[i]; if (!pred->pop_n) { - match = pred->fn(pred, rec, val1, val2); + match = pred->fn(pred, rec); stack[top++] = match; continue; } @@ -403,7 +383,16 @@ } val1 = stack[--top]; val2 = stack[--top]; - match = pred->fn(pred, rec, val1, val2); + switch (pred->op) { + case OP_AND: + match = val1 && val2; + break; + case OP_OR: + match = val1 || val2; + break; + default: + WARN_ONCE(1, "filter op is not AND or OR"); + } stack[top++] = match; } @@ -775,15 +764,13 @@ static int filter_add_pred(struct filter_parse_state *ps, unsigned long long val; int ret; - pred->fn = filter_pred_none; + fn = pred->fn = filter_pred_none; if (pred->op == OP_AND) { pred->pop_n = 2; - fn = filter_pred_and; goto add_pred_fn; } else if (pred->op == OP_OR) { pred->pop_n = 2; - fn = filter_pred_or; goto add_pred_fn; } From c9c53ca03d6f97fdd9832d5ed3f15b30ee5cdb86 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 27 Jan 2011 22:42:43 -0500 Subject: [PATCH 187/706] tracing/filter: Dynamically allocate preds For every filter that is made, we create predicates to hold every operation within the filter. We have a max of 32 predicates that we can hold. Currently, we allocate all 32 even if we only need to use one. Part of the reason we do this is that the filter can be used at any moment by any event. Fortunately, the filter is only used with preemption disabled. By resetting the count of preds used ("n_preds") to zero and then performing a synchronize_sched(), we can safely free the preds and reallocate a new array.
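A user-space sketch of the resize protocol described above; struct filter is a simplified stand-in for struct event_filter, and synchronize_readers() stands in for synchronize_sched(). Readers walk the preds with preemption disabled, so once the grace period has passed none of them can still hold a pointer into the old array:

	#include <stdlib.h>

	struct pred { int op; };

	struct filter {
		int n_preds;		/* number in use; readers stop here */
		int a_preds;		/* number allocated */
		struct pred *preds;
	};

	/* Stand-in for synchronize_sched(): wait out all current readers. */
	static void synchronize_readers(void) { }

	static int resize_preds(struct filter *f, int n_preds)
	{
		if (f->a_preds < n_preds) {
			f->n_preds = 0;		/* readers now see "no filter" */
			synchronize_readers();	/* old readers are gone ... */
			free(f->preds);		/* ... so freeing is safe */
			f->preds = calloc(n_preds, sizeof(*f->preds));
			if (!f->preds)
				return -1;
			f->a_preds = n_preds;
		}
		return 0;
	}

	int main(void)
	{
		struct filter f = { 0, 0, NULL };

		return resize_preds(&f, 8);
	}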
Cc: Tom Zanussi Signed-off-by: Steven Rostedt --- kernel/trace/trace.h | 3 +- kernel/trace/trace_events_filter.c | 145 ++++++++++++++++++++++------- 2 files changed, 111 insertions(+), 37 deletions(-) diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 1597bc0749c1..441fc1bc85d6 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -661,7 +661,8 @@ struct ftrace_event_field { }; struct event_filter { - int n_preds; + int n_preds; /* Number assigned */ + int a_preds; /* allocated */ struct filter_pred **preds; char *filter_string; }; diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index 5d719b340a2b..aac6a6183e6a 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -362,6 +362,7 @@ int filter_match_preds(struct event_filter *filter, void *rec) { int match = -1, top = 0, val1 = 0, val2 = 0; int stack[MAX_FILTER_PRED]; + struct filter_pred **preds; struct filter_pred *pred; int n_preds = ACCESS_ONCE(filter->n_preds); int i; @@ -370,8 +371,13 @@ int filter_match_preds(struct event_filter *filter, void *rec) if (!n_preds) return 1; + /* + * n_preds and filter->preds is protect with preemption disabled. + */ + preds = rcu_dereference_sched(filter->preds); + for (i = 0; i < n_preds; i++) { - pred = filter->preds[i]; + pred = preds[i]; if (!pred->pop_n) { match = pred->fn(pred, rec); stack[top++] = match; @@ -548,46 +554,55 @@ static int filter_set_pred(struct filter_pred *dest, return 0; } +static void __free_preds(struct event_filter *filter) +{ + int i; + + if (filter->preds) { + for (i = 0; i < filter->a_preds; i++) { + if (filter->preds[i]) + filter_free_pred(filter->preds[i]); + } + kfree(filter->preds); + filter->preds = NULL; + } + filter->a_preds = 0; + filter->n_preds = 0; +} + static void filter_disable_preds(struct ftrace_event_call *call) { struct event_filter *filter = call->filter; int i; call->flags &= ~TRACE_EVENT_FL_FILTERED; + if (filter->preds) { + for (i = 0; i < filter->n_preds; i++) + filter->preds[i]->fn = filter_pred_none; + } filter->n_preds = 0; - - for (i = 0; i < MAX_FILTER_PRED; i++) - filter->preds[i]->fn = filter_pred_none; } -static void __free_preds(struct event_filter *filter) +static void __free_filter(struct event_filter *filter) { - int i; - if (!filter) return; - for (i = 0; i < MAX_FILTER_PRED; i++) { - if (filter->preds[i]) - filter_free_pred(filter->preds[i]); - } - kfree(filter->preds); + __free_preds(filter); kfree(filter->filter_string); kfree(filter); } void destroy_preds(struct ftrace_event_call *call) { - __free_preds(call->filter); + __free_filter(call->filter); call->filter = NULL; call->flags &= ~TRACE_EVENT_FL_FILTERED; } -static struct event_filter *__alloc_preds(void) +static struct event_filter *__alloc_filter(void) { struct event_filter *filter; - struct filter_pred *pred; - int i; filter = kzalloc(sizeof(*filter), GFP_KERNEL); if (!filter) @@ -595,32 +610,63 @@ static struct event_filter *__alloc_preds(void) filter->n_preds = 0; - filter->preds = kzalloc(MAX_FILTER_PRED * sizeof(pred), GFP_KERNEL); - if (!filter->preds) - goto oom; + return filter; +} - for (i = 0; i < MAX_FILTER_PRED; i++) { - pred = kzalloc(sizeof(*pred), GFP_KERNEL); +static int __alloc_preds(struct event_filter *filter, int n_preds) +{ + struct filter_pred *pred; + int i; + + if (filter->preds) { + if (filter->a_preds < n_preds) { + /* We need to reallocate */ + filter->n_preds = 0; + /* + * It is possible that the filter is currently + * being used. 
We need to zero out the number + * of preds, wait on preemption and then free + * the preds. + */ + synchronize_sched(); + __free_preds(filter); + } + } + + if (!filter->preds) { + filter->preds = + kzalloc(sizeof(*filter->preds) * n_preds, GFP_KERNEL); + filter->a_preds = n_preds; + } + if (!filter->preds) + return -ENOMEM; + + if (WARN_ON(filter->a_preds < n_preds)) + return -EINVAL; + + for (i = 0; i < n_preds; i++) { + pred = filter->preds[i]; + if (!pred) + pred = kzalloc(sizeof(*pred), GFP_KERNEL); if (!pred) goto oom; pred->fn = filter_pred_none; filter->preds[i] = pred; } - return filter; - -oom: + return 0; + oom: __free_preds(filter); - return ERR_PTR(-ENOMEM); + return -ENOMEM; } -static int init_preds(struct ftrace_event_call *call) +static int init_filter(struct ftrace_event_call *call) { if (call->filter) return 0; call->flags &= ~TRACE_EVENT_FL_FILTERED; - call->filter = __alloc_preds(); + call->filter = __alloc_filter(); if (IS_ERR(call->filter)) return PTR_ERR(call->filter); @@ -636,7 +682,7 @@ static int init_subsystem_preds(struct event_subsystem *system) if (strcmp(call->class->system, system->name) != 0) continue; - err = init_preds(call); + err = init_filter(call); if (err) return err; } @@ -665,7 +711,7 @@ static int filter_add_pred_fn(struct filter_parse_state *ps, { int idx, err; - if (filter->n_preds == MAX_FILTER_PRED) { + if (WARN_ON(filter->n_preds == filter->a_preds)) { parse_error(ps, FILT_ERR_TOO_MANY_PREDS, 0); return -ENOSPC; } @@ -1179,6 +1225,20 @@ static int check_preds(struct filter_parse_state *ps) return 0; } +static int count_preds(struct filter_parse_state *ps) +{ + struct postfix_elt *elt; + int n_preds = 0; + + list_for_each_entry(elt, &ps->postfix, list) { + if (elt->op == OP_NONE) + continue; + n_preds++; + } + + return n_preds; +} + static int replace_preds(struct ftrace_event_call *call, struct event_filter *filter, struct filter_parse_state *ps, @@ -1191,10 +1251,23 @@ static int replace_preds(struct ftrace_event_call *call, int err; int n_preds = 0; + n_preds = count_preds(ps); + if (n_preds >= MAX_FILTER_PRED) { + parse_error(ps, FILT_ERR_TOO_MANY_PREDS, 0); + return -ENOSPC; + } + err = check_preds(ps); if (err) return err; + if (!dry_run) { + err = __alloc_preds(filter, n_preds); + if (err) + return err; + } + + n_preds = 0; list_for_each_entry(elt, &ps->postfix, list) { if (elt->op == OP_NONE) { if (!operand1) @@ -1208,7 +1281,7 @@ static int replace_preds(struct ftrace_event_call *call, continue; } - if (n_preds++ == MAX_FILTER_PRED) { + if (WARN_ON(n_preds++ == MAX_FILTER_PRED)) { parse_error(ps, FILT_ERR_TOO_MANY_PREDS, 0); return -ENOSPC; } @@ -1283,7 +1356,7 @@ int apply_event_filter(struct ftrace_event_call *call, char *filter_string) mutex_lock(&event_mutex); - err = init_preds(call); + err = init_filter(call); if (err) goto out_unlock; @@ -1376,7 +1449,7 @@ void ftrace_profile_free_filter(struct perf_event *event) struct event_filter *filter = event->filter; event->filter = NULL; - __free_preds(filter); + __free_filter(filter); } int ftrace_profile_set_filter(struct perf_event *event, int event_id, @@ -1402,7 +1475,7 @@ int ftrace_profile_set_filter(struct perf_event *event, int event_id, if (event->filter) goto out_unlock; - filter = __alloc_preds(); + filter = __alloc_filter(); if (IS_ERR(filter)) { err = PTR_ERR(filter); goto out_unlock; @@ -1411,7 +1484,7 @@ int ftrace_profile_set_filter(struct perf_event *event, int event_id, err = -ENOMEM; ps = kzalloc(sizeof(*ps), GFP_KERNEL); if (!ps) - goto free_preds; + goto 
free_filter; parse_init(ps, filter_ops, filter_str); err = filter_parse(ps); @@ -1427,9 +1500,9 @@ free_ps: postfix_clear(ps); kfree(ps); -free_preds: +free_filter: if (err) - __free_preds(filter); + __free_filter(filter); out_unlock: mutex_unlock(&event_mutex); From 0fc3ca9a10a61a77f18710fb708b41fd99c79a56 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 27 Jan 2011 22:46:46 -0500 Subject: [PATCH 188/706] tracing/filter: Call synchronize_sched() just once for system filters By separating out the resetting of filter->n_preds to zero from the reallocation of the filter's preds, we can reset groups of filters first, call synchronize_sched() just once, and then reallocate each of the filters in the system group. Cc: Tom Zanussi Signed-off-by: Steven Rostedt --- kernel/trace/trace_events_filter.c | 80 +++++++++++++++++++++++------- 1 file changed, 63 insertions(+), 17 deletions(-) diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index aac6a6183e6a..8f00a11ce778 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -570,17 +570,28 @@ static void __free_preds(struct event_filter *filter) filter->n_preds = 0; } +static void reset_preds(struct event_filter *filter) +{ + struct filter_pred *pred; + int n_preds = filter->n_preds; + int i; + + filter->n_preds = 0; + if (!filter->preds) + return; + + for (i = 0; i < n_preds; i++) { + pred = filter->preds[i]; + pred->fn = filter_pred_none; + } +} + static void filter_disable_preds(struct ftrace_event_call *call) { struct event_filter *filter = call->filter; - int i; call->flags &= ~TRACE_EVENT_FL_FILTERED; - if (filter->preds) { - for (i = 0; i < filter->n_preds; i++) - filter->preds[i]->fn = filter_pred_none; - } - filter->n_preds = 0; + reset_preds(filter); } static void __free_filter(struct event_filter *filter) @@ -620,15 +631,17 @@ static int __alloc_preds(struct event_filter *filter, int n_preds) if (filter->preds) { if (filter->a_preds < n_preds) { - /* We need to reallocate */ - filter->n_preds = 0; /* - * It is possible that the filter is currently - * being used. We need to zero out the number - * of preds, wait on preemption and then free - * the preds. + * We need to reallocate. + * We should have already have zeroed out + * the pred count and called synchronized_sched() + * to make sure no one is using the preds. */ + if (WARN_ON_ONCE(filter->n_preds)) { + /* We need to reset it now */ + filter->n_preds = 0; + synchronize_sched(); + } __free_preds(filter); } } @@ -1328,6 +1341,30 @@ static int replace_system_preds(struct event_subsystem *system, /* try to see if the filter can be applied */ err = replace_preds(call, filter, ps, filter_string, true); if (err) + goto fail; + } + + /* set all filter pred counts to zero */ + list_for_each_entry(call, &ftrace_events, list) { + struct event_filter *filter = call->filter; + + if (strcmp(call->class->system, system->name) != 0) + continue; + + reset_preds(filter); + } + + /* + * Since some of the preds may be used under preemption + * we need to wait for them to finish before we may + * reallocate them.
+ */ + synchronize_sched(); + + list_for_each_entry(call, &ftrace_events, list) { + struct event_filter *filter = call->filter; + + if (strcmp(call->class->system, system->name) != 0) continue; /* really apply the filter */ @@ -1342,11 +1379,13 @@ fail = false; } - if (fail) { - parse_error(ps, FILT_ERR_BAD_SUBSYS_FILTER, 0); - return -EINVAL; - } + if (fail) + goto fail; + return 0; + fail: + parse_error(ps, FILT_ERR_BAD_SUBSYS_FILTER, 0); + return -EINVAL; } int apply_event_filter(struct ftrace_event_call *call, char *filter_string) @@ -1381,6 +1420,13 @@ int apply_event_filter(struct ftrace_event_call *call, char *filter_string) goto out; } + /* + * Make sure all the pred counts are zero so that + * no task is using it when we reallocate the preds array. + */ + reset_preds(call->filter); + synchronize_sched(); + err = replace_preds(call, call->filter, ps, filter_string, false); if (err) append_filter_err(ps, call->filter); From 74e9e58c350a24139e268dd6857bbaa55c5aafcf Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 27 Jan 2011 22:49:48 -0500 Subject: [PATCH 189/706] tracing/filter: Allocate the preds in an array Currently we allocate an array of pointers to filter_preds, and then allocate a separate filter_pred for each item in the array. This adds slight overhead in the filters as it needs to dereference twice to get to the op condition. Allocating the preds themselves in a single array removes a dereference and also helps the cache footprint. Cc: Tom Zanussi Signed-off-by: Steven Rostedt --- kernel/trace/trace.h | 2 +- kernel/trace/trace_events_filter.c | 31 +++++++++--------------------- 2 files changed, 10 insertions(+), 23 deletions(-) diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 441fc1bc85d6..254d04a84ec3 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -663,7 +663,7 @@ struct ftrace_event_field { struct event_filter { int n_preds; /* Number assigned */ int a_preds; /* allocated */ - struct filter_pred **preds; + struct filter_pred *preds; char *filter_string; }; diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index 8f00a11ce778..b6c910642a1e 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -362,7 +362,7 @@ int filter_match_preds(struct event_filter *filter, void *rec) { int match = -1, top = 0, val1 = 0, val2 = 0; int stack[MAX_FILTER_PRED]; - struct filter_pred **preds; + struct filter_pred *preds; struct filter_pred *pred; int n_preds = ACCESS_ONCE(filter->n_preds); int i; @@ -377,7 +377,7 @@ preds = rcu_dereference_sched(filter->preds); for (i = 0; i < n_preds; i++) { - pred = preds[i]; + pred = &preds[i]; if (!pred->pop_n) { match = pred->fn(pred, rec); stack[top++] = match; @@ -559,10 +559,8 @@ static void __free_preds(struct event_filter *filter) int i; if (filter->preds) { - for (i = 0; i < filter->a_preds; i++) { - if (filter->preds[i]) - filter_free_pred(filter->preds[i]); - } + for (i = 0; i < filter->a_preds; i++) + kfree(filter->preds[i].field_name); kfree(filter->preds); filter->preds = NULL; } @@ -572,7 +570,6 @@ static void reset_preds(struct event_filter *filter) { - struct filter_pred *pred; int n_preds = filter->n_preds; int i; @@ -580,10 +577,8 @@ static void reset_preds(struct event_filter *filter) if (!filter->preds) return; - for (i = 0; i < n_preds; i++) {
- pred = filter->preds[i]; - pred->fn = filter_pred_none; - } + for (i = 0; i < n_preds; i++) + filter->preds[i].fn = filter_pred_none; } static void filter_disable_preds(struct ftrace_event_call *call) @@ -658,19 +653,11 @@ static int __alloc_preds(struct event_filter *filter, int n_preds) return -EINVAL; for (i = 0; i < n_preds; i++) { - pred = filter->preds[i]; - if (!pred) - pred = kzalloc(sizeof(*pred), GFP_KERNEL); - if (!pred) - goto oom; + pred = &filter->preds[i]; pred->fn = filter_pred_none; - filter->preds[i] = pred; } return 0; - oom: - __free_preds(filter); - return -ENOMEM; } static int init_filter(struct ftrace_event_call *call) @@ -730,8 +717,8 @@ static int filter_add_pred_fn(struct filter_parse_state *ps, } idx = filter->n_preds; - filter_clear_pred(filter->preds[idx]); - err = filter_set_pred(filter->preds[idx], pred, fn); + filter_clear_pred(&filter->preds[idx]); + err = filter_set_pred(&filter->preds[idx], pred, fn); if (err) return err; From f76690afd05e3e163149310bdcd30234f93b3a7a Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 27 Jan 2011 22:53:06 -0500 Subject: [PATCH 190/706] tracing/filter: Free pred array on disabling of filter When a filter is disabled, free the preds. Cc: Tom Zanussi Signed-off-by: Steven Rostedt --- kernel/trace/trace_events_filter.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index b6c910642a1e..2f5458e244a3 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -1388,6 +1388,10 @@ int apply_event_filter(struct ftrace_event_call *call, char *filter_string) if (!strcmp(strstrip(filter_string), "0")) { filter_disable_preds(call); + reset_preds(call->filter); + /* Make sure the filter is not being used */ + synchronize_sched(); + __free_preds(call->filter); remove_filter_string(call->filter); goto out_unlock; } From 61e9dea20e1ada886cc49a9ec6fe3c6ac0de7324 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 27 Jan 2011 22:54:33 -0500 Subject: [PATCH 191/706] tracing/filter: Use a tree instead of stack for filter_match_preds() Currently the filter_match_preds() requires a stack to push and pop the preds to determine if the filter matches the record or not. This has two drawbacks: 1) It requires a stack to store state information. As this is done in fast paths we can't allocate the storage for this stack, and we can't use a global as it must be re-entrant. The stack is stored on the kernel stack and this greatly limits how many preds we may allow. 2) All conditions are calculated even when a short circuit exists. a || b will always calculate a and b even though a was determined to be true. Using a tree we can walk a constant structure that will save the state as we go. The algorithm is simply: pred = root; do { switch (move) { case MOVE_DOWN: if (OR or AND) { pred = left; continue; } if (pred == root) break; match = pred->fn(); pred = pred->parent; move = left child ? MOVE_UP_FROM_LEFT : MOVE_UP_FROM_RIGHT; continue; case MOVE_UP_FROM_LEFT: /* Only OR or AND can be a parent */ if (match && OR || !match && AND) { /* short circuit */ if (pred == root) break; pred = pred->parent; move = left child ? MOVE_UP_FROM_LEFT : MOVE_UP_FROM_RIGHT; continue; } pred = pred->right; move = MOVE_DOWN; continue; case MOVE_UP_FROM_RIGHT: if (pred == root) break; pred = pred->parent; move = left child ? 
MOVE_UP_FROM_LEFT : MOVE_UP_FROM_RIGHT; continue; } done = 1; } while (!done); This way there's no strict limit to how many preds we allow and it also will short circuit the logical operations when possible. Cc: Tom Zanussi Signed-off-by: Steven Rostedt --- kernel/trace/trace.h | 9 +- kernel/trace/trace_events_filter.c | 231 +++++++++++++++++++++++------ 2 files changed, 194 insertions(+), 46 deletions(-) diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 254d04a84ec3..bba34a72c780 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -664,6 +664,7 @@ struct event_filter { int n_preds; /* Number assigned */ int a_preds; /* allocated */ struct filter_pred *preds; + struct filter_pred *root; char *filter_string; }; @@ -675,6 +676,9 @@ struct event_subsystem { int nr_events; }; +#define FILTER_PRED_INVALID ((unsigned short)-1) +#define FILTER_PRED_IS_RIGHT (1 << 15) + struct filter_pred; struct regex; @@ -704,7 +708,10 @@ struct filter_pred { int offset; int not; int op; - int pop_n; + unsigned short index; + unsigned short parent; + unsigned short left; + unsigned short right; }; extern struct list_head ftrace_common_fields; diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index 2f5458e244a3..10390491b6d0 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -123,6 +123,11 @@ struct filter_parse_state { } operand; }; +struct pred_stack { + struct filter_pred **preds; + int index; +}; + #define DEFINE_COMPARISON_PRED(type) \ static int filter_pred_##type(struct filter_pred *pred, void *event) \ { \ @@ -357,52 +362,95 @@ static void filter_build_regex(struct filter_pred *pred) pred->not ^= not; } +enum move_type { + MOVE_DOWN, + MOVE_UP_FROM_LEFT, + MOVE_UP_FROM_RIGHT +}; + +static struct filter_pred * +get_pred_parent(struct filter_pred *pred, struct filter_pred *preds, + int index, enum move_type *move) +{ + if (pred->parent & FILTER_PRED_IS_RIGHT) + *move = MOVE_UP_FROM_RIGHT; + else + *move = MOVE_UP_FROM_LEFT; + pred = &preds[pred->parent & ~FILTER_PRED_IS_RIGHT]; + + return pred; +} + /* return 1 if event matches, 0 otherwise (discard) */ int filter_match_preds(struct event_filter *filter, void *rec) { - int match = -1, top = 0, val1 = 0, val2 = 0; - int stack[MAX_FILTER_PRED]; + int match = -1; + enum move_type move = MOVE_DOWN; struct filter_pred *preds; struct filter_pred *pred; + struct filter_pred *root; int n_preds = ACCESS_ONCE(filter->n_preds); - int i; + int done = 0; /* no filter is considered a match */ if (!n_preds) return 1; /* - * n_preds and filter->preds is protect with preemption disabled. + * n_preds, root and filter->preds are protect with preemption disabled. 
*/ preds = rcu_dereference_sched(filter->preds); + root = rcu_dereference_sched(filter->root); + if (!root) + return 1; - for (i = 0; i < n_preds; i++) { - pred = &preds[i]; - if (!pred->pop_n) { + pred = root; + + /* match is currently meaningless */ + match = -1; + + do { + switch (move) { + case MOVE_DOWN: + /* only AND and OR have children */ + if (pred->left != FILTER_PRED_INVALID) { + /* keep going to leaf node */ + pred = &preds[pred->left]; + continue; + } match = pred->fn(pred, rec); - stack[top++] = match; + /* If this pred is the only pred */ + if (pred == root) + break; + pred = get_pred_parent(pred, preds, + pred->parent, &move); + continue; + case MOVE_UP_FROM_LEFT: + /* Check for short circuits */ + if ((match && pred->op == OP_OR) || + (!match && pred->op == OP_AND)) { + if (pred == root) + break; + pred = get_pred_parent(pred, preds, + pred->parent, &move); + continue; + } + /* now go down the right side of the tree. */ + pred = &preds[pred->right]; + move = MOVE_DOWN; + continue; + case MOVE_UP_FROM_RIGHT: + /* We finished this equation. */ + if (pred == root) + break; + pred = get_pred_parent(pred, preds, + pred->parent, &move); continue; } - if (pred->pop_n > top) { - WARN_ON_ONCE(1); - return 0; - } - val1 = stack[--top]; - val2 = stack[--top]; - switch (pred->op) { - case OP_AND: - match = val1 && val2; - break; - case OP_OR: - match = val1 || val2; - break; - default: - WARN_ONCE(1, "filter op is not AND or OR"); - } - stack[top++] = match; - } + done = 1; + } while (!done); - return stack[--top]; + return match; } EXPORT_SYMBOL_GPL(filter_match_preds); @@ -539,10 +587,58 @@ static void filter_clear_pred(struct filter_pred *pred) pred->regex.len = 0; } -static int filter_set_pred(struct filter_pred *dest, +static int __alloc_pred_stack(struct pred_stack *stack, int n_preds) +{ + stack->preds = kzalloc(sizeof(*stack->preds)*(n_preds + 1), GFP_KERNEL); + if (!stack->preds) + return -ENOMEM; + stack->index = n_preds; + return 0; +} + +static void __free_pred_stack(struct pred_stack *stack) +{ + kfree(stack->preds); + stack->index = 0; +} + +static int __push_pred_stack(struct pred_stack *stack, + struct filter_pred *pred) +{ + int index = stack->index; + + if (WARN_ON(index == 0)) + return -ENOSPC; + + stack->preds[--index] = pred; + stack->index = index; + return 0; +} + +static struct filter_pred * +__pop_pred_stack(struct pred_stack *stack) +{ + struct filter_pred *pred; + int index = stack->index; + + pred = stack->preds[index++]; + if (!pred) + return NULL; + + stack->index = index; + return pred; +} + +static int filter_set_pred(struct event_filter *filter, + int idx, + struct pred_stack *stack, struct filter_pred *src, filter_pred_fn_t fn) { + struct filter_pred *dest = &filter->preds[idx]; + struct filter_pred *left; + struct filter_pred *right; + *dest = *src; if (src->field_name) { dest->field_name = kstrdup(src->field_name, GFP_KERNEL); @@ -550,8 +646,25 @@ static int filter_set_pred(struct filter_pred *dest, return -ENOMEM; } dest->fn = fn; + dest->index = idx; - return 0; + if (dest->op == OP_OR || dest->op == OP_AND) { + right = __pop_pred_stack(stack); + left = __pop_pred_stack(stack); + if (!left || !right) + return -EINVAL; + dest->left = left->index; + dest->right = right->index; + left->parent = dest->index; + right->parent = dest->index | FILTER_PRED_IS_RIGHT; + } else + /* + * Make dest->left invalid to be used as a quick + * way to know this is a leaf node. 
+ */ + dest->left = FILTER_PRED_INVALID; + + return __push_pred_stack(stack, dest); } static void __free_preds(struct event_filter *filter) @@ -574,6 +687,7 @@ static void reset_preds(struct event_filter *filter) int i; filter->n_preds = 0; + filter->root = NULL; if (!filter->preds) return; @@ -707,6 +821,7 @@ static int filter_add_pred_fn(struct filter_parse_state *ps, struct ftrace_event_call *call, struct event_filter *filter, struct filter_pred *pred, + struct pred_stack *stack, filter_pred_fn_t fn) { int idx, err; @@ -718,7 +833,7 @@ static int filter_add_pred_fn(struct filter_parse_state *ps, idx = filter->n_preds; filter_clear_pred(&filter->preds[idx]); - err = filter_set_pred(&filter->preds[idx], pred, fn); + err = filter_set_pred(filter, idx, stack, pred, fn); if (err) return err; @@ -803,6 +918,7 @@ static int filter_add_pred(struct filter_parse_state *ps, struct ftrace_event_call *call, struct event_filter *filter, struct filter_pred *pred, + struct pred_stack *stack, bool dry_run) { struct ftrace_event_field *field; @@ -812,13 +928,10 @@ static int filter_add_pred(struct filter_parse_state *ps, fn = pred->fn = filter_pred_none; - if (pred->op == OP_AND) { - pred->pop_n = 2; + if (pred->op == OP_AND) goto add_pred_fn; - } else if (pred->op == OP_OR) { - pred->pop_n = 2; + else if (pred->op == OP_OR) goto add_pred_fn; - } field = find_event_field(call, pred->field_name); if (!field) { @@ -867,7 +980,7 @@ static int filter_add_pred(struct filter_parse_state *ps, add_pred_fn: if (!dry_run) - return filter_add_pred_fn(ps, call, filter, pred, fn); + return filter_add_pred_fn(ps, call, filter, pred, stack, fn); return 0; } @@ -1248,6 +1361,7 @@ static int replace_preds(struct ftrace_event_call *call, char *operand1 = NULL, *operand2 = NULL; struct filter_pred *pred; struct postfix_elt *elt; + struct pred_stack stack = { }; /* init to NULL */ int err; int n_preds = 0; @@ -1262,9 +1376,12 @@ static int replace_preds(struct ftrace_event_call *call, return err; if (!dry_run) { - err = __alloc_preds(filter, n_preds); + err = __alloc_pred_stack(&stack, n_preds); if (err) return err; + err = __alloc_preds(filter, n_preds); + if (err) + goto fail; } n_preds = 0; @@ -1276,14 +1393,16 @@ static int replace_preds(struct ftrace_event_call *call, operand2 = elt->operand; else { parse_error(ps, FILT_ERR_TOO_MANY_OPERANDS, 0); - return -EINVAL; + err = -EINVAL; + goto fail; } continue; } if (WARN_ON(n_preds++ == MAX_FILTER_PRED)) { parse_error(ps, FILT_ERR_TOO_MANY_PREDS, 0); - return -ENOSPC; + err = -ENOSPC; + goto fail; } if (elt->op == OP_AND || elt->op == OP_OR) { @@ -1293,22 +1412,44 @@ static int replace_preds(struct ftrace_event_call *call, if (!operand1 || !operand2) { parse_error(ps, FILT_ERR_MISSING_FIELD, 0); - return -EINVAL; + err = -EINVAL; + goto fail; } pred = create_pred(elt->op, operand1, operand2); add_pred: - if (!pred) - return -ENOMEM; - err = filter_add_pred(ps, call, filter, pred, dry_run); + if (!pred) { + err = -ENOMEM; + goto fail; + } + err = filter_add_pred(ps, call, filter, pred, &stack, dry_run); filter_free_pred(pred); if (err) - return err; + goto fail; operand1 = operand2 = NULL; } - return 0; + if (!dry_run) { + /* We should have one item left on the stack */ + pred = __pop_pred_stack(&stack); + if (!pred) + return -EINVAL; + /* This item is where we start from in matching */ + filter->root = pred; + /* Make sure the stack is empty */ + pred = __pop_pred_stack(&stack); + if (WARN_ON(pred)) { + err = -EINVAL; + filter->root = NULL; + goto fail; + } + } + + err = 0; 
+fail: + __free_pred_stack(&stack); + return err; } static int replace_system_preds(struct event_subsystem *system, From 55719274188f13cff9e3bd11fdd4c0e7617cd03d Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 27 Jan 2011 23:12:05 -0500 Subject: [PATCH 192/706] tracing/filter: Optimize short circuit check The test of whether we should break out early for OR and AND operations can be optimized by comparing the current result with (pred->op == OP_OR). That is, if the result is true and the op is an OP_OR, or if the result is false and the op is not an OP_OR (thus an OP_AND), we can break out early in either case. Otherwise we continue processing. Cc: Tom Zanussi Signed-off-by: Steven Rostedt --- kernel/trace/trace_events_filter.c | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index 10390491b6d0..0a3e0502b507 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -426,9 +426,15 @@ int filter_match_preds(struct event_filter *filter, void *rec) pred->parent, &move); continue; case MOVE_UP_FROM_LEFT: - /* Check for short circuits */ - if ((match && pred->op == OP_OR) || - (!match && pred->op == OP_AND)) { + /* + * Check for short circuits. + * + * Optimization: !!match == (pred->op == OP_OR) + * is the same as: + * if ((match && pred->op == OP_OR) || + * (!match && pred->op == OP_AND)) + */ + if (!!match == (pred->op == OP_OR)) { if (pred == root) break; pred = get_pred_parent(pred, preds, From ec126cac23945de12eb2d103374e1f7ee97c5595 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 27 Jan 2011 23:14:25 -0500 Subject: [PATCH 193/706] tracing/filter: Check the created pred tree Since the filter walks a tree to determine if a match is made or not, if the tree was incorrectly created, it could cause an infinite loop. Add a check to walk the entire tree before assigning it as a filter to make sure the tree is correct. Cc: Tom Zanussi Signed-off-by: Steven Rostedt --- kernel/trace/trace_events_filter.c | 72 +++++++++++++++++++++++++++++- 1 file changed, 71 insertions(+), 1 deletion(-) diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index 0a3e0502b507..91c9cdcb040b 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -1358,6 +1358,68 @@ static int count_preds(struct filter_parse_state *ps) return n_preds; } +/* + * The tree is walked at filtering of an event. If the tree is not correctly + * built, it may cause an infinite loop. Check here that the tree does + * indeed terminate. + */ +static int check_pred_tree(struct event_filter *filter, + struct filter_pred *root) +{ + struct filter_pred *preds; + struct filter_pred *pred; + enum move_type move = MOVE_DOWN; + int count = 0; + int done = 0; + int max; + + /* + * The max that we can hit a node is three times. + * Once going down, once coming up from left, and + * once coming up from right. This is more than enough + * since leafs are only hit a single time.
+ */ + max = 3 * filter->n_preds; + + preds = filter->preds; + if (!preds) + return -EINVAL; + pred = root; + + do { + if (WARN_ON(count++ > max)) + return -EINVAL; + + switch (move) { + case MOVE_DOWN: + if (pred->left != FILTER_PRED_INVALID) { + pred = &preds[pred->left]; + continue; + } + /* A leaf at the root is just a leaf in the tree */ + if (pred == root) + break; + pred = get_pred_parent(pred, preds, + pred->parent, &move); + continue; + case MOVE_UP_FROM_LEFT: + pred = &preds[pred->right]; + move = MOVE_DOWN; + continue; + case MOVE_UP_FROM_RIGHT: + if (pred == root) + break; + pred = get_pred_parent(pred, preds, + pred->parent, &move); + continue; + } + done = 1; + } while (!done); + + /* We are fine. */ + return 0; +} + static int replace_preds(struct ftrace_event_call *call, struct event_filter *filter, struct filter_parse_state *ps, @@ -1366,6 +1428,7 @@ static int replace_preds(struct ftrace_event_call *call, { char *operand1 = NULL, *operand2 = NULL; struct filter_pred *pred; + struct filter_pred *root; struct postfix_elt *elt; struct pred_stack stack = { }; /* init to NULL */ int err; @@ -1442,7 +1505,7 @@ add_pred: if (!pred) return -EINVAL; /* This item is where we start from in matching */ - filter->root = pred; + root = pred; /* Make sure the stack is empty */ pred = __pop_pred_stack(&stack); if (WARN_ON(pred)) { @@ -1450,6 +1513,13 @@ add_pred: err = -EINVAL; filter->root = NULL; goto fail; } + err = check_pred_tree(filter, root); + if (err) + goto fail; + + /* We don't set root until we know it works */ + barrier(); + filter->root = root; } err = 0; From 43cd414552d8137157e926e46361678ea867e476 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 27 Jan 2011 23:16:51 -0500 Subject: [PATCH 194/706] tracing/filter: Optimize filter by folding the tree There are many cases where a filter will contain multiple ORs or ANDs together near the leaves. Walking up and down the tree to get to the next compare can be a waste. If there are several ORs or ANDs together, fold them into a single pred and allocate an array of the conditions that they check. This will speed up the filter by linearly walking an array and can still break out if a short circuit condition is met. Cc: Tom Zanussi Signed-off-by: Steven Rostedt --- kernel/trace/trace.h | 12 +- kernel/trace/trace_events_filter.c | 233 +++++++++++++++++++++++++++-- 2 files changed, 235 insertions(+), 10 deletions(-) diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 254d04a84ec3..d754330934bb 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -678,6 +678,7 @@ struct event_subsystem { #define FILTER_PRED_INVALID ((unsigned short)-1) #define FILTER_PRED_IS_RIGHT (1 << 15) +#define FILTER_PRED_FOLD (1 << 15) struct filter_pred; struct regex; @@ -704,7 +705,16 @@ struct filter_pred { filter_pred_fn_t fn; u64 val; struct regex regex; - char *field_name; + /* + * Leaf nodes use field_name, ops is used by AND and OR + * nodes. The field_name is always freed when freeing a pred. + * We can overload field_name for ops and have it freed + * as well. + */ + union { + char *field_name; + unsigned short *ops; + }; int offset; int not; int op; diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index 91c9cdcb040b..2403ce5b6507 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -381,6 +381,42 @@ get_pred_parent(struct filter_pred *pred, struct filter_pred *preds, return pred; } +/* + * A series of AND or ORs where found together.
Instead of + * climbing up and down the tree branches, an array of the + * ops were made in order of checks. We can just move across + * the array and short circuit if needed. + */ +static int process_ops(struct filter_pred *preds, + struct filter_pred *op, void *rec) +{ + struct filter_pred *pred; + int type; + int match; + int i; + + /* + * Micro-optimization: We set type to true if op + * is an OR and false otherwise (AND). Then we + * just need to test if the match is equal to + * the type, and if it is, we can short circuit the + * rest of the checks: + * + * if ((match && op->op == OP_OR) || + * (!match && op->op == OP_AND)) + * return match; + */ + type = op->op == OP_OR; + + for (i = 0; i < op->val; i++) { + pred = &preds[op->ops[i]]; + match = pred->fn(pred, rec); + if (!!match == type) + return match; + } + return match; +} + /* return 1 if event matches, 0 otherwise (discard) */ int filter_match_preds(struct event_filter *filter, void *rec) { @@ -414,11 +450,16 @@ int filter_match_preds(struct event_filter *filter, void *rec) case MOVE_DOWN: /* only AND and OR have children */ if (pred->left != FILTER_PRED_INVALID) { - /* keep going to leaf node */ - pred = &preds[pred->left]; - continue; - } - match = pred->fn(pred, rec); + /* If ops is set, then it was folded. */ + if (!pred->ops) { + /* keep going to down the left side */ + pred = &preds[pred->left]; + continue; + } + /* We can treat folded ops as a leaf node */ + match = process_ops(preds, pred, rec); + } else + match = pred->fn(pred, rec); /* If this pred is the only pred */ if (pred == root) break; @@ -659,17 +700,34 @@ static int filter_set_pred(struct event_filter *filter, left = __pop_pred_stack(stack); if (!left || !right) return -EINVAL; - dest->left = left->index; - dest->right = right->index; - left->parent = dest->index; + /* + * If both children can be folded + * and they are the same op as this op or a leaf, + * then this op can be folded. + */ + if (left->index & FILTER_PRED_FOLD && + (left->op == dest->op || + left->left == FILTER_PRED_INVALID) && + right->index & FILTER_PRED_FOLD && + (right->op == dest->op || + right->left == FILTER_PRED_INVALID)) + dest->index |= FILTER_PRED_FOLD; + + dest->left = left->index & ~FILTER_PRED_FOLD; + dest->right = right->index & ~FILTER_PRED_FOLD; + left->parent = dest->index & ~FILTER_PRED_FOLD; right->parent = dest->index | FILTER_PRED_IS_RIGHT; - } else + } else { /* * Make dest->left invalid to be used as a quick * way to know this is a leaf node. */ dest->left = FILTER_PRED_INVALID; + /* All leafs allow folding the parent ops. 
*/ + dest->index |= FILTER_PRED_FOLD; + } + return __push_pred_stack(stack, dest); } @@ -1420,6 +1478,158 @@ static int check_pred_tree(struct event_filter *filter, return 0; } +static int count_leafs(struct filter_pred *preds, struct filter_pred *root) +{ + struct filter_pred *pred; + enum move_type move = MOVE_DOWN; + int count = 0; + int done = 0; + + pred = root; + + do { + switch (move) { + case MOVE_DOWN: + if (pred->left != FILTER_PRED_INVALID) { + pred = &preds[pred->left]; + continue; + } + /* A leaf at the root is just a leaf in the tree */ + if (pred == root) + return 1; + count++; + pred = get_pred_parent(pred, preds, + pred->parent, &move); + continue; + case MOVE_UP_FROM_LEFT: + pred = &preds[pred->right]; + move = MOVE_DOWN; + continue; + case MOVE_UP_FROM_RIGHT: + if (pred == root) + break; + pred = get_pred_parent(pred, preds, + pred->parent, &move); + continue; + } + done = 1; + } while (!done); + + return count; +} + +static int fold_pred(struct filter_pred *preds, struct filter_pred *root) +{ + struct filter_pred *pred; + enum move_type move = MOVE_DOWN; + int count = 0; + int children; + int done = 0; + + /* No need to keep the fold flag */ + root->index &= ~FILTER_PRED_FOLD; + + /* If the root is a leaf then do nothing */ + if (root->left == FILTER_PRED_INVALID) + return 0; + + /* count the children */ + children = count_leafs(preds, &preds[root->left]); + children += count_leafs(preds, &preds[root->right]); + + root->ops = kzalloc(sizeof(*root->ops) * children, GFP_KERNEL); + if (!root->ops) + return -ENOMEM; + + root->val = children; + + pred = root; + do { + switch (move) { + case MOVE_DOWN: + if (pred->left != FILTER_PRED_INVALID) { + pred = &preds[pred->left]; + continue; + } + if (WARN_ON(count == children)) + return -EINVAL; + pred->index &= ~FILTER_PRED_FOLD; + root->ops[count++] = pred->index; + pred = get_pred_parent(pred, preds, + pred->parent, &move); + continue; + case MOVE_UP_FROM_LEFT: + pred = &preds[pred->right]; + move = MOVE_DOWN; + continue; + case MOVE_UP_FROM_RIGHT: + if (pred == root) + break; + pred = get_pred_parent(pred, preds, + pred->parent, &move); + continue; + } + done = 1; + } while (!done); + + return 0; +} + +/* + * To optimize the processing of the ops, if we have several "ors" or + * "ands" together, we can put them in an array and process them all + * together speeding up the filter logic. 
+ */ +static int fold_pred_tree(struct event_filter *filter, + struct filter_pred *root) +{ + struct filter_pred *preds; + struct filter_pred *pred; + enum move_type move = MOVE_DOWN; + int done = 0; + int err; + + preds = filter->preds; + if (!preds) + return -EINVAL; + pred = root; + + do { + switch (move) { + case MOVE_DOWN: + if (pred->index & FILTER_PRED_FOLD) { + err = fold_pred(preds, pred); + if (err) + return err; + /* Folded nodes are like leafs */ + } else if (pred->left != FILTER_PRED_INVALID) { + pred = &preds[pred->left]; + continue; + } + + /* A leaf at the root is just a leaf in the tree */ + if (pred == root) + break; + pred = get_pred_parent(pred, preds, + pred->parent, &move); + continue; + case MOVE_UP_FROM_LEFT: + pred = &preds[pred->right]; + move = MOVE_DOWN; + continue; + case MOVE_UP_FROM_RIGHT: + if (pred == root) + break; + pred = get_pred_parent(pred, preds, + pred->parent, &move); + continue; + } + done = 1; + } while (!done); + + return 0; +} + static int replace_preds(struct ftrace_event_call *call, struct event_filter *filter, struct filter_parse_state *ps, @@ -1517,6 +1727,11 @@ add_pred: if (err) goto fail; + /* Optimize the tree */ + err = fold_pred_tree(filter, root); + if (err) + goto fail; + /* We don't set root until we know it works */ barrier(); filter->root = root; From 4a3d27e98a7f2682e96d6f863752e0424b00d691 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 27 Jan 2011 23:19:49 -0500 Subject: [PATCH 195/706] tracing/filter: Move MAX_FILTER_PRED to local tracing directory The MAX_FILTER_PRED is only needed by the kernel/trace/*.c files. Move it to kernel/trace/trace.h. Cc: Tom Zanussi Signed-off-by: Steven Rostedt --- include/linux/ftrace_event.h | 1 - kernel/trace/trace.h | 2 ++ 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/include/linux/ftrace_event.h b/include/linux/ftrace_event.h index 47e3997f7b5c..1a99e7939c2b 100644 --- a/include/linux/ftrace_event.h +++ b/include/linux/ftrace_event.h @@ -208,7 +208,6 @@ struct ftrace_event_call { #define PERF_MAX_TRACE_SIZE 2048 -#define MAX_FILTER_PRED 32 #define MAX_FILTER_STR_VAL 256 /* Should handle KSYM_SYMBOL_LEN */ extern void destroy_preds(struct ftrace_event_call *call); diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index d754330934bb..fbff872f8db1 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -680,6 +680,8 @@ struct event_subsystem { #define FILTER_PRED_IS_RIGHT (1 << 15) #define FILTER_PRED_FOLD (1 << 15) +#define MAX_FILTER_PRED 32 struct filter_pred; struct regex; From bf93f9ed3a2cb89eb7e58851139d3be375b98027 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 27 Jan 2011 23:21:34 -0500 Subject: [PATCH 196/706] tracing/filter: Increase the max preds to 2^14 Now that the filter logic does not require saving the pred results on the stack, we can increase the max number of preds we allow. As the preds are indexed by a short value, and we use the MSBs as flags, we can increase the max preds to 2^14 (16384), which should be way more than enough.
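For illustration only (not kernel code): the arithmetic behind this limit can be seen in a tiny userspace sketch, where FILTER_PRED_IS_RIGHT mirrors the kernel define and everything else is made up for the example:

	#include <stdio.h>

	/* mirrors the kernel define: bit 15 of the 16-bit index is a flag */
	#define FILTER_PRED_IS_RIGHT	(1 << 15)

	int main(void)
	{
		/* pack "pred 1000, reached as the right child" into one short */
		unsigned short parent = 1000 | FILTER_PRED_IS_RIGHT;

		printf("index=%d right=%d\n",
		       parent & ~FILTER_PRED_IS_RIGHT,	/* prints 1000 */
		       !!(parent & FILTER_PRED_IS_RIGHT));	/* prints 1 */
		return 0;
	}

With one bit of the unsigned short spent on the flag and one reserved, 14 index bits remain, hence the 2^14 limit.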
Cc: Tom Zanussi Signed-off-by: Steven Rostedt --- kernel/trace/trace.h | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index fbff872f8db1..856e73cc1d3f 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -680,7 +680,14 @@ struct event_subsystem { #define FILTER_PRED_IS_RIGHT (1 << 15) #define FILTER_PRED_FOLD (1 << 15) -#define MAX_FILTER_PRED 32 +/* + * The max preds is the size of unsigned short with + * two flags at the MSBs. One bit is used for both the IS_RIGHT + * and FOLD flags. The other is reserved. + * + * 2^14 preds is way more than enough. + */ +#define MAX_FILTER_PRED 16384 struct filter_pred; struct regex; From 75b8e98263fdb0bfbdeba60d4db463259f1fe8a2 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 3 Feb 2011 23:25:46 -0500 Subject: [PATCH 197/706] tracing/filter: Swap entire filter of events When creating a new filter, instead of allocating the filter to the event call first and then processing the filter, it is easier to process a temporary filter and then just swap it with the call filter. Doing this simplifies the code. A filter is allocated and processed; when it is done, it is swapped with the call filter, synchronize_sched() is called to make sure all callers are done with the old filter (filters are called with preemption disabled), and then the old filter is freed. Cc: Tom Zanussi Signed-off-by: Steven Rostedt --- kernel/trace/trace_events_filter.c | 267 +++++++++++++++++------------ 1 file changed, 154 insertions(+), 113 deletions(-) diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index 2403ce5b6507..f5d335d28d0b 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -425,10 +425,15 @@ int filter_match_preds(struct event_filter *filter, void *rec) struct filter_pred *preds; struct filter_pred *pred; struct filter_pred *root; - int n_preds = ACCESS_ONCE(filter->n_preds); + int n_preds; int done = 0; /* no filter is considered a match */ + if (!filter) + return 1; + + n_preds = filter->n_preds; + if (!n_preds) return 1; @@ -509,6 +514,9 @@ static void parse_error(struct filter_parse_state *ps, int err, int pos) static void remove_filter_string(struct event_filter *filter) { + if (!filter) + return; + kfree(filter->filter_string); filter->filter_string = NULL; } @@ -568,9 +576,10 @@ static void append_filter_err(struct filter_parse_state *ps, void print_event_filter(struct ftrace_event_call *call, struct trace_seq *s) { - struct event_filter *filter = call->filter; + struct event_filter *filter; mutex_lock(&event_mutex); + filter = call->filter; if (filter && filter->filter_string) trace_seq_printf(s, "%s\n", filter->filter_string); else @@ -581,9 +590,10 @@ void print_event_filter(struct ftrace_event_call *call, struct trace_seq *s) void print_subsystem_event_filter(struct event_subsystem *system, struct trace_seq *s) { - struct event_filter *filter = system->filter; + struct event_filter *filter; mutex_lock(&event_mutex); + filter = system->filter; if (filter && filter->filter_string) trace_seq_printf(s, "%s\n", filter->filter_string); else @@ -745,26 +755,9 @@ static void __free_preds(struct event_filter *filter) filter->n_preds = 0; } -static void reset_preds(struct event_filter *filter) +static void filter_disable(struct ftrace_event_call *call) { - int n_preds = filter->n_preds; - int i; - - filter->n_preds = 0; - filter->root = NULL; - if (!filter->preds) - return; - - for (i = 0; i < n_preds;
i++) - filter->preds[i].fn = filter_pred_none; -} - -static void filter_disable_preds(struct ftrace_event_call *call) -{ - struct event_filter *filter = call->filter; - call->flags &= ~TRACE_EVENT_FL_FILTERED; - reset_preds(filter); } static void __free_filter(struct event_filter *filter) @@ -777,11 +770,16 @@ static void __free_filter(struct event_filter *filter) kfree(filter); } +/* + * Called when destroying the ftrace_event_call. + * The call is being freed, so we do not need to worry about + * the call being currently used. This is for module code removing + * the tracepoints from within it. + */ void destroy_preds(struct ftrace_event_call *call) { __free_filter(call->filter); call->filter = NULL; - call->flags &= ~TRACE_EVENT_FL_FILTERED; } static struct event_filter *__alloc_filter(void) @@ -789,11 +787,6 @@ static struct event_filter *__alloc_filter(void) struct event_filter *filter; filter = kzalloc(sizeof(*filter), GFP_KERNEL); - if (!filter) - return ERR_PTR(-ENOMEM); - - filter->n_preds = 0; - return filter; } @@ -838,36 +831,6 @@ static int __alloc_preds(struct event_filter *filter, int n_preds) return 0; } -static int init_filter(struct ftrace_event_call *call) -{ - if (call->filter) - return 0; - - call->flags &= ~TRACE_EVENT_FL_FILTERED; - call->filter = __alloc_filter(); - if (IS_ERR(call->filter)) - return PTR_ERR(call->filter); - - return 0; -} - -static int init_subsystem_preds(struct event_subsystem *system) -{ - struct ftrace_event_call *call; - int err; - - list_for_each_entry(call, &ftrace_events, list) { - if (strcmp(call->class->system, system->name) != 0) - continue; - - err = init_filter(call); - if (err) - return err; - } - - return 0; -} - static void filter_free_subsystem_preds(struct event_subsystem *system) { struct ftrace_event_call *call; @@ -876,11 +839,23 @@ static void filter_free_subsystem_preds(struct event_subsystem *system) if (strcmp(call->class->system, system->name) != 0) continue; - filter_disable_preds(call); + filter_disable(call); remove_filter_string(call->filter); } } +static void filter_free_subsystem_filters(struct event_subsystem *system) +{ + struct ftrace_event_call *call; + + list_for_each_entry(call, &ftrace_events, list) { + if (strcmp(call->class->system, system->name) != 0) + continue; + __free_filter(call->filter); + call->filter = NULL; + } +} + static int filter_add_pred_fn(struct filter_parse_state *ps, struct ftrace_event_call *call, struct event_filter *filter, @@ -1743,88 +1718,129 @@ fail: return err; } +struct filter_list { + struct list_head list; + struct event_filter *filter; +}; + static int replace_system_preds(struct event_subsystem *system, struct filter_parse_state *ps, char *filter_string) { struct ftrace_event_call *call; + struct filter_list *filter_item; + struct filter_list *tmp; + LIST_HEAD(filter_list); bool fail = true; int err; list_for_each_entry(call, &ftrace_events, list) { - struct event_filter *filter = call->filter; if (strcmp(call->class->system, system->name) != 0) continue; - /* try to see if the filter can be applied */ - err = replace_preds(call, filter, ps, filter_string, true); + /* + * Try to see if the filter can be applied + * (filter arg is ignored on dry_run) + */ + err = replace_preds(call, NULL, ps, filter_string, true); if (err) goto fail; } - /* set all filter pred counts to zero */ list_for_each_entry(call, &ftrace_events, list) { - struct event_filter *filter = call->filter; + struct event_filter *filter; if (strcmp(call->class->system, system->name) != 0) continue; - 
reset_preds(filter); - } + filter_item = kzalloc(sizeof(*filter_item), GFP_KERNEL); + if (!filter_item) + goto fail_mem; - /* - * Since some of the preds may be used under preemption - * we need to wait for them to finish before we may - * reallocate them. - */ - synchronize_sched(); + list_add_tail(&filter_item->list, &filter_list); - list_for_each_entry(call, &ftrace_events, list) { - struct event_filter *filter = call->filter; + filter_item->filter = __alloc_filter(); + if (!filter_item->filter) + goto fail_mem; + filter = filter_item->filter; - if (strcmp(call->class->system, system->name) != 0) - continue; - - /* really apply the filter */ - filter_disable_preds(call); - err = replace_preds(call, filter, ps, filter_string, false); + /* Can only fail on no memory */ + err = replace_filter_string(filter, filter_string); if (err) - filter_disable_preds(call); - else { + goto fail_mem; + + err = replace_preds(call, filter, ps, filter_string, false); + if (err) { + filter_disable(call); + parse_error(ps, FILT_ERR_BAD_SUBSYS_FILTER, 0); + append_filter_err(ps, filter); + } else call->flags |= TRACE_EVENT_FL_FILTERED; - replace_filter_string(filter, filter_string); - } + /* + * Regardless of if this returned an error, we still + * replace the filter for the call. + */ + filter = call->filter; + call->filter = filter_item->filter; + filter_item->filter = filter; + fail = false; } if (fail) goto fail; + /* + * The calls can still be using the old filters. + * Do a synchronize_sched() to ensure all calls are + * done with them before we free them. + */ + synchronize_sched(); + list_for_each_entry_safe(filter_item, tmp, &filter_list, list) { + __free_filter(filter_item->filter); + list_del(&filter_item->list); + kfree(filter_item); + } return 0; fail: + /* No call succeeded */ + list_for_each_entry_safe(filter_item, tmp, &filter_list, list) { + list_del(&filter_item->list); + kfree(filter_item); + } parse_error(ps, FILT_ERR_BAD_SUBSYS_FILTER, 0); return -EINVAL; + fail_mem: + /* If any call succeeded, we still need to sync */ + if (!fail) + synchronize_sched(); + list_for_each_entry_safe(filter_item, tmp, &filter_list, list) { + __free_filter(filter_item->filter); + list_del(&filter_item->list); + kfree(filter_item); + } + return -ENOMEM; } int apply_event_filter(struct ftrace_event_call *call, char *filter_string) { - int err; struct filter_parse_state *ps; + struct event_filter *filter; + struct event_filter *tmp; + int err = 0; mutex_lock(&event_mutex); - err = init_filter(call); - if (err) - goto out_unlock; - if (!strcmp(strstrip(filter_string), "0")) { - filter_disable_preds(call); - reset_preds(call->filter); + filter_disable(call); + filter = call->filter; + if (!filter) + goto out_unlock; + call->filter = NULL; /* Make sure the filter is not being used */ synchronize_sched(); - __free_preds(call->filter); - remove_filter_string(call->filter); + __free_filter(filter); goto out_unlock; } @@ -1833,29 +1849,41 @@ int apply_event_filter(struct ftrace_event_call *call, char *filter_string) if (!ps) goto out_unlock; - filter_disable_preds(call); - replace_filter_string(call->filter, filter_string); + filter = __alloc_filter(); + if (!filter) { + kfree(ps); + goto out_unlock; + } + + replace_filter_string(filter, filter_string); parse_init(ps, filter_ops, filter_string); err = filter_parse(ps); if (err) { - append_filter_err(ps, call->filter); + append_filter_err(ps, filter); goto out; } - /* - * Make sure all the pred counts are zero so that - * no task is using it when we reallocate the preds 
array. - */ - reset_preds(call->filter); - synchronize_sched(); - - err = replace_preds(call, call->filter, ps, filter_string, false); - if (err) - append_filter_err(ps, call->filter); - else + err = replace_preds(call, filter, ps, filter_string, false); + if (err) { + filter_disable(call); + append_filter_err(ps, filter); + } else call->flags |= TRACE_EVENT_FL_FILTERED; out: + /* + * Always swap the call filter with the new filter + * even if there was an error. If there was an error + * in the filter, we disable the filter and show the error + * string + */ + tmp = call->filter; + call->filter = filter; + if (tmp) { + /* Make sure the call is done with the filter */ + synchronize_sched(); + __free_filter(tmp); + } filter_opstack_clear(ps); postfix_clear(ps); kfree(ps); @@ -1868,18 +1896,21 @@ out_unlock: int apply_subsystem_event_filter(struct event_subsystem *system, char *filter_string) { - int err; struct filter_parse_state *ps; + struct event_filter *filter; + int err = 0; mutex_lock(&event_mutex); - err = init_subsystem_preds(system); - if (err) - goto out_unlock; - if (!strcmp(strstrip(filter_string), "0")) { filter_free_subsystem_preds(system); remove_filter_string(system->filter); + filter = system->filter; + system->filter = NULL; + /* Ensure all filters are no longer used */ + synchronize_sched(); + filter_free_subsystem_filters(system); + __free_filter(filter); goto out_unlock; } @@ -1888,7 +1919,17 @@ int apply_subsystem_event_filter(struct event_subsystem *system, if (!ps) goto out_unlock; - replace_filter_string(system->filter, filter_string); + filter = __alloc_filter(); + if (!filter) + goto out; + + replace_filter_string(filter, filter_string); + /* + * No event actually uses the system filter + * we can free it without synchronize_sched(). + */ + __free_filter(system->filter); + system->filter = filter; parse_init(ps, filter_ops, filter_string); err = filter_parse(ps); @@ -1945,7 +1986,7 @@ int ftrace_profile_set_filter(struct perf_event *event, int event_id, goto out_unlock; filter = __alloc_filter(); - if (IS_ERR(filter)) { + if (!filter) { err = PTR_ERR(filter); goto out_unlock; } From 4defe682d81a4960b6840ee4ed1a36f9db77c7bd Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 3 Feb 2011 23:29:06 -0500 Subject: [PATCH 198/706] tracing/filter: Remove synchronize_sched() from __alloc_preds() Because the filters are processed first and then activated (added to the call), we no longer need to worry about the preds of the filter in __alloc_preds() being used. As the filter that is allocating preds is not activated yet. Signed-off-by: Steven Rostedt --- kernel/trace/trace_events_filter.c | 30 +++++++----------------------- 1 file changed, 7 insertions(+), 23 deletions(-) diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index f5d335d28d0b..3249b4f77ef0 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -795,33 +795,17 @@ static int __alloc_preds(struct event_filter *filter, int n_preds) struct filter_pred *pred; int i; - if (filter->preds) { - if (filter->a_preds < n_preds) { - /* - * We need to reallocate. - * We should have already have zeroed out - * the pred count and called synchronized_sched() - * to make sure no one is using the preds. 
- */ - if (WARN_ON_ONCE(filter->n_preds)) { - /* We need to reset it now */ - filter->n_preds = 0; - synchronize_sched(); - } - __free_preds(filter); - } - } + if (filter->preds) + __free_preds(filter); + + filter->preds = + kzalloc(sizeof(*filter->preds) * n_preds, GFP_KERNEL); - if (!filter->preds) { - filter->preds = - kzalloc(sizeof(*filter->preds) * n_preds, GFP_KERNEL); - filter->a_preds = n_preds; - } if (!filter->preds) return -ENOMEM; - if (WARN_ON(filter->a_preds < n_preds)) - return -EINVAL; + filter->a_preds = n_preds; + filter->n_preds = 0; for (i = 0; i < n_preds; i++) { pred = &filter->preds[i]; From ba976970c79fd2fbfe1a4b3b6766a318f4eb9d4c Mon Sep 17 00:00:00 2001 From: Ian Munsie Date: Thu, 3 Feb 2011 14:27:20 +1100 Subject: [PATCH 199/706] tracing/syscalls: Don't add events for unmapped syscalls FTRACE_SYSCALLS would create events for each and every system call, even if it had failed to map the system call's name with its number. This resulted in a number of events being created that would not behave as expected. This could happen, for example, on architectures whose symbol names are unusual and will not match the system call name. It could also happen with system calls which were mapped to sys_ni_syscall. This patch changes the default system call number in the metadata to -1. If the system call name from the metadata is not successfully mapped to a system call number during boot, then the event initialisation routine will now return an error, preventing the event from being created. Signed-off-by: Ian Munsie LKML-Reference: <1296703645-18718-2-git-send-email-imunsie@au1.ibm.com> Signed-off-by: Steven Rostedt --- include/linux/syscalls.h | 2 ++ kernel/trace/trace_syscalls.c | 8 ++++++++ 2 files changed, 10 insertions(+) diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h index 98664db1be47..8e8968e74544 100644 --- a/include/linux/syscalls.h +++ b/include/linux/syscalls.h @@ -158,6 +158,7 @@ extern struct trace_event_functions exit_syscall_print_funcs; static struct syscall_metadata __used \ __syscall_meta_##sname = { \ .name = "sys"#sname, \ + .syscall_nr = -1, /* Filled in at boot */ \ .nb_args = nb, \ .types = types_##sname, \ .args = args_##sname, \ @@ -175,6 +176,7 @@ extern struct trace_event_functions exit_syscall_print_funcs; static struct syscall_metadata __used \ __syscall_meta__##sname = { \ .name = "sys_"#sname, \ + .syscall_nr = -1, /* Filled in at boot */ \ .nb_args = 0, \ .enter_event = &event_enter__##sname, \ .exit_event = &event_exit__##sname, \ diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c index 5c9fe08d2093..a9ceabd52247 100644 --- a/kernel/trace/trace_syscalls.c +++ b/kernel/trace/trace_syscalls.c @@ -424,6 +424,14 @@ void unreg_event_syscall_exit(struct ftrace_event_call *call) int init_syscall_trace(struct ftrace_event_call *call) { int id; + int num; + + num = ((struct syscall_metadata *)call->data)->syscall_nr; + if (num < 0 || num >= NR_syscalls) { + pr_debug("syscall %s metadata not mapped, disabling ftrace event\n", + ((struct syscall_metadata *)call->data)->name); + return -ENOSYS; + } if (set_syscall_print_fmt(call) < 0) return -ENOMEM; From 3773b389b6927595512558594d040c1edba46f36 Mon Sep 17 00:00:00 2001 From: Ian Munsie Date: Thu, 3 Feb 2011 14:27:21 +1100 Subject: [PATCH 200/706] tracing/syscalls: Convert redundant syscall_nr checks into WARN_ON With the ftrace events now checking if the syscall_nr is valid upon initialisation it should no longer be possible to register or unregister a syscall
event without a valid syscall_nr since they should not be created. This adds a WARN_ON_ONCE in the register and unregister functions to locate potential regressions in the future. Signed-off-by: Ian Munsie LKML-Reference: <1296703645-18718-3-git-send-email-imunsie@au1.ibm.com> Signed-off-by: Steven Rostedt --- kernel/trace/trace_syscalls.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c index a9ceabd52247..423094288fb5 100644 --- a/kernel/trace/trace_syscalls.c +++ b/kernel/trace/trace_syscalls.c @@ -359,7 +359,7 @@ int reg_event_syscall_enter(struct ftrace_event_call *call) int num; num = ((struct syscall_metadata *)call->data)->syscall_nr; - if (num < 0 || num >= NR_syscalls) + if (WARN_ON_ONCE(num < 0 || num >= NR_syscalls)) return -ENOSYS; mutex_lock(&syscall_trace_lock); if (!sys_refcount_enter) @@ -377,7 +377,7 @@ void unreg_event_syscall_enter(struct ftrace_event_call *call) int num; num = ((struct syscall_metadata *)call->data)->syscall_nr; - if (num < 0 || num >= NR_syscalls) + if (WARN_ON_ONCE(num < 0 || num >= NR_syscalls)) return; mutex_lock(&syscall_trace_lock); sys_refcount_enter--; @@ -393,7 +393,7 @@ int reg_event_syscall_exit(struct ftrace_event_call *call) int num; num = ((struct syscall_metadata *)call->data)->syscall_nr; - if (num < 0 || num >= NR_syscalls) + if (WARN_ON_ONCE(num < 0 || num >= NR_syscalls)) return -ENOSYS; mutex_lock(&syscall_trace_lock); if (!sys_refcount_exit) @@ -411,7 +411,7 @@ void unreg_event_syscall_exit(struct ftrace_event_call *call) int num; num = ((struct syscall_metadata *)call->data)->syscall_nr; - if (num < 0 || num >= NR_syscalls) + if (WARN_ON_ONCE(num < 0 || num >= NR_syscalls)) return; mutex_lock(&syscall_trace_lock); sys_refcount_exit--; From c763ba06bd9b5db2c46c36276c89103d92d2c604 Mon Sep 17 00:00:00 2001 From: Ian Munsie Date: Thu, 3 Feb 2011 14:27:22 +1100 Subject: [PATCH 201/706] tracing/syscalls: Make arch_syscall_addr weak Some architectures use non-trivial system call tables and will not work with the generic arch_syscall_addr code. For example, PowerPC64 uses a table of twin long longs. This patch makes the generic arch_syscall_addr weak to allow architectures with non-trivial system call tables to override it. Signed-off-by: Ian Munsie LKML-Reference: <1296703645-18718-4-git-send-email-imunsie@au1.ibm.com> Signed-off-by: Steven Rostedt --- Documentation/trace/ftrace-design.txt | 3 +++ kernel/trace/trace_syscalls.c | 2 +- 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/Documentation/trace/ftrace-design.txt b/Documentation/trace/ftrace-design.txt index dc52bd442c92..6fca17beee2f 100644 --- a/Documentation/trace/ftrace-design.txt +++ b/Documentation/trace/ftrace-design.txt @@ -247,6 +247,9 @@ You need very few things to get the syscalls tracing in an arch. - Support the TIF_SYSCALL_TRACEPOINT thread flags. - Put the trace_sys_enter() and trace_sys_exit() tracepoints calls from ptrace in the ptrace syscalls tracing path. +- If the system call table on this arch is more complicated than a simple array + of addresses of the system calls, implement an arch_syscall_addr to return + the address of a given system call. - Tag this arch as HAVE_SYSCALL_TRACEPOINTS. 
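For illustration only (not part of this patch), an arch override of the weak function can be as small as the following sketch, modeled on what the PowerPC64 variant added later in this series does for its table of twin long longs:

	extern unsigned long sys_call_table[];

	/*
	 * Assumed override, based on the PowerPC64 variant: with two
	 * long longs stored per table entry, the text address of
	 * syscall nr sits at index nr * 2.
	 */
	unsigned long __init arch_syscall_addr(int nr)
	{
		return sys_call_table[nr * 2];
	}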
diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c index 423094288fb5..af831545f656 100644 --- a/kernel/trace/trace_syscalls.c +++ b/kernel/trace/trace_syscalls.c @@ -446,7 +446,7 @@ int init_syscall_trace(struct ftrace_event_call *call) return id; } -unsigned long __init arch_syscall_addr(int nr) +unsigned long __init __weak arch_syscall_addr(int nr) { return (unsigned long)sys_call_table[nr]; } From b2d55496818d64310b9f5486d4eea76ea614d7f8 Mon Sep 17 00:00:00 2001 From: Ian Munsie Date: Thu, 3 Feb 2011 14:27:23 +1100 Subject: [PATCH 202/706] tracing/syscalls: Allow arch specific syscall symbol matching Some architectures have unusual symbol names and the generic code to match the symbol name with the function name for the syscall metadata will fail. For example, symbols on PPC64 start with a period and the generic code will fail to match them. This patch moves the match logic out into a separate function which an arch can override by defining ARCH_HAS_SYSCALL_MATCH_SYM_NAME in asm/ftrace.h and implementing arch_syscall_match_sym_name. Signed-off-by: Ian Munsie LKML-Reference: <1296703645-18718-5-git-send-email-imunsie@au1.ibm.com> Signed-off-by: Steven Rostedt --- Documentation/trace/ftrace-design.txt | 4 ++++ kernel/trace/trace_syscalls.c | 21 ++++++++++++++------- 2 files changed, 18 insertions(+), 7 deletions(-) diff --git a/Documentation/trace/ftrace-design.txt b/Documentation/trace/ftrace-design.txt index 6fca17beee2f..79fcafc7fd64 100644 --- a/Documentation/trace/ftrace-design.txt +++ b/Documentation/trace/ftrace-design.txt @@ -250,6 +250,10 @@ You need very few things to get the syscalls tracing in an arch. - If the system call table on this arch is more complicated than a simple array of addresses of the system calls, implement an arch_syscall_addr to return the address of a given system call. +- If the symbol names of the system calls do not match the function names on + this arch, define ARCH_HAS_SYSCALL_MATCH_SYM_NAME in asm/ftrace.h and + implement arch_syscall_match_sym_name with the appropriate logic to return + true if the function name corresponds with the symbol name. - Tag this arch as HAVE_SYSCALL_TRACEPOINTS. diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c index af831545f656..86a23e7de031 100644 --- a/kernel/trace/trace_syscalls.c +++ b/kernel/trace/trace_syscalls.c @@ -60,6 +60,19 @@ extern struct syscall_metadata *__stop_syscalls_metadata[]; static struct syscall_metadata **syscalls_metadata; +#ifndef ARCH_HAS_SYSCALL_MATCH_SYM_NAME +static inline bool arch_syscall_match_sym_name(const char *sym, const char *name) +{ + /* + * Only compare after the "sys" prefix. Archs that use + * syscall wrappers may have syscalls symbols aliases prefixed + * with "SyS" instead of "sys", leading to an unwanted + * mismatch. + */ + return !strcmp(sym + 3, name + 3); +} +#endif + static __init struct syscall_metadata * find_syscall_meta(unsigned long syscall) { @@ -73,13 +86,7 @@ find_syscall_meta(unsigned long syscall) kallsyms_lookup(syscall, NULL, NULL, NULL, str); for ( ; start < stop; start++) { - /* - * Only compare after the "sys" prefix. Archs that use - * syscall wrappers may have syscalls symbols aliases prefixed - * with "SyS" instead of "sys", leading to an unwanted - * mismatch. 
- */ - if ((*start)->name && !strcmp((*start)->name + 3, str + 3) + if ((*start)->name && arch_syscall_match_sym_name(str, (*start)->name)) return *start; } return NULL; From ae07f551c42d6e4162436ca452a199deac9dab4d Mon Sep 17 00:00:00 2001 From: Ian Munsie Date: Thu, 3 Feb 2011 14:27:25 +1100 Subject: [PATCH 203/706] tracing/syscalls: Early terminate search for sys_ni_syscall Many system calls are unimplemented and mapped to sys_ni_syscall, but at boot ftrace would still search through every syscall metadata entry for a match which wouldn't be there. This patch causes the search to terminate early if the system call is not mapped. Signed-off-by: Ian Munsie LKML-Reference: <1296703645-18718-7-git-send-email-imunsie@au1.ibm.com> Signed-off-by: Steven Rostedt --- kernel/trace/trace_syscalls.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c index 86a23e7de031..ee7b5a0bb9f8 100644 --- a/kernel/trace/trace_syscalls.c +++ b/kernel/trace/trace_syscalls.c @@ -85,6 +85,9 @@ find_syscall_meta(unsigned long syscall) stop = __stop_syscalls_metadata; kallsyms_lookup(syscall, NULL, NULL, NULL, str); + if (arch_syscall_match_sym_name(str, "sys_ni_syscall")) + return NULL; + for ( ; start < stop; start++) { if ((*start)->name && arch_syscall_match_sym_name(str, (*start)->name)) return *start; From dc5f219e88294b93009eef946251251ffffb6d60 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 4 Feb 2011 13:19:20 +0100 Subject: [PATCH 204/706] genirq: Add IRQF_FORCE_RESUME Xen needs to reenable interrupts which are marked IRQF_NO_SUSPEND in the resume path. Add a flag to force the reenabling in the resume code. Tested-and-acked-by: Ian Campbell Signed-off-by: Thomas Gleixner --- include/linux/interrupt.h | 3 ++- kernel/irq/manage.c | 11 ++++++++++- kernel/irq/pm.c | 3 --- 3 files changed, 12 insertions(+), 5 deletions(-) diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h index 55e0d4253e49..d746da19c6a2 100644 --- a/include/linux/interrupt.h +++ b/include/linux/interrupt.h @@ -55,7 +55,7 @@ * Used by threaded interrupts which need to keep the * irq line disabled until the threaded handler has been run. * IRQF_NO_SUSPEND - Do not disable this IRQ during suspend - * + * IRQF_FORCE_RESUME - Force enable it on resume even if IRQF_NO_SUSPEND is set */ #define IRQF_DISABLED 0x00000020 #define IRQF_SAMPLE_RANDOM 0x00000040 @@ -67,6 +67,7 @@ #define IRQF_IRQPOLL 0x00001000 #define IRQF_ONESHOT 0x00002000 #define IRQF_NO_SUSPEND 0x00004000 +#define IRQF_FORCE_RESUME 0x00008000 #define IRQF_TIMER (__IRQF_TIMER | IRQF_NO_SUSPEND) diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 0caa59f747dd..b4198ee8cfdf 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -282,8 +282,17 @@ EXPORT_SYMBOL(disable_irq); void __enable_irq(struct irq_desc *desc, unsigned int irq, bool resume) { - if (resume) + if (resume) { + if (!(desc->status & IRQ_SUSPENDED)) { + if (!desc->action) + return; + if (!(desc->action->flags & IRQF_FORCE_RESUME)) + return; + /* Pretend that it got disabled !
*/ + desc->depth++; + } desc->status &= ~IRQ_SUSPENDED; + } switch (desc->depth) { case 0: diff --git a/kernel/irq/pm.c b/kernel/irq/pm.c index 0d4005d85b03..d6bfb89cce91 100644 --- a/kernel/irq/pm.c +++ b/kernel/irq/pm.c @@ -53,9 +53,6 @@ void resume_device_irqs(void) for_each_irq_desc(irq, desc) { unsigned long flags; - if (!(desc->status & IRQ_SUSPENDED)) - continue; - raw_spin_lock_irqsave(&desc->lock, flags); __enable_irq(desc, irq, true); raw_spin_unlock_irqrestore(&desc->lock, flags); From 5e38ca8f3ea423442eaafe1b7e206084aa38120a Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Wed, 2 Feb 2011 13:28:18 +0100 Subject: [PATCH 205/706] tracing: Add unstable sched clock note to the warning The "Delta way too big!" warning might appear on a system with an unstable sched clock right after the system is resumed, if tracing was enabled during the suspend. Since it's not really a bug, and the unstable sched clock works fast and reliably otherwise, Steven suggested keeping the sched clock in any case and just adding a note to the warning itself. Signed-off-by: Jiri Olsa LKML-Reference: <1296649698-6003-1-git-send-email-jolsa@redhat.com> Signed-off-by: Steven Rostedt --- kernel/trace/ring_buffer.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index bd1c35a4fbcc..7739893a1d0a 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -2163,10 +2163,14 @@ rb_reserve_next_event(struct ring_buffer *buffer, delta = diff; if (unlikely(test_time_stamp(delta))) { WARN_ONCE(delta > (1ULL << 59), - KERN_WARNING "Delta way too big! %llu ts=%llu write stamp = %llu\n", + KERN_WARNING "Delta way too big! %llu ts=%llu write stamp = %llu\n%s", (unsigned long long)delta, (unsigned long long)ts, - (unsigned long long)cpu_buffer->write_stamp); + (unsigned long long)cpu_buffer->write_stamp, + sched_clock_stable ? "" : + "If you just came from a suspend/resume,\n" + "please switch to the trace global clock:\n" + " echo global > /sys/kernel/debug/tracing/trace_clock\n"); add_timestamp = 1; } } From e3087b80aa0bceda9863f33307460f3ba79f2b15 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Tue, 8 Feb 2011 15:01:39 -0200 Subject: [PATCH 206/706] perf annotate: Fix --stdio rendering The checks for not using a max_lines parameter were b0rked, problem introduced in 3653246.
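In isolation, the pattern being fixed looks like this (names assumed; a max_lines of 0 means "no limit"):

	/*
	 * Sketch only: with an optional limit where 0 means "unlimited",
	 * the cap must be guarded, otherwise "printed >= 0" is true from
	 * the very first line and nothing gets rendered.
	 */
	static int over_limit(int printed, int max_lines)
	{
		return max_lines && printed >= max_lines;	/* fixed form */
		/* b0rked form: return printed >= max_lines; */
	}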
Cc: Frederic Weisbecker Cc: Ingo Molnar Cc: Mike Galbraith Cc: Paul Mackerras Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Tom Zanussi LKML-Reference: Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/annotate.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/perf/util/annotate.c b/tools/perf/util/annotate.c index 1012841835a3..6db435167d74 100644 --- a/tools/perf/util/annotate.c +++ b/tools/perf/util/annotate.c @@ -132,7 +132,7 @@ static int objdump_line__print(struct objdump_line *oline, if (percent < min_pcnt) return -1; - if (printed >= max_lines) + if (max_lines && printed >= max_lines) return 1; color = get_percent_color(percent); @@ -154,7 +154,7 @@ static int objdump_line__print(struct objdump_line *oline, color_fprintf(stdout, color, " %7.2f", percent); printf(" : "); color_fprintf(stdout, PERF_COLOR_BLUE, "%s\n", oline->line); - } else if (printed >= max_lines) + } else if (max_lines && printed >= max_lines) return 1; else { if (!*oline->line) From ce6f4fab4059cd72638a0cfa596a8ee2c79c1c8e Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Tue, 8 Feb 2011 13:27:39 -0200 Subject: [PATCH 207/706] perf annotate: Move locking to struct annotation We'll need it when implementing the live annotate TUI browser. This also simplifies things a bit by having the list head for the source code live in the dynamically allocated part of struct annotation; that way we don't have to pass it around, it can be found from the struct symbol that is passed everywhere. Cc: Frederic Weisbecker Cc: Ingo Molnar Cc: Mike Galbraith Cc: Paul Mackerras Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Tom Zanussi LKML-Reference: Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-annotate.c | 14 +++-- tools/perf/builtin-report.c | 5 +- tools/perf/builtin-top.c | 67 ++++++++++---------- tools/perf/util/annotate.c | 85 +++++++++++++++----------- tools/perf/util/annotate.h | 34 ++++++----- tools/perf/util/hist.c | 5 +- tools/perf/util/hist.h | 3 +- tools/perf/util/top.h | 6 -- tools/perf/util/ui/browsers/annotate.c | 18 +++--- 9 files changed, 126 insertions(+), 111 deletions(-) diff --git a/tools/perf/builtin-annotate.c b/tools/perf/builtin-annotate.c index ea6a1165956f..427182953fd7 100644 --- a/tools/perf/builtin-annotate.c +++ b/tools/perf/builtin-annotate.c @@ -62,7 +62,8 @@ static int hists__add_entry(struct hists *self, struct addr_location *al) * All aggregated on the first sym_hist.
*/ struct annotation *notes = symbol__annotation(he->ms.sym); - if (notes->histograms == NULL && symbol__alloc_hist(he->ms.sym, 1) < 0) + if (notes->src == NULL && + symbol__alloc_hist(he->ms.sym, 1) < 0) return -ENOMEM; return hist_entry__inc_addr_samples(he, 0, al->addr); @@ -77,7 +78,8 @@ static int process_sample_event(union perf_event *event, { struct addr_location al; - if (perf_event__preprocess_sample(event, session, &al, sample, NULL) < 0) { + if (perf_event__preprocess_sample(event, session, &al, sample, + symbol__annotate_init) < 0) { pr_warning("problem processing %d event, skipping it.\n", event->header.type); return -1; @@ -111,7 +113,7 @@ static void hists__find_annotations(struct hists *self) goto find_next; notes = symbol__annotation(he->ms.sym); - if (notes->histograms == NULL) { + if (notes->src == NULL) { find_next: if (key == KEY_LEFT) nd = rb_prev(nd); @@ -142,11 +144,11 @@ find_next: nd = rb_next(nd); /* * Since we have a hist_entry per IP for the same - * symbol, free he->ms.sym->histogram to signal we already + * symbol, free he->ms.sym->src to signal we already * processed this symbol. */ - free(notes->histograms); - notes->histograms = NULL; + free(notes->src); + notes->src = NULL; } } } diff --git a/tools/perf/builtin-report.c b/tools/perf/builtin-report.c index de06bf55efff..f403aced4cba 100644 --- a/tools/perf/builtin-report.c +++ b/tools/perf/builtin-report.c @@ -123,7 +123,7 @@ static int perf_session__add_hist_entry(struct perf_session *session, * All aggregated on the first sym_hist. */ struct annotation *notes = symbol__annotation(he->ms.sym); - if (notes->histograms == NULL && + if (notes->src == NULL && symbol__alloc_hist(he->ms.sym, 1) < 0) err = -ENOMEM; else @@ -166,7 +166,8 @@ static int process_sample_event(union perf_event *event, struct addr_location al; struct perf_event_attr *attr; - if (perf_event__preprocess_sample(event, session, &al, sample, NULL) < 0) { + if (perf_event__preprocess_sample(event, session, &al, sample, + symbol__annotate_init) < 0) { fprintf(stderr, "problem processing %d event, skipping it.\n", event->header.type); return -1; diff --git a/tools/perf/builtin-top.c b/tools/perf/builtin-top.c index b790673cb0aa..7dbf22d096b8 100644 --- a/tools/perf/builtin-top.c +++ b/tools/perf/builtin-top.c @@ -139,7 +139,7 @@ static void sig_winch_handler(int sig __used) static int parse_source(struct sym_entry *syme) { struct symbol *sym; - struct sym_entry_source *source; + struct annotation *notes; struct map *map; int err = -1; @@ -152,39 +152,35 @@ static int parse_source(struct sym_entry *syme) /* * We can't annotate with just /proc/kallsyms */ - if (map->dso->origin == DSO__ORIG_KERNEL) + if (map->dso->origin == DSO__ORIG_KERNEL) { + pr_err("Can't annotate %s: No vmlinux file was found in the " + "path\n", sym->name); + sleep(1); return -1; - - if (syme->src == NULL) { - syme->src = zalloc(sizeof(*source)); - if (syme->src == NULL) - return -1; - pthread_mutex_init(&syme->src->lock, NULL); - INIT_LIST_HEAD(&syme->src->head); } - source = syme->src; - - if (symbol__annotation(sym)->histograms != NULL) { - pthread_mutex_lock(&source->lock); + notes = symbol__annotation(sym); + if (notes->src != NULL) { + pthread_mutex_lock(¬es->lock); goto out_assign; } - pthread_mutex_lock(&source->lock); + pthread_mutex_lock(¬es->lock); if (symbol__alloc_hist(sym, top.evlist->nr_entries) < 0) { pr_err("Not enough memory for annotating '%s' symbol!\n", sym->name); + sleep(1); goto out_unlock; } - err = symbol__annotate(sym, syme->map, &source->head, 
0); + err = symbol__annotate(sym, syme->map, 0); if (err == 0) { out_assign: sym_filter_entry = syme; } out_unlock: - pthread_mutex_unlock(&source->lock); + pthread_mutex_unlock(&notes->lock); return err; } @@ -196,20 +192,27 @@ static void __zero_source_counters(struct sym_entry *syme) static void record_precise_ip(struct sym_entry *syme, int counter, u64 ip) { + struct annotation *notes; + struct symbol *sym; + if (syme != sym_filter_entry) return; - if (pthread_mutex_trylock(&syme->src->lock)) + sym = sym_entry__symbol(syme); + notes = symbol__annotation(sym); + + if (pthread_mutex_trylock(&notes->lock)) return; ip = syme->map->map_ip(syme->map, ip); - symbol__inc_addr_samples(sym_entry__symbol(syme), syme->map, counter, ip); + symbol__inc_addr_samples(sym, syme->map, counter, ip); - pthread_mutex_unlock(&syme->src->lock); + pthread_mutex_unlock(&notes->lock); } static void show_details(struct sym_entry *syme) { + struct annotation *notes; struct symbol *symbol; int more; @@ -217,24 +220,26 @@ static void show_details(struct sym_entry *syme) return; symbol = sym_entry__symbol(syme); - if (!syme->src || symbol__annotation(symbol)->histograms == NULL) - return; + notes = symbol__annotation(symbol); + + pthread_mutex_lock(&notes->lock); + + if (notes->src == NULL) + goto out_unlock; printf("Showing %s for %s\n", event_name(top.sym_evsel), symbol->name); printf(" Events Pcnt (>=%d%%)\n", sym_pcnt_filter); - pthread_mutex_lock(&syme->src->lock); - more = symbol__annotate_printf(symbol, syme->map, &syme->src->head, - top.sym_evsel->idx, 0, sym_pcnt_filter, - top.print_entries); + more = symbol__annotate_printf(symbol, syme->map, top.sym_evsel->idx, + 0, sym_pcnt_filter, top.print_entries); if (top.zero) symbol__annotate_zero_histogram(symbol, top.sym_evsel->idx); else - symbol__annotate_decay_histogram(symbol, &syme->src->head, - top.sym_evsel->idx); - pthread_mutex_unlock(&syme->src->lock); + symbol__annotate_decay_histogram(symbol, top.sym_evsel->idx); if (more != 0) printf("%d lines not displayed, maybe increase display entries [e]\n", more); +out_unlock: + pthread_mutex_unlock(&notes->lock); } static const char CONSOLE_CLEAR[] = ""; @@ -372,10 +377,8 @@ static void prompt_symbol(struct sym_entry **target, const char *msg) /* zero counters of active symbol */ if (syme) { - pthread_mutex_lock(&syme->src->lock); __zero_source_counters(syme); *target = NULL; - pthread_mutex_unlock(&syme->src->lock); } fprintf(stdout, "\n%s: ", msg); @@ -554,10 +557,8 @@ static void handle_keypress(struct perf_session *session, int c) else { struct sym_entry *syme = sym_filter_entry; - pthread_mutex_lock(&syme->src->lock); sym_filter_entry = NULL; __zero_source_counters(syme); - pthread_mutex_unlock(&syme->src->lock); } break; case 'U': @@ -653,7 +654,7 @@ static int symbol_filter(struct map *map, struct symbol *sym) syme = symbol__priv(sym); syme->map = map; - syme->src = NULL; + symbol__annotate_init(map, sym); if (!sym_filter_entry && sym_filter && !strcmp(name, sym_filter)) { /* schedule initial sym_filter_entry setup */ diff --git a/tools/perf/util/annotate.c b/tools/perf/util/annotate.c index 6db435167d74..c777bdaf91da 100644 --- a/tools/perf/util/annotate.c +++ b/tools/perf/util/annotate.c @@ -14,25 +14,39 @@ #include "symbol.h" #include "debug.h" #include "annotate.h" +#include + +int symbol__annotate_init(struct map *map __used, struct symbol *sym) +{ + struct annotation *notes = symbol__annotation(sym); + pthread_mutex_init(&notes->lock, NULL); + return 0; +} int symbol__alloc_hist(struct symbol *sym, int nevents) {
struct annotation *notes = symbol__annotation(sym); - - notes->sizeof_sym_hist = (sizeof(*notes->histograms) + + size_t sizeof_sym_hist = (sizeof(struct sym_hist) + (sym->end - sym->start) * sizeof(u64)); - notes->histograms = calloc(nevents, notes->sizeof_sym_hist); - notes->nr_histograms = nevents; - return notes->histograms == NULL ? -1 : 0; + + notes->src = zalloc(sizeof(*notes->src) + nevents * sizeof_sym_hist); + if (notes->src == NULL) + return -1; + notes->src->sizeof_sym_hist = sizeof_sym_hist; + notes->src->nr_histograms = nevents; + INIT_LIST_HEAD(&notes->src->source); + return 0; } void symbol__annotate_zero_histograms(struct symbol *sym) { struct annotation *notes = symbol__annotation(sym); - if (notes->histograms != NULL) - memset(notes->histograms, 0, - notes->nr_histograms * notes->sizeof_sym_hist); + pthread_mutex_lock(&notes->lock); + if (notes->src != NULL) + memset(notes->src->histograms, 0, + notes->src->nr_histograms * notes->src->sizeof_sym_hist); + pthread_mutex_unlock(&notes->lock); } int symbol__inc_addr_samples(struct symbol *sym, struct map *map, @@ -43,7 +57,7 @@ int symbol__inc_addr_samples(struct symbol *sym, struct map *map, struct sym_hist *h; notes = symbol__annotation(sym); - if (notes->histograms == NULL) + if (notes->src == NULL) return -ENOMEM; pr_debug3("%s: addr=%#" PRIx64 "\n", __func__, map->unmap_ip(map, addr)); @@ -95,8 +109,7 @@ struct objdump_line *objdump__get_next_ip_line(struct list_head *head, return NULL; } -static int objdump_line__print(struct objdump_line *oline, - struct list_head *head, struct symbol *sym, +static int objdump_line__print(struct objdump_line *oline, struct symbol *sym, int evidx, u64 len, int min_pcnt, int printed, int max_lines) { @@ -109,10 +122,12 @@ static int objdump_line__print(struct objdump_line *oline, double percent = 0.0; const char *color; struct annotation *notes = symbol__annotation(sym); - struct source_line *src_line = notes->src_line; + struct source_line *src_line = notes->src->lines; struct sym_hist *h = annotation__histogram(notes, evidx); s64 offset = oline->offset; - struct objdump_line *next = objdump__get_next_ip_line(head, oline); + struct objdump_line *next; + + next = objdump__get_next_ip_line(&notes->src->source, oline); while (offset < (s64)len && (next == NULL || offset < next->offset)) { @@ -166,9 +181,10 @@ static int objdump_line__print(struct objdump_line *oline, return 0; } -static int symbol__parse_objdump_line(struct symbol *sym, struct map *map, FILE *file, - struct list_head *head, size_t privsize) +static int symbol__parse_objdump_line(struct symbol *sym, struct map *map, + FILE *file, size_t privsize) { + struct annotation *notes = symbol__annotation(sym); struct objdump_line *objdump_line; char *line = NULL, *tmp, *tmp2, *c; size_t line_len; @@ -222,13 +238,12 @@ static int symbol__parse_objdump_line(struct symbol *sym, struct map *map, FILE free(line); return -1; } - objdump__add_line(head, objdump_line); + objdump__add_line(&notes->src->source, objdump_line); return 0; } -int symbol__annotate(struct symbol *sym, struct map *map, - struct list_head *head, size_t privsize) +int symbol__annotate(struct symbol *sym, struct map *map, size_t privsize) { struct dso *dso = map->dso; char *filename = dso__build_id_filename(dso, NULL, 0); @@ -297,7 +312,7 @@ fallback: goto out_free_filename; while (!feof(file)) - if (symbol__parse_objdump_line(sym, map, file, head, privsize) < 0) + if (symbol__parse_objdump_line(sym, map, file, privsize) < 0) break; pclose(file); @@ -330,14 +345,14 @@ static void
insert_source_line(struct rb_root *root, struct source_line *src_lin static void symbol__free_source_line(struct symbol *sym, int len) { struct annotation *notes = symbol__annotation(sym); - struct source_line *src_line = notes->src_line; + struct source_line *src_line = notes->src->lines; int i; for (i = 0; i < len; i++) free(src_line[i].path); free(src_line); - notes->src_line = NULL; + notes->src->lines = NULL; } /* Get the filename:line for the colored entries */ @@ -355,8 +370,8 @@ static int symbol__get_source_line(struct symbol *sym, struct map *map, if (!h->sum) return 0; - src_line = notes->src_line = calloc(len, sizeof(struct source_line)); - if (!notes->src_line) + src_line = notes->src->lines = calloc(len, sizeof(struct source_line)); + if (!notes->src->lines) return -1; start = map->unmap_ip(map, sym->start); @@ -436,12 +451,12 @@ static void symbol__annotate_hits(struct symbol *sym, int evidx) printf("%*s: %" PRIu64 "\n", BITS_PER_LONG / 2, "h->sum", h->sum); } -int symbol__annotate_printf(struct symbol *sym, struct map *map, - struct list_head *head, int evidx, bool full_paths, - int min_pcnt, int max_lines) +int symbol__annotate_printf(struct symbol *sym, struct map *map, int evidx, + bool full_paths, int min_pcnt, int max_lines) { struct dso *dso = map->dso; const char *filename = dso->long_name, *d_filename; + struct annotation *notes = symbol__annotation(sym); struct objdump_line *pos; int printed = 2; int more = 0; @@ -460,8 +475,8 @@ int symbol__annotate_printf(struct symbol *sym, struct map *map, if (verbose) symbol__annotate_hits(sym, evidx); - list_for_each_entry(pos, head, node) { - switch (objdump_line__print(pos, head, sym, evidx, len, min_pcnt, + list_for_each_entry(pos, &notes->src->source, node) { + switch (objdump_line__print(pos, sym, evidx, len, min_pcnt, printed, max_lines)) { case 0: ++printed; @@ -485,11 +500,10 @@ void symbol__annotate_zero_histogram(struct symbol *sym, int evidx) struct annotation *notes = symbol__annotation(sym); struct sym_hist *h = annotation__histogram(notes, evidx); - memset(h, 0, notes->sizeof_sym_hist); + memset(h, 0, notes->src->sizeof_sym_hist); } -void symbol__annotate_decay_histogram(struct symbol *sym, - struct list_head *head, int evidx) +void symbol__annotate_decay_histogram(struct symbol *sym, int evidx) { struct annotation *notes = symbol__annotation(sym); struct sym_hist *h = annotation__histogram(notes, evidx); @@ -497,7 +511,7 @@ void symbol__annotate_decay_histogram(struct symbol *sym, h->sum = 0; - list_for_each_entry(pos, head, node) { + list_for_each_entry(pos, &notes->src->source, node) { if (pos->offset != -1) { h->addr[pos->offset] = h->addr[pos->offset] * 7 / 8; h->sum += h->addr[pos->offset]; @@ -522,10 +536,9 @@ int symbol__tty_annotate(struct symbol *sym, struct map *map, int evidx, struct dso *dso = map->dso; const char *filename = dso->long_name; struct rb_root source_line = RB_ROOT; - LIST_HEAD(head); u64 len; - if (symbol__annotate(sym, map, &head, 0) < 0) + if (symbol__annotate(sym, map, 0) < 0) return -1; len = sym->end - sym->start; @@ -536,12 +549,12 @@ int symbol__tty_annotate(struct symbol *sym, struct map *map, int evidx, print_summary(&source_line, filename); } - symbol__annotate_printf(sym, map, &head, evidx, full_paths, + symbol__annotate_printf(sym, map, evidx, full_paths, min_pcnt, max_lines); if (print_lines) symbol__free_source_line(sym, len); - objdump_line_list__purge(&head); + objdump_line_list__purge(&symbol__annotation(sym)->src->source); return 0; } diff --git a/tools/perf/util/annotate.h
b/tools/perf/util/annotate.h index bc08b36a713a..b237c8678c22 100644 --- a/tools/perf/util/annotate.h +++ b/tools/perf/util/annotate.h @@ -28,22 +28,29 @@ struct source_line { char *path; }; -/** struct annotation - symbols with hits have this attached as in sannotation +/** struct annotated_source - symbols with hits have this attached as in sannotation * * @histogram: Array of addr hit histograms per event being monitored - * @src_line: If 'print_lines' is specified, per source code line percentages + * @lines: If 'print_lines' is specified, per source code line percentages + * @source: source parsed from objdump -dS * - * src_line is allocated, percentages calculated and all sorted by percentage + * lines is allocated, percentages calculated and all sorted by percentage * when the annotation is about to be presented, so the percentages are for * one of the entries in the histogram array, i.e. for the event/counter being * presented. It is deallocated right after symbol__{tui,tty,etc}_annotate * returns. */ -struct annotation { - struct source_line *src_line; - struct sym_hist *histograms; +struct annotated_source { + struct list_head source; + struct source_line *lines; int nr_histograms; int sizeof_sym_hist; + struct sym_hist histograms[0]; +}; + +struct annotation { + pthread_mutex_t lock; + struct annotated_source *src; }; struct sannotation { @@ -53,7 +60,8 @@ struct sannotation { static inline struct sym_hist *annotation__histogram(struct annotation *notes, int idx) { - return ((void *)notes->histograms) + (notes->sizeof_sym_hist * idx); + return (((void *)&notes->src->histograms) + + (notes->src->sizeof_sym_hist * idx)); } static inline struct annotation *symbol__annotation(struct symbol *sym) @@ -67,14 +75,12 @@ int symbol__inc_addr_samples(struct symbol *sym, struct map *map, int symbol__alloc_hist(struct symbol *sym, int nevents); void symbol__annotate_zero_histograms(struct symbol *sym); -int symbol__annotate(struct symbol *sym, struct map *map, - struct list_head *head, size_t privsize); -int symbol__annotate_printf(struct symbol *sym, struct map *map, - struct list_head *head, int evidx, bool full_paths, - int min_pcnt, int max_lines); +int symbol__annotate(struct symbol *sym, struct map *map, size_t privsize); +int symbol__annotate_init(struct map *map __used, struct symbol *sym); +int symbol__annotate_printf(struct symbol *sym, struct map *map, int evidx, + bool full_paths, int min_pcnt, int max_lines); void symbol__annotate_zero_histogram(struct symbol *sym, int evidx); -void symbol__annotate_decay_histogram(struct symbol *sym, - struct list_head *head, int evidx); +void symbol__annotate_decay_histogram(struct symbol *sym, int evidx); void objdump_line_list__purge(struct list_head *head); int symbol__tty_annotate(struct symbol *sym, struct map *map, int evidx, diff --git a/tools/perf/util/hist.c b/tools/perf/util/hist.c index bac5ab684967..3f437236f193 100644 --- a/tools/perf/util/hist.c +++ b/tools/perf/util/hist.c @@ -955,10 +955,9 @@ int hist_entry__inc_addr_samples(struct hist_entry *he, int evidx, u64 ip) return symbol__inc_addr_samples(he->ms.sym, he->ms.map, evidx, ip); } -int hist_entry__annotate(struct hist_entry *he, struct list_head *head, - size_t privsize) +int hist_entry__annotate(struct hist_entry *he, size_t privsize) { - return symbol__annotate(he->ms.sym, he->ms.map, head, privsize); + return symbol__annotate(he->ms.sym, he->ms.map, privsize); } void hists__inc_nr_events(struct hists *self, u32 type) diff --git a/tools/perf/util/hist.h
b/tools/perf/util/hist.h index 2c6cdae6a764..37c79089de09 100644 --- a/tools/perf/util/hist.h +++ b/tools/perf/util/hist.h @@ -78,8 +78,7 @@ size_t hists__fprintf(struct hists *self, struct hists *pair, bool show_displacement, FILE *fp); int hist_entry__inc_addr_samples(struct hist_entry *self, int evidx, u64 addr); -int hist_entry__annotate(struct hist_entry *self, struct list_head *head, - size_t privsize); +int hist_entry__annotate(struct hist_entry *self, size_t privsize); void hists__filter_by_dso(struct hists *self, const struct dso *dso); void hists__filter_by_thread(struct hists *self, const struct thread *thread); diff --git a/tools/perf/util/top.h b/tools/perf/util/top.h index 62e32939f3d1..4f769f47e19a 100644 --- a/tools/perf/util/top.h +++ b/tools/perf/util/top.h @@ -11,11 +11,6 @@ struct perf_evlist; struct perf_evsel; -struct sym_entry_source { - struct list_head head; - pthread_mutex_t lock; -}; - struct sym_entry { struct rb_node rb_node; struct list_head node; @@ -24,7 +19,6 @@ struct sym_entry { int skip; u8 origin; struct map *map; - struct sym_entry_source *src; unsigned long count[0]; }; diff --git a/tools/perf/util/ui/browsers/annotate.c b/tools/perf/util/ui/browsers/annotate.c index 8d8a16895af7..1aa39658539c 100644 --- a/tools/perf/util/ui/browsers/annotate.c +++ b/tools/perf/util/ui/browsers/annotate.c @@ -60,7 +60,6 @@ static void annotate_browser__write(struct ui_browser *self, void *entry, int ro } static double objdump_line__calc_percent(struct objdump_line *self, - struct list_head *head, struct symbol *sym, int evidx) { double percent = 0.0; @@ -69,11 +68,12 @@ static double objdump_line__calc_percent(struct objdump_line *self, int len = sym->end - sym->start; unsigned int hits = 0; struct annotation *notes = symbol__annotation(sym); - struct source_line *src_line = notes->src_line; + struct source_line *src_line = notes->src->lines; struct sym_hist *h = annotation__histogram(notes, evidx); s64 offset = self->offset; - struct objdump_line *next = objdump__get_next_ip_line(head, self); + struct objdump_line *next; + next = objdump__get_next_ip_line(&notes->src->source, self); while (offset < (s64)len && (next == NULL || offset < next->offset)) { if (src_line) { @@ -192,10 +192,10 @@ int symbol__tui_annotate(struct symbol *sym, struct map *map, int evidx) { struct objdump_line *pos, *n; struct objdump_line_rb_node *rbpos; - LIST_HEAD(head); + struct annotation *notes = symbol__annotation(sym); struct annotate_browser browser = { .b = { - .entries = &head, + .entries = &notes->src->source, .refresh = ui_browser__list_head_refresh, .seek = ui_browser__list_head_seek, .write = annotate_browser__write, @@ -210,20 +210,20 @@ int symbol__tui_annotate(struct symbol *sym, struct map *map, int evidx) if (map->dso->annotate_warned) return -1; - if (symbol__annotate(sym, map, &head, sizeof(*rbpos)) < 0) { + if (symbol__annotate(sym, map, sizeof(*rbpos)) < 0) { ui__error_window(ui_helpline__last_msg); return -1; } ui_helpline__push("Press <- or ESC to exit"); - list_for_each_entry(pos, &head, node) { + list_for_each_entry(pos, &notes->src->source, node) { size_t line_len = strlen(pos->line); if (browser.b.width < line_len) browser.b.width = line_len; rbpos = objdump_line__rb(pos); rbpos->idx = browser.b.nr_entries++; - rbpos->percent = objdump_line__calc_percent(pos, &head, sym, evidx); + rbpos->percent = objdump_line__calc_percent(pos, sym, evidx); if (rbpos->percent < 0.01) continue; objdump__insert_line(&browser.entries, rbpos); @@ -238,7 +238,7 @@ int
symbol__tui_annotate(struct symbol *sym, struct map *map, int evidx) browser.b.width += 18; /* Percentage */ ret = annotate_browser__run(&browser); - list_for_each_entry_safe(pos, n, &head, node) { + list_for_each_entry_safe(pos, n, &notes->src->source, node) { list_del(&pos->node); objdump_line__free(pos); } From d5e3d747007fdb541e57ed72e020ff0b94db3470 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Tue, 8 Feb 2011 15:29:25 -0200 Subject: [PATCH 208/706] perf annotate: Fix annotate context lines regression The live annotation done in 'perf top' needs to limit the context before lines that aren't filtered out by the min percent filter; if we don't do that, the screen in a tty often is not enough to show what is interesting: lines with hits and a few source code lines before them. Reported-by: Mike Galbraith Cc: Frederic Weisbecker Cc: Ingo Molnar Cc: Mike Galbraith Cc: Paul Mackerras Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Tom Zanussi LKML-Reference: Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-top.c | 2 +- tools/perf/util/annotate.c | 47 ++++++++++++++++++++++++++++++------ tools/perf/util/annotate.h | 3 ++- 3 files changed, 43 insertions(+), 9 deletions(-) diff --git a/tools/perf/builtin-top.c b/tools/perf/builtin-top.c index 7dbf22d096b8..210c736e6db4 100644 --- a/tools/perf/builtin-top.c +++ b/tools/perf/builtin-top.c @@ -231,7 +231,7 @@ static void show_details(struct sym_entry *syme) printf(" Events Pcnt (>=%d%%)\n", sym_pcnt_filter); more = symbol__annotate_printf(symbol, syme->map, top.sym_evsel->idx, - 0, sym_pcnt_filter, top.print_entries); + 0, sym_pcnt_filter, top.print_entries, 4); if (top.zero) symbol__annotate_zero_histogram(symbol, top.sym_evsel->idx); else diff --git a/tools/perf/util/annotate.c b/tools/perf/util/annotate.c index c777bdaf91da..02976b895f27 100644 --- a/tools/perf/util/annotate.c +++ b/tools/perf/util/annotate.c @@ -111,7 +111,8 @@ struct objdump_line *objdump__get_next_ip_line(struct list_head *head, static int objdump_line__print(struct objdump_line *oline, struct symbol *sym, int evidx, u64 len, int min_pcnt, - int printed, int max_lines) + int printed, int max_lines, + struct objdump_line *queue) { static const char *prev_line; static const char *prev_color; @@ -150,6 +151,15 @@ static int objdump_line__print(struct objdump_line *oline, struct symbol *sym, if (max_lines && printed >= max_lines) return 1; + if (queue != NULL) { + list_for_each_entry_from(queue, &notes->src->source, node) { + if (queue == oline) + break; + objdump_line__print(queue, sym, evidx, len, + 0, 0, 1, NULL); + } + } + color = get_percent_color(percent); /* @@ -172,6 +182,9 @@ static int objdump_line__print(struct objdump_line *oline, struct symbol *sym, } else if (max_lines && printed >= max_lines) return 1; else { + if (queue) + return -1; + if (!*oline->line) printf(" :\n"); else @@ -452,13 +465,14 @@ static void symbol__annotate_hits(struct symbol *sym, int evidx) } int symbol__annotate_printf(struct symbol *sym, struct map *map, int evidx, - bool full_paths, int min_pcnt, int max_lines) + bool full_paths, int min_pcnt, int max_lines, + int context) { struct dso *dso = map->dso; const char *filename = dso->long_name, *d_filename; struct annotation *notes = symbol__annotation(sym); - struct objdump_line *pos; - int printed = 2; + struct objdump_line *pos, *queue = NULL; + int printed = 2, queue_len = 0; int more = 0; u64 len; @@ -476,10 +490,20 @@ int symbol__annotate_printf(struct symbol *sym, struct map *map, int evidx,
symbol__annotate_hits(sym, evidx); list_for_each_entry(pos, &notes->src->source, node) { + if (context && queue == NULL) { + queue = pos; + queue_len = 0; + } + switch (objdump_line__print(pos, sym, evidx, len, min_pcnt, - printed, max_lines)) { + printed, max_lines, queue)) { case 0: ++printed; + if (context) { + printed += queue_len; + queue = NULL; + queue_len = 0; + } break; case 1: /* filtered by max_lines */ @@ -487,7 +511,16 @@ int symbol__annotate_printf(struct symbol *sym, struct map *map, int evidx, break; case -1: default: - /* filtered by min_pcnt */ + /* + * Filtered by min_pcnt or non IP lines when + * context != 0 + */ + if (!context) + break; + if (queue_len == context) + queue = list_entry(queue->node.next, typeof(*queue), node); + else + ++queue_len; break; } } @@ -550,7 +583,7 @@ int symbol__tty_annotate(struct symbol *sym, struct map *map, int evidx, } symbol__annotate_printf(sym, map, evidx, full_paths, - min_pcnt, max_lines); + min_pcnt, max_lines, 0); if (print_lines) symbol__free_source_line(sym, len); diff --git a/tools/perf/util/annotate.h b/tools/perf/util/annotate.h index b237c8678c22..e848803fcd48 100644 --- a/tools/perf/util/annotate.h +++ b/tools/perf/util/annotate.h @@ -78,7 +78,8 @@ void symbol__annotate_zero_histograms(struct symbol *sym); int symbol__annotate(struct symbol *sym, struct map *map, size_t privsize); int symbol__annotate_init(struct map *map __used, struct symbol *sym); int symbol__annotate_printf(struct symbol *sym, struct map *map, int evidx, - bool full_paths, int min_pcnt, int max_lines); + bool full_paths, int min_pcnt, int max_lines, + int context); void symbol__annotate_zero_histogram(struct symbol *sym, int evidx); void symbol__annotate_decay_histogram(struct symbol *sym, int evidx); void objdump_line_list__purge(struct list_head *head); From c305d524e5dd3c3c7a6035083e30950bea1b52dc Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 2 Feb 2011 17:10:48 +0100 Subject: [PATCH 209/706] softirq: Avoid stack switch from ksoftirqd ksoftirqd() calls do_softirq() which switches stacks on several architectures. That makes no sense at all. ksoftirqd's stack is sufficient. Call __do_softirq() directly. Signed-off-by: Thomas Gleixner Acked-by: Peter Zijlstra Cc: Benjamin Herrenschmidt Cc: Heiko Carstens Acked-by: David Miller Cc: Paul Mundt Reviewed-by: Frank Rowand LKML-Reference: --- kernel/softirq.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/kernel/softirq.c b/kernel/softirq.c index 68eb5efec388..c0490464e92f 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -738,7 +738,10 @@ static int run_ksoftirqd(void * __bind_cpu) don't process */ if (cpu_is_offline((long)__bind_cpu)) goto wait_to_die; - do_softirq(); + local_irq_disable(); + if (local_softirq_pending()) + __do_softirq(); + local_irq_enable(); preempt_enable_no_resched(); cond_resched(); preempt_disable(); From 44951a60ff888add9e84f509ffce20052e45af94 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 4 Feb 2011 17:33:49 +0100 Subject: [PATCH 210/706] genirq: Remove dead code CONFIG_KSTAT_IRQS_ONDEMAND does not exist. It's not worth implementing. Use sparse irqs if you care about memory consumption of the interrupt layer.
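A trivial standalone illustration of such config-conditional dead code, using a made-up CONFIG symbol (no Kconfig file declares it, so the preprocessor never defines it and the block is compiled out on every build):

/* Hypothetical sketch, not kernel code: CONFIG_FOO_ONDEMAND does not
 * exist in any Kconfig, so this block can never be compiled in. */
#if defined(CONFIG_FOO_ONDEMAND)
static int foo_alloc_on_demand(void)
{
        return 0;
}
#endif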
Found by undertaker: http://vamos.informatik.uni-erlangen.de/trac/undertaker Signed-off-by: Thomas Gleixner --- kernel/irq/irqdesc.c | 17 ----------------- 1 file changed, 17 deletions(-) diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c index 282f20230e67..a7ac6e1e7074 100644 --- a/kernel/irq/irqdesc.c +++ b/kernel/irq/irqdesc.c @@ -251,7 +251,6 @@ int __init early_irq_init(void) for (i = 0; i < count; i++) { desc[i].irq_data.irq = i; desc[i].irq_data.chip = &no_irq_chip; - /* TODO : do this allocation on-demand ... */ desc[i].kstat_irqs = alloc_percpu(unsigned int); alloc_masks(desc + i, GFP_KERNEL, node); desc_smp_init(desc + i, node); @@ -277,22 +276,6 @@ static void free_desc(unsigned int irq) static inline int alloc_descs(unsigned int start, unsigned int cnt, int node) { -#if defined(CONFIG_KSTAT_IRQS_ONDEMAND) - struct irq_desc *desc; - unsigned int i; - - for (i = 0; i < cnt; i++) { - desc = irq_to_desc(start + i); - if (desc && !desc->kstat_irqs) { - unsigned int __percpu *stats = alloc_percpu(unsigned int); - - if (!stats) - return -1; - if (cmpxchg(&desc->kstat_irqs, NULL, stats) != NULL) - free_percpu(stats); - } - } -#endif return start; } #endif /* !CONFIG_SPARSE_IRQ */ From cce1dac871f387d0f3da81440d85bd387d8fd5a6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Uwe=20Kleine-K=C3=B6nig?= Date: Mon, 24 Jan 2011 21:12:01 +0100 Subject: [PATCH 211/706] trivial: Fix Steven's Copyright typos MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit OK, the copyright allows you to write a copy, still I think the lawyers prefer the correct spelling. Signed-off-by: Uwe Kleine-König LKML-Reference: <1295899921-11333-1-git-send-email-u.kleine-koenig@pengutronix.de> Signed-off-by: Steven Rostedt --- scripts/kconfig/streamline_config.pl | 2 +- tools/testing/ktest/ktest.pl | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/scripts/kconfig/streamline_config.pl b/scripts/kconfig/streamline_config.pl index fd81fc33d633..a4fe923c0131 100644 --- a/scripts/kconfig/streamline_config.pl +++ b/scripts/kconfig/streamline_config.pl @@ -1,6 +1,6 @@ #!/usr/bin/perl -w # -# Copywrite 2005-2009 - Steven Rostedt +# Copyright 2005-2009 - Steven Rostedt # Licensed under the terms of the GNU GPL License version 2 # # It's simple enough to figure out how this works. diff --git a/tools/testing/ktest/ktest.pl b/tools/testing/ktest/ktest.pl index e1c62eeb88f5..ba7c63af6f3b 100755 --- a/tools/testing/ktest/ktest.pl +++ b/tools/testing/ktest/ktest.pl @@ -1,6 +1,6 @@ #!/usr/bin/perl -w # -# Copywrite 2010 - Steven Rostedt , Red Hat Inc. +# Copyright 2010 - Steven Rostedt , Red Hat Inc. # Licensed under the terms of the GNU GPL License version 2 # From f4d5c029bd6731baac0937324cef0f746e7d5ea7 Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Wed, 26 Jan 2011 16:49:00 +0800 Subject: [PATCH 212/706] tracing: Compile time initialization for event flags value Compile time initialization is better than runtime initialization. Remove many early_initcall()s and many trace_init_flags_##name()s. 
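A standalone sketch of the difference, with made-up names rather than the ftrace structures: the runtime variant needs an extra function run at boot to patch the flags field, while the compile-time variant bakes the flag into the static initializer so it is present from link time on.

#include <stdio.h>

/* Hypothetical event type; the names are illustrative only. */
struct event {
        const char *name;
        unsigned int flags;
};

#define EVENT_FL_CAP_ANY 0x1

/* Runtime variant: the flag appears only after this init function runs. */
static struct event ev_runtime = { .name = "open" };
static int ev_runtime_init(void)
{
        ev_runtime.flags |= EVENT_FL_CAP_ANY;
        return 0;
}

/* Compile-time variant: the flag is part of the object from the start. */
static struct event ev_compiletime = {
        .name  = "open",
        .flags = EVENT_FL_CAP_ANY,
};

int main(void)
{
        ev_runtime_init();
        printf("%u %u\n", ev_runtime.flags, ev_compiletime.flags);
        return 0;
}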
Acked-by: Frederic Weisbecker Signed-off-by: Lai Jiangshan LKML-Reference: <4D3FDFFC.6030304@cn.fujitsu.com> Signed-off-by: Steven Rostedt --- include/linux/syscalls.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h index 8e8968e74544..a17fcea2ca58 100644 --- a/include/linux/syscalls.h +++ b/include/linux/syscalls.h @@ -132,11 +132,11 @@ extern struct trace_event_functions exit_syscall_print_funcs; .class = &event_class_syscall_enter, \ .event.funcs = &enter_syscall_print_funcs, \ .data = (void *)&__syscall_meta_##sname,\ + .flags = TRACE_EVENT_FL_CAP_ANY, \ }; \ static struct ftrace_event_call __used \ __attribute__((section("_ftrace_events"))) \ - *__event_enter_##sname = &event_enter_##sname; \ - __TRACE_EVENT_FLAGS(enter_##sname, TRACE_EVENT_FL_CAP_ANY) + *__event_enter_##sname = &event_enter_##sname; #define SYSCALL_TRACE_EXIT_EVENT(sname) \ static struct syscall_metadata __syscall_meta_##sname; \ @@ -146,11 +146,11 @@ extern struct trace_event_functions exit_syscall_print_funcs; .class = &event_class_syscall_exit, \ .event.funcs = &exit_syscall_print_funcs, \ .data = (void *)&__syscall_meta_##sname,\ + .flags = TRACE_EVENT_FL_CAP_ANY, \ }; \ static struct ftrace_event_call __used \ __attribute__((section("_ftrace_events"))) \ - *__event_exit_##sname = &event_exit_##sname; \ - __TRACE_EVENT_FLAGS(exit_##sname, TRACE_EVENT_FL_CAP_ANY) + *__event_exit_##sname = &event_exit_##sname; #define SYSCALL_METADATA(sname, nb) \ SYSCALL_TRACE_ENTER_EVENT(sname); \ From 87d80de2800d087ea833cb79bc13f85ff34ed49f Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Tue, 8 Feb 2011 13:19:49 -0500 Subject: [PATCH 213/706] tracing: Remove obsolete sched_switch tracer The trace events sched_switch and sched_wakeup do the same thing as the stand alone sched_switch tracer does. It is no longer needed. Signed-off-by: Steven Rostedt --- Documentation/trace/ftrace.txt | 110 ------------------------------ kernel/trace/trace_sched_switch.c | 48 ------------- 2 files changed, 158 deletions(-) diff --git a/Documentation/trace/ftrace.txt b/Documentation/trace/ftrace.txt index 557c1edeccaf..65eddb7cfa02 100644 --- a/Documentation/trace/ftrace.txt +++ b/Documentation/trace/ftrace.txt @@ -202,10 +202,6 @@ Here is the list of current tracers that may be configured. to draw a graph of function calls similar to C code source. - "sched_switch" - - Traces the context switches and wakeups between tasks. - "irqsoff" Traces the areas that disable interrupts and saves @@ -273,39 +269,6 @@ format, the function name that was traced "path_put" and the parent function that called this function "path_walk". The timestamp is the time at which the function was entered. -The sched_switch tracer also includes tracing of task wakeups -and context switches. - - ksoftirqd/1-7 [01] 1453.070013: 7:115:R + 2916:115:S - ksoftirqd/1-7 [01] 1453.070013: 7:115:R + 10:115:S - ksoftirqd/1-7 [01] 1453.070013: 7:115:R ==> 10:115:R - events/1-10 [01] 1453.070013: 10:115:S ==> 2916:115:R - kondemand/1-2916 [01] 1453.070013: 2916:115:S ==> 7:115:R - ksoftirqd/1-7 [01] 1453.070013: 7:115:S ==> 0:140:R - -Wake ups are represented by a "+" and the context switches are -shown as "==>". The format is: - - Context switches: - - Previous task Next Task - - :: ==> :: - - Wake ups: - - Current task Task waking up - - :: + :: - -The prio is the internal kernel priority, which is the inverse -of the priority that is usually displayed by user-space tools. 
-Zero represents the highest priority (99). Prio 100 starts the -"nice" priorities with 100 being equal to nice -20 and 139 being -nice 19. The prio "140" is reserved for the idle task which is -the lowest priority thread (pid 0). - - Latency trace format -------------------- @@ -491,79 +454,6 @@ x494] <- /root/a.out[+0x4a8] <- /lib/libc-2.7.so[+0x1e1a6] latencies, as described in "Latency trace format". -sched_switch ------------- - -This tracer simply records schedule switches. Here is an example -of how to use it. - - # echo sched_switch > current_tracer - # echo 1 > tracing_enabled - # sleep 1 - # echo 0 > tracing_enabled - # cat trace - -# tracer: sched_switch -# -# TASK-PID CPU# TIMESTAMP FUNCTION -# | | | | | - bash-3997 [01] 240.132281: 3997:120:R + 4055:120:R - bash-3997 [01] 240.132284: 3997:120:R ==> 4055:120:R - sleep-4055 [01] 240.132371: 4055:120:S ==> 3997:120:R - bash-3997 [01] 240.132454: 3997:120:R + 4055:120:S - bash-3997 [01] 240.132457: 3997:120:R ==> 4055:120:R - sleep-4055 [01] 240.132460: 4055:120:D ==> 3997:120:R - bash-3997 [01] 240.132463: 3997:120:R + 4055:120:D - bash-3997 [01] 240.132465: 3997:120:R ==> 4055:120:R - -0 [00] 240.132589: 0:140:R + 4:115:S - -0 [00] 240.132591: 0:140:R ==> 4:115:R - ksoftirqd/0-4 [00] 240.132595: 4:115:S ==> 0:140:R - -0 [00] 240.132598: 0:140:R + 4:115:S - -0 [00] 240.132599: 0:140:R ==> 4:115:R - ksoftirqd/0-4 [00] 240.132603: 4:115:S ==> 0:140:R - sleep-4055 [01] 240.133058: 4055:120:S ==> 3997:120:R - [...] - - -As we have discussed previously about this format, the header -shows the name of the trace and points to the options. The -"FUNCTION" is a misnomer since here it represents the wake ups -and context switches. - -The sched_switch file only lists the wake ups (represented with -'+') and context switches ('==>') with the previous task or -current task first followed by the next task or task waking up. -The format for both of these is PID:KERNEL-PRIO:TASK-STATE. -Remember that the KERNEL-PRIO is the inverse of the actual -priority with zero (0) being the highest priority and the nice -values starting at 100 (nice -20). Below is a quick chart to map -the kernel priority to user land priorities. 
- - Kernel Space User Space - =============================================================== - 0(high) to 98(low) user RT priority 99(high) to 1(low) - with SCHED_RR or SCHED_FIFO - --------------------------------------------------------------- - 99 sched_priority is not used in scheduling - decisions(it must be specified as 0) - --------------------------------------------------------------- - 100(high) to 139(low) user nice -20(high) to 19(low) - --------------------------------------------------------------- - 140 idle task priority - --------------------------------------------------------------- - -The task states are: - - R - running : wants to run, may not actually be running - S - sleep : process is waiting to be woken up (handles signals) - D - disk sleep (uninterruptible sleep) : process must be woken up - (ignores signals) - T - stopped : process suspended - t - traced : process is being traced (with something like gdb) - Z - zombie : process waiting to be cleaned up - X - unknown - - ftrace_enabled -------------- diff --git a/kernel/trace/trace_sched_switch.c b/kernel/trace/trace_sched_switch.c index 8f758d070c43..7e62c0a18456 100644 --- a/kernel/trace/trace_sched_switch.c +++ b/kernel/trace/trace_sched_switch.c @@ -247,51 +247,3 @@ void tracing_sched_switch_assign_trace(struct trace_array *tr) ctx_trace = tr; } -static void stop_sched_trace(struct trace_array *tr) -{ - tracing_stop_sched_switch_record(); -} - -static int sched_switch_trace_init(struct trace_array *tr) -{ - ctx_trace = tr; - tracing_reset_online_cpus(tr); - tracing_start_sched_switch_record(); - return 0; -} - -static void sched_switch_trace_reset(struct trace_array *tr) -{ - if (sched_ref) - stop_sched_trace(tr); -} - -static void sched_switch_trace_start(struct trace_array *tr) -{ - sched_stopped = 0; -} - -static void sched_switch_trace_stop(struct trace_array *tr) -{ - sched_stopped = 1; -} - -static struct tracer sched_switch_trace __read_mostly = -{ - .name = "sched_switch", - .init = sched_switch_trace_init, - .reset = sched_switch_trace_reset, - .start = sched_switch_trace_start, - .stop = sched_switch_trace_stop, - .wait_pipe = poll_wait_pipe, -#ifdef CONFIG_FTRACE_SELFTEST - .selftest = trace_selftest_startup_sched_switch, -#endif -}; - -__init static int init_sched_switch_trace(void) -{ - return register_tracer(&sched_switch_trace); -} -device_initcall(init_sched_switch_trace); - From 6752ab4a9c30d5411b2dfdb251a3f1cb18aae487 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Tue, 8 Feb 2011 13:54:06 -0500 Subject: [PATCH 214/706] tracing: Deprecate tracing_enabled for tracing_on tracing_enabled should not be used, it is heavy weight and does not do much in helping lower the overhead. tracing_on should be used instead. Warn users to use tracing_on when tracing_enabled is used as it will soon be removed from the tracing directory. Signed-off-by: Steven Rostedt --- Documentation/trace/ftrace.txt | 38 +++++++++++++++++----------------- kernel/trace/trace.c | 4 ++++ 2 files changed, 23 insertions(+), 19 deletions(-) diff --git a/Documentation/trace/ftrace.txt b/Documentation/trace/ftrace.txt index 65eddb7cfa02..67f1cc473257 100644 --- a/Documentation/trace/ftrace.txt +++ b/Documentation/trace/ftrace.txt @@ -80,11 +80,11 @@ of ftrace. Here is a list of some of the key files: tracers listed here can be configured by echoing their name into current_tracer. - tracing_enabled: + tracing_on: - This sets or displays whether the current_tracer - is activated and tracing or not. 
Echo 0 into this - file to disable the tracer or 1 to enable it. + This sets or displays whether writing to the trace + ring buffer is enabled. Echo 0 into this file to disable + the tracer or 1 to enable it. trace: @@ -497,10 +497,10 @@ an example: # echo irqsoff > current_tracer # echo latency-format > trace_options # echo 0 > tracing_max_latency - # echo 1 > tracing_enabled + # echo 1 > tracing_on # ls -ltr [...] - # echo 0 > tracing_enabled + # echo 0 > tracing_on # cat trace # tracer: irqsoff # @@ -605,10 +605,10 @@ is much like the irqsoff tracer. # echo preemptoff > current_tracer # echo latency-format > trace_options # echo 0 > tracing_max_latency - # echo 1 > tracing_enabled + # echo 1 > tracing_on # ls -ltr [...] - # echo 0 > tracing_enabled + # echo 0 > tracing_on # cat trace # tracer: preemptoff # @@ -753,10 +753,10 @@ tracers. # echo preemptirqsoff > current_tracer # echo latency-format > trace_options # echo 0 > tracing_max_latency - # echo 1 > tracing_enabled + # echo 1 > tracing_on # ls -ltr [...] - # echo 0 > tracing_enabled + # echo 0 > tracing_on # cat trace # tracer: preemptirqsoff # @@ -916,9 +916,9 @@ Instead of performing an 'ls', we will run 'sleep 1' under # echo wakeup > current_tracer # echo latency-format > trace_options # echo 0 > tracing_max_latency - # echo 1 > tracing_enabled + # echo 1 > tracing_on # chrt -f 5 sleep 1 - # echo 0 > tracing_enabled + # echo 0 > tracing_on # cat trace # tracer: wakeup # @@ -1030,9 +1030,9 @@ ftrace_enabled is set; otherwise this tracer is a nop. # sysctl kernel.ftrace_enabled=1 # echo function > current_tracer - # echo 1 > tracing_enabled + # echo 1 > tracing_on # usleep 1 - # echo 0 > tracing_enabled + # echo 0 > tracing_on # cat trace # tracer: function # @@ -1070,7 +1070,7 @@ int trace_fd; [...] int main(int argc, char *argv[]) { [...] - trace_fd = open(tracing_file("tracing_enabled"), O_WRONLY); + trace_fd = open(tracing_file("tracing_on"), O_WRONLY); [...] if (condition_hit()) { write(trace_fd, "0", 1); @@ -1521,9 +1521,9 @@ If I am only interested in sys_nanosleep and hrtimer_interrupt: # echo sys_nanosleep hrtimer_interrupt \ > set_ftrace_filter # echo function > current_tracer - # echo 1 > tracing_enabled + # echo 1 > tracing_on # usleep 1 - # echo 0 > tracing_enabled + # echo 0 > tracing_on # cat trace # tracer: ftrace # @@ -1769,9 +1769,9 @@ different. The trace is live. # echo function > current_tracer # cat trace_pipe > /tmp/trace.out & [1] 4153 - # echo 1 > tracing_enabled + # echo 1 > tracing_on # usleep 1 - # echo 0 > tracing_enabled + # echo 0 > tracing_on # cat trace # tracer: function # diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index dc53ecb80589..8dc8da6733f9 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -2710,6 +2710,10 @@ tracing_ctrl_write(struct file *filp, const char __user *ubuf, mutex_lock(&trace_types_lock); if (tracer_enabled ^ val) { + + /* Only need to warn if this is used to change the state */ + WARN_ONCE(1, "tracing_enabled is deprecated. Use tracing_on"); + if (val) { tracer_enabled = 1; if (current_trace->start) From 6c53cbfced048c421e4f72cb2183465f68fbc5e7 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Thu, 6 Jan 2011 16:56:51 +0100 Subject: [PATCH 215/706] x86, microcode: Correct sysdev_add error path When we encounter an error while initting the microcode driver on a CPU, we must undo the previously added sysfs group. 
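The shape of the fix is the classic unwind-on-error pattern; a minimal self-contained sketch, with hypothetical stand-ins for sysfs_create_group()/microcode_init_cpu():

#include <stdio.h>

/* Hypothetical stand-ins for the real setup steps. */
static int create_group(void)  { return 0; }
static int init_cpu(void)      { return -1; /* simulate failure */ }
static void remove_group(void) { puts("group removed"); }

/* A later step that fails must undo every earlier step that succeeded. */
static int do_setup(void)
{
        int err = create_group();       /* step 1 */
        if (err)
                return err;

        if (init_cpu() < 0) {           /* step 2 failed... */
                remove_group();         /* ...so undo step 1 */
                return -1;
        }
        return 0;
}

int main(void) { return do_setup() ? 1 : 0; }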
Cc: Tigran Aivazian Signed-off-by: Borislav Petkov Acked-by: Andreas Herrmann --- arch/x86/kernel/microcode_core.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/arch/x86/kernel/microcode_core.c b/arch/x86/kernel/microcode_core.c index 1cca374a2bac..87af68e0e1e1 100644 --- a/arch/x86/kernel/microcode_core.c +++ b/arch/x86/kernel/microcode_core.c @@ -417,8 +417,10 @@ static int mc_sysdev_add(struct sys_device *sys_dev) if (err) return err; - if (microcode_init_cpu(cpu) == UCODE_ERROR) - err = -EINVAL; + if (microcode_init_cpu(cpu) == UCODE_ERROR) { + sysfs_remove_group(&sys_dev->kobj, &mc_attr_group); + return -EINVAL; + } return err; } From ffc7e8ac820bf9dd6106b01d3e64fecb5177cf43 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Thu, 30 Dec 2010 21:06:01 +0100 Subject: [PATCH 216/706] x86, microcode, AMD: Release firmware on error When the ucode magic is wrong, for whatever reason, we don't release the loaded firmware binary and its related resources. Make sure we do. Also, fix function naming to fit this driver's convention and shorten variable names. Signed-off-by: Borislav Petkov Acked-by: Andreas Herrmann --- arch/x86/kernel/microcode_amd.c | 26 ++++++++++++++------------ 1 file changed, 14 insertions(+), 12 deletions(-) diff --git a/arch/x86/kernel/microcode_amd.c b/arch/x86/kernel/microcode_amd.c index 0fe6d1a66c38..ef91df0fb64d 100644 --- a/arch/x86/kernel/microcode_amd.c +++ b/arch/x86/kernel/microcode_amd.c @@ -278,27 +278,29 @@ generic_load_microcode(int cpu, const u8 *data, size_t size) return state; } -static enum ucode_state request_microcode_fw(int cpu, struct device *device) +static enum ucode_state request_microcode_amd(int cpu, struct device *device) { const char *fw_name = "amd-ucode/microcode_amd.bin"; - const struct firmware *firmware; - enum ucode_state ret; + const struct firmware *fw; + enum ucode_state ret = UCODE_NFOUND; - if (request_firmware(&firmware, fw_name, device)) { + if (request_firmware(&fw, fw_name, device)) { printk(KERN_ERR "microcode: failed to load file %s\n", fw_name); - return UCODE_NFOUND; + goto out; } - if (*(u32 *)firmware->data != UCODE_MAGIC) { - pr_err("invalid UCODE_MAGIC (0x%08x)\n", - *(u32 *)firmware->data); - return UCODE_ERROR; + ret = UCODE_ERROR; + if (*(u32 *)fw->data != UCODE_MAGIC) { + pr_err("Invalid UCODE_MAGIC (0x%08x)\n", *(u32 *)fw->data); + goto fw_release; } - ret = generic_load_microcode(cpu, firmware->data, firmware->size); + ret = generic_load_microcode(cpu, fw->data, fw->size); - release_firmware(firmware); +fw_release: + release_firmware(fw); +out: return ret; } @@ -319,7 +321,7 @@ static void microcode_fini_cpu_amd(int cpu) static struct microcode_ops microcode_amd_ops = { .request_microcode_user = request_microcode_user, - .request_microcode_fw = request_microcode_fw, + .request_microcode_fw = request_microcode_amd, .collect_cpu_info = collect_cpu_info_amd, .apply_microcode = apply_microcode_amd, .microcode_fini_cpu = microcode_fini_cpu_amd, From 10de52d6655ef0d4a1b8d2804db30208c26601ed Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Thu, 30 Dec 2010 22:10:12 +0100 Subject: [PATCH 217/706] x86, microcode, AMD: Simplify install_equiv_cpu_table There's no need to memcpy the ucode header in order to look at it only in this function - use the original buffer instead. Also, fix return type semantics by returning a negative value on error and a positive otherwise. 
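A standalone sketch of both ideas, using a made-up container format rather than the real microcode layout: look at the header words through the pointer instead of memcpy()ing them into a scratch buffer, and return a negative errno on failure so the caller can simply test ret < 0.

#include <errno.h>

#define TABLE_TYPE 0x0U  /* hypothetical section type id */
#define HDR_BYTES  12    /* three 32-bit header words */

/* Parse a section header in place; no staging copy needed. */
static int parse_section(const unsigned int *ibuf)
{
        unsigned int type = ibuf[1];
        unsigned int size = ibuf[2];

        if (type != TABLE_TYPE || !size)
                return -EINVAL;          /* negative errno on error */

        return (int)(size + HDR_BYTES);  /* positive: bytes consumed */
}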
Signed-off-by: Borislav Petkov Acked-by: Andreas Herrmann --- arch/x86/kernel/microcode_amd.c | 21 ++++++++------------- 1 file changed, 8 insertions(+), 13 deletions(-) diff --git a/arch/x86/kernel/microcode_amd.c b/arch/x86/kernel/microcode_amd.c index ef91df0fb64d..9a451d7182f0 100644 --- a/arch/x86/kernel/microcode_amd.c +++ b/arch/x86/kernel/microcode_amd.c @@ -188,27 +188,22 @@ get_next_ucode(const u8 *buf, unsigned int size, unsigned int *mc_size) static int install_equiv_cpu_table(const u8 *buf) { - u8 *container_hdr[UCODE_CONTAINER_HEADER_SIZE]; - unsigned int *buf_pos = (unsigned int *)container_hdr; - unsigned long size; + unsigned int *ibuf = (unsigned int *)buf; + unsigned int type = ibuf[1]; + unsigned int size = ibuf[2]; - get_ucode_data(&container_hdr, buf, UCODE_CONTAINER_HEADER_SIZE); - - size = buf_pos[2]; - - if (buf_pos[1] != UCODE_EQUIV_CPU_TABLE_TYPE || !size) { + if (type != UCODE_EQUIV_CPU_TABLE_TYPE || !size) { pr_err("error: invalid type field in container file section header\n"); - return 0; + return -EINVAL; } equiv_cpu_table = vmalloc(size); if (!equiv_cpu_table) { pr_err("failed to allocate equivalent CPU table\n"); - return 0; + return -ENOMEM; } - buf += UCODE_CONTAINER_HEADER_SIZE; - get_ucode_data(equiv_cpu_table, buf, size); + get_ucode_data(equiv_cpu_table, buf + UCODE_CONTAINER_HEADER_SIZE, size); return size + UCODE_CONTAINER_HEADER_SIZE; /* add header length */ } @@ -232,7 +227,7 @@ generic_load_microcode(int cpu, const u8 *data, size_t size) enum ucode_state state = UCODE_OK; offset = install_equiv_cpu_table(ucode_ptr); - if (!offset) { + if (offset < 0) { pr_err("failed to create equivalent cpu table\n"); return UCODE_ERROR; } From 7cc27349cbfec271eecec9488b4bf3f3fadb2ce4 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Fri, 31 Dec 2010 16:57:48 +0100 Subject: [PATCH 218/706] x86, microcode, AMD: Simplify get_next_ucode Do not copy the section header but look at it directly through the pointer. Also, make it return a ptr to a ucode header directly thus dropping a bunch of unneeded casts. Finally, simplify generic_load_microcode(), while at it. 
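The size parsing this builds on is just a little-endian 16-bit load from the section header; spelled out as a standalone helper (the byte offsets follow the container layout used above, everything around them is illustrative):

#include <stdint.h>

/* Same read as total_size = buf[4] + (buf[5] << 8) in the patch:
 * a 16-bit little-endian value stored at bytes 4..5 of the header. */
static unsigned int section_size(const uint8_t *buf)
{
        return (unsigned int)buf[4] | ((unsigned int)buf[5] << 8);
}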
Signed-off-by: Borislav Petkov Acked-by: Andreas Herrmann --- arch/x86/kernel/microcode_amd.c | 68 ++++++++++++++++----------------- 1 file changed, 32 insertions(+), 36 deletions(-) diff --git a/arch/x86/kernel/microcode_amd.c b/arch/x86/kernel/microcode_amd.c index 9a451d7182f0..8b58fc13e37e 100644 --- a/arch/x86/kernel/microcode_amd.c +++ b/arch/x86/kernel/microcode_amd.c @@ -88,9 +88,9 @@ static int collect_cpu_info_amd(int cpu, struct cpu_signature *csig) return 0; } -static int get_matching_microcode(int cpu, void *mc, int rev) +static int get_matching_microcode(int cpu, struct microcode_header_amd *mc_hdr, + int rev) { - struct microcode_header_amd *mc_header = mc; unsigned int current_cpu_id; u16 equiv_cpu_id = 0; unsigned int i = 0; @@ -109,17 +109,17 @@ static int get_matching_microcode(int cpu, void *mc, int rev) if (!equiv_cpu_id) return 0; - if (mc_header->processor_rev_id != equiv_cpu_id) + if (mc_hdr->processor_rev_id != equiv_cpu_id) return 0; /* ucode might be chipset specific -- currently we don't support this */ - if (mc_header->nb_dev_id || mc_header->sb_dev_id) { + if (mc_hdr->nb_dev_id || mc_hdr->sb_dev_id) { pr_err("CPU%d: loading of chipset specific code not yet supported\n", cpu); return 0; } - if (mc_header->patch_id <= rev) + if (mc_hdr->patch_id <= rev) return 0; return 1; @@ -155,21 +155,18 @@ static int apply_microcode_amd(int cpu) return 0; } -static void * +static struct microcode_header_amd * get_next_ucode(const u8 *buf, unsigned int size, unsigned int *mc_size) { + struct microcode_header_amd *mc; unsigned int total_size; - u8 section_hdr[UCODE_CONTAINER_SECTION_HDR]; - void *mc; - get_ucode_data(section_hdr, buf, UCODE_CONTAINER_SECTION_HDR); - - if (section_hdr[0] != UCODE_UCODE_TYPE) { + if (buf[0] != UCODE_UCODE_TYPE) { pr_err("error: invalid type field in container file section header\n"); return NULL; } - total_size = (unsigned long) (section_hdr[4] + (section_hdr[5] << 8)); + total_size = buf[4] + (buf[5] << 8); if (total_size > size || total_size > UCODE_MAX_SIZE) { pr_err("error: size mismatch\n"); @@ -218,12 +215,12 @@ static enum ucode_state generic_load_microcode(int cpu, const u8 *data, size_t size) { struct ucode_cpu_info *uci = ucode_cpu_info + cpu; + struct microcode_header_amd *mc_hdr = NULL; + unsigned int mc_size, leftover; + unsigned long offset; const u8 *ucode_ptr = data; void *new_mc = NULL; - void *mc; int new_rev = uci->cpu_sig.rev; - unsigned int leftover; - unsigned long offset; enum ucode_state state = UCODE_OK; offset = install_equiv_cpu_table(ucode_ptr); @@ -236,38 +233,37 @@ generic_load_microcode(int cpu, const u8 *data, size_t size) leftover = size - offset; while (leftover) { - unsigned int uninitialized_var(mc_size); - struct microcode_header_amd *mc_header; - - mc = get_next_ucode(ucode_ptr, leftover, &mc_size); - if (!mc) + mc_hdr = get_next_ucode(ucode_ptr, leftover, &mc_size); + if (!mc_hdr) break; - mc_header = (struct microcode_header_amd *)mc; - if (get_matching_microcode(cpu, mc, new_rev)) { + if (get_matching_microcode(cpu, mc_hdr, new_rev)) { vfree(new_mc); - new_rev = mc_header->patch_id; - new_mc = mc; + new_rev = mc_hdr->patch_id; + new_mc = mc_hdr; } else - vfree(mc); + vfree(mc_hdr); ucode_ptr += mc_size; leftover -= mc_size; } - if (new_mc) { - if (!leftover) { - vfree(uci->mc); - uci->mc = new_mc; - pr_debug("CPU%d found a matching microcode update with version 0x%x (current=0x%x)\n", - cpu, new_rev, uci->cpu_sig.rev); - } else { - vfree(new_mc); - state = UCODE_ERROR; - } - } else + if (!new_mc) { state 
= UCODE_NFOUND; + goto free_table; + } + if (!leftover) { + vfree(uci->mc); + uci->mc = new_mc; + pr_debug("CPU%d update ucode to version 0x%x (from 0x%x)\n", + cpu, new_rev, uci->cpu_sig.rev); + } else { + vfree(new_mc); + state = UCODE_ERROR; + } + +free_table: free_equiv_cpu_table(); return state; From 05ff02e4c0686051fcb074aec92df03f2c184fd1 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Wed, 5 Jan 2011 18:04:11 +0100 Subject: [PATCH 219/706] x86, microcode, AMD: Remove unneeded memset call collect_cpu_info_amd() clears its csig arg but this is done in the microcode_core's collect_cpu_info() by clearing the embedding struct ucode_cpu_info. Drop it. Signed-off-by: Borislav Petkov Acked-by: Andreas Herrmann --- arch/x86/kernel/microcode_amd.c | 1 - 1 file changed, 1 deletion(-) diff --git a/arch/x86/kernel/microcode_amd.c b/arch/x86/kernel/microcode_amd.c index 8b58fc13e37e..274fa4063c7a 100644 --- a/arch/x86/kernel/microcode_amd.c +++ b/arch/x86/kernel/microcode_amd.c @@ -77,7 +77,6 @@ static int collect_cpu_info_amd(int cpu, struct cpu_signature *csig) struct cpuinfo_x86 *c = &cpu_data(cpu); u32 dummy; - memset(csig, 0, sizeof(*csig)); if (c->x86_vendor != X86_VENDOR_AMD || c->x86 < 0x10) { pr_warning("microcode: CPU%d: AMD CPU family 0x%x not " "supported\n", cpu, c->x86); From 258721ef34fce97a7a6ca9cebebb303827645868 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Wed, 5 Jan 2011 18:13:19 +0100 Subject: [PATCH 220/706] x86, microcode, AMD: Cleanup dmesg output Unify pr_* to use pr_fmt, shorten messages, correct type formatting. Signed-off-by: Borislav Petkov Acked-by: Andreas Herrmann --- arch/x86/kernel/microcode_amd.c | 30 ++++++++++++++++-------------- 1 file changed, 16 insertions(+), 14 deletions(-) diff --git a/arch/x86/kernel/microcode_amd.c b/arch/x86/kernel/microcode_amd.c index 274fa4063c7a..adbcef4f6333 100644 --- a/arch/x86/kernel/microcode_amd.c +++ b/arch/x86/kernel/microcode_amd.c @@ -78,12 +78,13 @@ static int collect_cpu_info_amd(int cpu, struct cpu_signature *csig) u32 dummy; if (c->x86_vendor != X86_VENDOR_AMD || c->x86 < 0x10) { - pr_warning("microcode: CPU%d: AMD CPU family 0x%x not " - "supported\n", cpu, c->x86); + pr_warning("CPU%d: family %d not supported\n", cpu, c->x86); return -1; } + rdmsr(MSR_AMD64_PATCH_LEVEL, csig->rev, dummy); - pr_info("CPU%d: patch_level=0x%x\n", cpu, csig->rev); + pr_info("CPU%d: patch_level=0x%08x\n", cpu, csig->rev); + return 0; } @@ -113,7 +114,7 @@ static int get_matching_microcode(int cpu, struct microcode_header_amd *mc_hdr, /* ucode might be chipset specific -- currently we don't support this */ if (mc_hdr->nb_dev_id || mc_hdr->sb_dev_id) { - pr_err("CPU%d: loading of chipset specific code not yet supported\n", + pr_err("CPU%d: chipset specific code not yet supported\n", cpu); return 0; } @@ -143,12 +144,12 @@ static int apply_microcode_amd(int cpu) /* check current patch id and patch's id for match */ if (rev != mc_amd->hdr.patch_id) { - pr_err("CPU%d: update failed (for patch_level=0x%x)\n", + pr_err("CPU%d: update failed for patch_level=0x%08x\n", cpu, mc_amd->hdr.patch_id); return -1; } - pr_info("CPU%d: updated (new patch_level=0x%x)\n", cpu, rev); + pr_info("CPU%d: new patch_level=0x%08x\n", cpu, rev); uci->cpu_sig.rev = rev; return 0; @@ -161,14 +162,14 @@ get_next_ucode(const u8 *buf, unsigned int size, unsigned int *mc_size) unsigned int total_size; if (buf[0] != UCODE_UCODE_TYPE) { - pr_err("error: invalid type field in container file section header\n"); + pr_err("invalid type field in container file 
section header\n"); return NULL; } total_size = buf[4] + (buf[5] << 8); if (total_size > size || total_size > UCODE_MAX_SIZE) { - pr_err("error: size mismatch\n"); + pr_err("section size mismatch\n"); return NULL; } @@ -189,7 +190,8 @@ static int install_equiv_cpu_table(const u8 *buf) unsigned int size = ibuf[2]; if (type != UCODE_EQUIV_CPU_TABLE_TYPE || !size) { - pr_err("error: invalid type field in container file section header\n"); + pr_err("empty section/" + "invalid type field in container file section header\n"); return -EINVAL; } @@ -219,7 +221,7 @@ generic_load_microcode(int cpu, const u8 *data, size_t size) unsigned long offset; const u8 *ucode_ptr = data; void *new_mc = NULL; - int new_rev = uci->cpu_sig.rev; + unsigned int new_rev = uci->cpu_sig.rev; enum ucode_state state = UCODE_OK; offset = install_equiv_cpu_table(ucode_ptr); @@ -255,8 +257,8 @@ generic_load_microcode(int cpu, const u8 *data, size_t size) if (!leftover) { vfree(uci->mc); uci->mc = new_mc; - pr_debug("CPU%d update ucode to version 0x%x (from 0x%x)\n", - cpu, new_rev, uci->cpu_sig.rev); + pr_debug("CPU%d update ucode (0x%08x -> 0x%08x)\n", + cpu, uci->cpu_sig.rev, new_rev); } else { vfree(new_mc); state = UCODE_ERROR; @@ -275,13 +277,13 @@ static enum ucode_state request_microcode_amd(int cpu, struct device *device) enum ucode_state ret = UCODE_NFOUND; if (request_firmware(&fw, fw_name, device)) { - printk(KERN_ERR "microcode: failed to load file %s\n", fw_name); + pr_err("failed to load file %s\n", fw_name); goto out; } ret = UCODE_ERROR; if (*(u32 *)fw->data != UCODE_MAGIC) { - pr_err("Invalid UCODE_MAGIC (0x%08x)\n", *(u32 *)fw->data); + pr_err("invalid magic value (0x%08x)\n", *(u32 *)fw->data); goto fw_release; } From 22b7fcdae562b6792b3f5517e89fd7e0337180ae Mon Sep 17 00:00:00 2001 From: Torben Hohn Date: Thu, 27 Jan 2011 16:00:12 +0100 Subject: [PATCH 221/706] mn10300: Switch do_timer() to xtimer_update() Only one CPU gets the timer interrupt so mn10300_last_tsc does not need to be protected by xtime lock. Remove xtime lovking and use xtime_update() which does the locking itself. Signed-off-by: Torben Hohn Cc: David Howells Cc: Koichi Yasutake LKML-Reference: <20110127150011.23248.62040.stgit@localhost> Signed-off-by: Thomas Gleixner --- arch/mn10300/kernel/time.c | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/arch/mn10300/kernel/time.c b/arch/mn10300/kernel/time.c index 75da468090b9..5b955000626d 100644 --- a/arch/mn10300/kernel/time.c +++ b/arch/mn10300/kernel/time.c @@ -104,8 +104,6 @@ static irqreturn_t timer_interrupt(int irq, void *dev_id) unsigned tsc, elapse; irqreturn_t ret; - write_seqlock(&xtime_lock); - while (tsc = get_cycles(), elapse = tsc - mn10300_last_tsc, /* time elapsed since last * tick */ @@ -114,11 +112,9 @@ static irqreturn_t timer_interrupt(int irq, void *dev_id) mn10300_last_tsc += MN10300_TSC_PER_HZ; /* advance the kernel's time tracking system */ - do_timer(1); + xtime_update(1); } - write_sequnlock(&xtime_lock); - ret = local_timer_interrupt(); #ifdef CONFIG_SMP send_IPI_allbutself(LOCAL_TIMER_IPI); From 44d60c0f5c58c2168f31df9a481761451840eb54 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Thu, 10 Feb 2011 12:19:47 +0100 Subject: [PATCH 222/706] x86, microcode, AMD: Extend ucode size verification The different families have a different max size for the ucode patch, adjust size checking to the family we're running on. Also, do not vzalloc the max size of the ucode but only the actual size that is passed on from the firmware loader. 
Signed-off-by: Borislav Petkov --- arch/x86/kernel/microcode_amd.c | 62 ++++++++++++++++++++++++--------- 1 file changed, 45 insertions(+), 17 deletions(-) diff --git a/arch/x86/kernel/microcode_amd.c b/arch/x86/kernel/microcode_amd.c index adbcef4f6333..9fb8405451c5 100644 --- a/arch/x86/kernel/microcode_amd.c +++ b/arch/x86/kernel/microcode_amd.c @@ -66,7 +66,6 @@ struct microcode_amd { unsigned int mpb[0]; }; -#define UCODE_MAX_SIZE 2048 #define UCODE_CONTAINER_SECTION_HDR 8 #define UCODE_CONTAINER_HEADER_SIZE 12 @@ -155,31 +154,60 @@ static int apply_microcode_amd(int cpu) return 0; } -static struct microcode_header_amd * -get_next_ucode(const u8 *buf, unsigned int size, unsigned int *mc_size) +static unsigned int verify_ucode_size(int cpu, const u8 *buf, unsigned int size) { - struct microcode_header_amd *mc; - unsigned int total_size; + struct cpuinfo_x86 *c = &cpu_data(cpu); + unsigned int max_size, actual_size; + +#define F1XH_MPB_MAX_SIZE 2048 +#define F14H_MPB_MAX_SIZE 1824 +#define F15H_MPB_MAX_SIZE 4096 + + switch (c->x86) { + case 0x14: + max_size = F14H_MPB_MAX_SIZE; + break; + case 0x15: + max_size = F15H_MPB_MAX_SIZE; + break; + default: + max_size = F1XH_MPB_MAX_SIZE; + break; + } + + actual_size = buf[4] + (buf[5] << 8); + + if (actual_size > size || actual_size > max_size) { + pr_err("section size mismatch\n"); + return 0; + } + + return actual_size; +} + +static struct microcode_header_amd * +get_next_ucode(int cpu, const u8 *buf, unsigned int size, unsigned int *mc_size) +{ + struct microcode_header_amd *mc = NULL; + unsigned int actual_size = 0; if (buf[0] != UCODE_UCODE_TYPE) { pr_err("invalid type field in container file section header\n"); - return NULL; + goto out; } - total_size = buf[4] + (buf[5] << 8); + actual_size = verify_ucode_size(cpu, buf, size); + if (!actual_size) + goto out; - if (total_size > size || total_size > UCODE_MAX_SIZE) { - pr_err("section size mismatch\n"); - return NULL; - } - - mc = vzalloc(UCODE_MAX_SIZE); + mc = vzalloc(actual_size); if (!mc) - return NULL; + goto out; - get_ucode_data(mc, buf + UCODE_CONTAINER_SECTION_HDR, total_size); - *mc_size = total_size + UCODE_CONTAINER_SECTION_HDR; + get_ucode_data(mc, buf + UCODE_CONTAINER_SECTION_HDR, actual_size); + *mc_size = actual_size + UCODE_CONTAINER_SECTION_HDR; +out: return mc; } @@ -234,7 +262,7 @@ generic_load_microcode(int cpu, const u8 *data, size_t size) leftover = size - offset; while (leftover) { - mc_hdr = get_next_ucode(ucode_ptr, leftover, &mc_size); + mc_hdr = get_next_ucode(cpu, ucode_ptr, leftover, &mc_size); if (!mc_hdr) break; From 94d1ac8b55799be10487fff9766cce6d6628462a Mon Sep 17 00:00:00 2001 From: Jan Beulich Date: Wed, 9 Feb 2011 08:22:46 +0000 Subject: [PATCH 223/706] x86: Reduce back the alignment of the per-CPU data section This complements commit: 47f19a0814e8: percpu: Remove the multi-page alignment facility reverting one leftover of: fe8e0c25cad2: x86, 32-bit: Align percpu area and irq stacks to THREAD_SIZE Signed-off-by: Jan Beulich Acked-by: Alexander van Heukelum Cc: Linus Torvalds LKML-Reference: <4D525CE60200007800030EE5@vpn.id2.novell.com> Signed-off-by: Ingo Molnar Cc: Alexander van Heukelum --- arch/x86/kernel/vmlinux.lds.S | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S index bf4700755184..e9f7a3c838c3 100644 --- a/arch/x86/kernel/vmlinux.lds.S +++ b/arch/x86/kernel/vmlinux.lds.S @@ -305,7 +305,7 @@ SECTIONS } #if !defined(CONFIG_X86_64) || !defined(CONFIG_SMP) - 
PERCPU(THREAD_SIZE) + PERCPU(PAGE_SIZE) #endif . = ALIGN(PAGE_SIZE); From b82fef82d56789439e6be638a87a1a5bba1e6e75 Mon Sep 17 00:00:00 2001 From: Jan Beulich Date: Wed, 9 Feb 2011 08:24:34 +0000 Subject: [PATCH 224/706] x86: Partly unify asm-offsets_{32,64}.c Just consolidating the common parts. Full unification would seem straight forward, but it's not clear the necessary #ifdef-s would be acceptable. Signed-off-by: Jan Beulich LKML-Reference: <4D525D520200007800030EE9@vpn.id2.novell.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/asm-offsets.c | 70 +++++++++++++++++++++++++ arch/x86/kernel/asm-offsets_32.c | 69 ------------------------ arch/x86/kernel/asm-offsets_64.c | 90 +++++--------------------------- 3 files changed, 84 insertions(+), 145 deletions(-) diff --git a/arch/x86/kernel/asm-offsets.c b/arch/x86/kernel/asm-offsets.c index cfa82c899f47..2b141c1915a5 100644 --- a/arch/x86/kernel/asm-offsets.c +++ b/arch/x86/kernel/asm-offsets.c @@ -1,5 +1,75 @@ +/* + * Generate definitions needed by assembly language modules. + * This code generates raw asm output which is post-processed to extract + * and format the required data. + */ +#define COMPILE_OFFSETS + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#ifdef CONFIG_XEN +#include +#endif + #ifdef CONFIG_X86_32 # include "asm-offsets_32.c" #else # include "asm-offsets_64.c" #endif + +void common(void) { + BLANK(); + DEFINE(PAGE_SIZE_asm, PAGE_SIZE); + DEFINE(PAGE_SHIFT_asm, PAGE_SHIFT); + DEFINE(THREAD_SIZE_asm, THREAD_SIZE); + + BLANK(); + OFFSET(TI_flags, thread_info, flags); + OFFSET(TI_status, thread_info, status); + OFFSET(TI_addr_limit, thread_info, addr_limit); + OFFSET(TI_preempt_count, thread_info, preempt_count); + + BLANK(); + OFFSET(crypto_tfm_ctx_offset, crypto_tfm, __crt_ctx); + + BLANK(); + OFFSET(pbe_address, pbe, address); + OFFSET(pbe_orig_address, pbe, orig_address); + OFFSET(pbe_next, pbe, next); + +#ifdef CONFIG_PARAVIRT + BLANK(); + OFFSET(PARAVIRT_enabled, pv_info, paravirt_enabled); + OFFSET(PARAVIRT_PATCH_pv_cpu_ops, paravirt_patch_template, pv_cpu_ops); + OFFSET(PARAVIRT_PATCH_pv_irq_ops, paravirt_patch_template, pv_irq_ops); + OFFSET(PV_IRQ_irq_disable, pv_irq_ops, irq_disable); + OFFSET(PV_IRQ_irq_enable, pv_irq_ops, irq_enable); + OFFSET(PV_CPU_iret, pv_cpu_ops, iret); + OFFSET(PV_CPU_irq_enable_sysexit, pv_cpu_ops, irq_enable_sysexit); + OFFSET(PV_CPU_read_cr0, pv_cpu_ops, read_cr0); + OFFSET(PV_MMU_read_cr2, pv_mmu_ops, read_cr2); +#endif + +#ifdef CONFIG_XEN + BLANK(); + OFFSET(XEN_vcpu_info_mask, vcpu_info, evtchn_upcall_mask); + OFFSET(XEN_vcpu_info_pending, vcpu_info, evtchn_upcall_pending); +#endif + + BLANK(); + OFFSET(BP_scratch, boot_params, scratch); + OFFSET(BP_loadflags, boot_params, hdr.loadflags); + OFFSET(BP_hardware_subarch, boot_params, hdr.hardware_subarch); + OFFSET(BP_version, boot_params, hdr.version); + OFFSET(BP_kernel_alignment, boot_params, hdr.kernel_alignment); +} diff --git a/arch/x86/kernel/asm-offsets_32.c b/arch/x86/kernel/asm-offsets_32.c index 1a4088dda37a..c29d631af6fc 100644 --- a/arch/x86/kernel/asm-offsets_32.c +++ b/arch/x86/kernel/asm-offsets_32.c @@ -1,26 +1,4 @@ -/* - * Generate definitions needed by assembly language modules. - * This code generates raw asm output which is post-processed - * to extract and format the required data. 
- */ - -#include -#include -#include -#include -#include -#include #include -#include -#include -#include -#include -#include -#include -#include -#include - -#include #include #include "../../../drivers/lguest/lg.h" @@ -51,21 +29,10 @@ void foo(void) OFFSET(CPUINFO_x86_vendor_id, cpuinfo_x86, x86_vendor_id); BLANK(); - OFFSET(TI_task, thread_info, task); - OFFSET(TI_exec_domain, thread_info, exec_domain); - OFFSET(TI_flags, thread_info, flags); - OFFSET(TI_status, thread_info, status); - OFFSET(TI_preempt_count, thread_info, preempt_count); - OFFSET(TI_addr_limit, thread_info, addr_limit); - OFFSET(TI_restart_block, thread_info, restart_block); OFFSET(TI_sysenter_return, thread_info, sysenter_return); OFFSET(TI_cpu, thread_info, cpu); BLANK(); - OFFSET(GDS_size, desc_ptr, size); - OFFSET(GDS_address, desc_ptr, address); - BLANK(); - OFFSET(PT_EBX, pt_regs, bx); OFFSET(PT_ECX, pt_regs, cx); OFFSET(PT_EDX, pt_regs, dx); @@ -85,42 +52,13 @@ void foo(void) OFFSET(PT_OLDSS, pt_regs, ss); BLANK(); - OFFSET(EXEC_DOMAIN_handler, exec_domain, handler); OFFSET(IA32_RT_SIGFRAME_sigcontext, rt_sigframe, uc.uc_mcontext); BLANK(); - OFFSET(pbe_address, pbe, address); - OFFSET(pbe_orig_address, pbe, orig_address); - OFFSET(pbe_next, pbe, next); - /* Offset from the sysenter stack to tss.sp0 */ DEFINE(TSS_sysenter_sp0, offsetof(struct tss_struct, x86_tss.sp0) - sizeof(struct tss_struct)); - DEFINE(PAGE_SIZE_asm, PAGE_SIZE); - DEFINE(PAGE_SHIFT_asm, PAGE_SHIFT); - DEFINE(THREAD_SIZE_asm, THREAD_SIZE); - - OFFSET(crypto_tfm_ctx_offset, crypto_tfm, __crt_ctx); - -#ifdef CONFIG_PARAVIRT - BLANK(); - OFFSET(PARAVIRT_enabled, pv_info, paravirt_enabled); - OFFSET(PARAVIRT_PATCH_pv_cpu_ops, paravirt_patch_template, pv_cpu_ops); - OFFSET(PARAVIRT_PATCH_pv_irq_ops, paravirt_patch_template, pv_irq_ops); - OFFSET(PV_IRQ_irq_disable, pv_irq_ops, irq_disable); - OFFSET(PV_IRQ_irq_enable, pv_irq_ops, irq_enable); - OFFSET(PV_CPU_iret, pv_cpu_ops, iret); - OFFSET(PV_CPU_irq_enable_sysexit, pv_cpu_ops, irq_enable_sysexit); - OFFSET(PV_CPU_read_cr0, pv_cpu_ops, read_cr0); -#endif - -#ifdef CONFIG_XEN - BLANK(); - OFFSET(XEN_vcpu_info_mask, vcpu_info, evtchn_upcall_mask); - OFFSET(XEN_vcpu_info_pending, vcpu_info, evtchn_upcall_pending); -#endif - #if defined(CONFIG_LGUEST) || defined(CONFIG_LGUEST_GUEST) || defined(CONFIG_LGUEST_MODULE) BLANK(); OFFSET(LGUEST_DATA_irq_enabled, lguest_data, irq_enabled); @@ -139,11 +77,4 @@ void foo(void) OFFSET(LGUEST_PAGES_regs_errcode, lguest_pages, regs.errcode); OFFSET(LGUEST_PAGES_regs, lguest_pages, regs); #endif - - BLANK(); - OFFSET(BP_scratch, boot_params, scratch); - OFFSET(BP_loadflags, boot_params, hdr.loadflags); - OFFSET(BP_hardware_subarch, boot_params, hdr.hardware_subarch); - OFFSET(BP_version, boot_params, hdr.version); - OFFSET(BP_kernel_alignment, boot_params, hdr.kernel_alignment); } diff --git a/arch/x86/kernel/asm-offsets_64.c b/arch/x86/kernel/asm-offsets_64.c index 4a6aeedcd965..e72a1194af22 100644 --- a/arch/x86/kernel/asm-offsets_64.c +++ b/arch/x86/kernel/asm-offsets_64.c @@ -1,27 +1,4 @@ -/* - * Generate definitions needed by assembly language modules. - * This code generates raw asm output which is post-processed to extract - * and format the required data. 
- */ -#define COMPILE_OFFSETS - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include #include -#include -#include - -#include - -#include #define __NO_STUBS 1 #undef __SYSCALL @@ -33,41 +10,19 @@ static char syscalls[] = { int main(void) { -#define ENTRY(entry) DEFINE(tsk_ ## entry, offsetof(struct task_struct, entry)) - ENTRY(state); - ENTRY(flags); - ENTRY(pid); - BLANK(); -#undef ENTRY -#define ENTRY(entry) DEFINE(TI_ ## entry, offsetof(struct thread_info, entry)) - ENTRY(flags); - ENTRY(addr_limit); - ENTRY(preempt_count); - ENTRY(status); -#ifdef CONFIG_IA32_EMULATION - ENTRY(sysenter_return); -#endif - BLANK(); -#undef ENTRY #ifdef CONFIG_PARAVIRT - BLANK(); - OFFSET(PARAVIRT_enabled, pv_info, paravirt_enabled); - OFFSET(PARAVIRT_PATCH_pv_cpu_ops, paravirt_patch_template, pv_cpu_ops); - OFFSET(PARAVIRT_PATCH_pv_irq_ops, paravirt_patch_template, pv_irq_ops); - OFFSET(PV_IRQ_irq_disable, pv_irq_ops, irq_disable); - OFFSET(PV_IRQ_irq_enable, pv_irq_ops, irq_enable); OFFSET(PV_IRQ_adjust_exception_frame, pv_irq_ops, adjust_exception_frame); - OFFSET(PV_CPU_iret, pv_cpu_ops, iret); OFFSET(PV_CPU_usergs_sysret32, pv_cpu_ops, usergs_sysret32); OFFSET(PV_CPU_usergs_sysret64, pv_cpu_ops, usergs_sysret64); - OFFSET(PV_CPU_irq_enable_sysexit, pv_cpu_ops, irq_enable_sysexit); OFFSET(PV_CPU_swapgs, pv_cpu_ops, swapgs); - OFFSET(PV_MMU_read_cr2, pv_mmu_ops, read_cr2); + BLANK(); #endif - #ifdef CONFIG_IA32_EMULATION -#define ENTRY(entry) DEFINE(IA32_SIGCONTEXT_ ## entry, offsetof(struct sigcontext_ia32, entry)) + OFFSET(TI_sysenter_return, thread_info, sysenter_return); + BLANK(); + +#define ENTRY(entry) OFFSET(IA32_SIGCONTEXT_ ## entry, sigcontext_ia32, entry) ENTRY(ax); ENTRY(bx); ENTRY(cx); @@ -79,15 +34,12 @@ int main(void) ENTRY(ip); BLANK(); #undef ENTRY - DEFINE(IA32_RT_SIGFRAME_sigcontext, - offsetof (struct rt_sigframe_ia32, uc.uc_mcontext)); + + OFFSET(IA32_RT_SIGFRAME_sigcontext, rt_sigframe_ia32, uc.uc_mcontext); BLANK(); #endif - DEFINE(pbe_address, offsetof(struct pbe, address)); - DEFINE(pbe_orig_address, offsetof(struct pbe, orig_address)); - DEFINE(pbe_next, offsetof(struct pbe, next)); - BLANK(); -#define ENTRY(entry) DEFINE(pt_regs_ ## entry, offsetof(struct pt_regs, entry)) + +#define ENTRY(entry) OFFSET(pt_regs_ ## entry, pt_regs, entry) ENTRY(bx); ENTRY(bx); ENTRY(cx); @@ -107,7 +59,8 @@ int main(void) ENTRY(flags); BLANK(); #undef ENTRY -#define ENTRY(entry) DEFINE(saved_context_ ## entry, offsetof(struct saved_context, entry)) + +#define ENTRY(entry) OFFSET(saved_context_ ## entry, saved_context, entry) ENTRY(cr0); ENTRY(cr2); ENTRY(cr3); @@ -115,26 +68,11 @@ int main(void) ENTRY(cr8); BLANK(); #undef ENTRY - DEFINE(TSS_ist, offsetof(struct tss_struct, x86_tss.ist)); - BLANK(); - DEFINE(crypto_tfm_ctx_offset, offsetof(struct crypto_tfm, __crt_ctx)); + + OFFSET(TSS_ist, tss_struct, x86_tss.ist); BLANK(); + DEFINE(__NR_syscall_max, sizeof(syscalls) - 1); - BLANK(); - OFFSET(BP_scratch, boot_params, scratch); - OFFSET(BP_loadflags, boot_params, hdr.loadflags); - OFFSET(BP_hardware_subarch, boot_params, hdr.hardware_subarch); - OFFSET(BP_version, boot_params, hdr.version); - OFFSET(BP_kernel_alignment, boot_params, hdr.kernel_alignment); - - BLANK(); - DEFINE(PAGE_SIZE_asm, PAGE_SIZE); -#ifdef CONFIG_XEN - BLANK(); - OFFSET(XEN_vcpu_info_mask, vcpu_info, evtchn_upcall_mask); - OFFSET(XEN_vcpu_info_pending, vcpu_info, evtchn_upcall_pending); -#undef ENTRY -#endif return 0; } From 986c011ddbb3ed44b35e1bfd67f6aa60b293b495 Mon Sep 17 
00:00:00 2001 From: David Daney Date: Wed, 9 Feb 2011 16:04:25 -0800 Subject: [PATCH 225/706] genirq: Call bus_lock/unlock functions in setup_irq() irq_chips that supply .irq_bus_lock/.irq_bus_sync_unlock functions expect that the other chip methods will be called inside of calls to the pair. If this expectation is not met, things tend to not work. Make setup_irq() call chip_bus_lock()/chip_bus_sync_unlock() too. For the vast majority of irq_chips, this will be a NOP as most don't have these bus lock functions. [ tglx: No we don't want to call that in __setup_irq(). Way too many error exit paths. ] Signed-off-by: David Daney LKML-Reference: <1297296265-18680-1-git-send-email-ddaney@caviumnetworks.com> Signed-off-by: Thomas Gleixner --- kernel/irq/manage.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 0caa59f747dd..a00bf2cd67ed 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -871,9 +871,14 @@ out_thread: */ int setup_irq(unsigned int irq, struct irqaction *act) { + int retval; struct irq_desc *desc = irq_to_desc(irq); - return __setup_irq(irq, desc, act); + chip_bus_lock(desc); + retval = __setup_irq(irq, desc, act); + chip_bus_sync_unlock(desc); + + return retval; } EXPORT_SYMBOL_GPL(setup_irq); From 0849327d13a0bd7f6512b7c21f4b3e79efb2076d Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Fri, 11 Feb 2011 12:09:54 -0200 Subject: [PATCH 226/706] perf report: Fix initialization of annotate symbol priv area We only allocate it when in TUI mode. In --stdio mode, unconditionally initializing this area leads to memory corruption. Cc: Frederic Weisbecker Cc: Ingo Molnar Cc: Mike Galbraith Cc: Paul Mackerras Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Tom Zanussi LKML-Reference: Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-report.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/tools/perf/builtin-report.c b/tools/perf/builtin-report.c index f403aced4cba..f9a99a1ce609 100644 --- a/tools/perf/builtin-report.c +++ b/tools/perf/builtin-report.c @@ -44,6 +44,7 @@ static const char default_pretty_printing_style[] = "normal"; static const char *pretty_printing_style = default_pretty_printing_style; static char callchain_default_opt[] = "fractal,0.5"; +static symbol_filter_t annotate_init; static struct hists *perf_session__hists_findnew(struct perf_session *self, u64 event_stream, u32 type, @@ -167,7 +168,7 @@ static int process_sample_event(union perf_event *event, struct perf_event_attr *attr; if (perf_event__preprocess_sample(event, session, &al, sample, - symbol__annotate_init) < 0) { + annotate_init) < 0) { fprintf(stderr, "problem processing %d event, skipping it.\n", event->header.type); return -1; @@ -520,6 +521,7 @@ int cmd_report(int argc, const char **argv, const char *prefix __used) */ if (use_browser > 0) { symbol_conf.priv_size = sizeof(struct annotation); + annotate_init = symbol__annotate_init; /* * For searching by name on the "Browse map details". * providing it only in verbose mode not to bloat too From b052181a985592f81767f631f9f42accb4b436cd Mon Sep 17 00:00:00 2001 From: Ian Campbell Date: Fri, 11 Feb 2011 15:23:56 +0000 Subject: [PATCH 227/706] xen: events: mark cpu_evtchn_mask_p as __refdata This variable starts out pointing at init_evtchn_mask, which is marked __initdata, but it is set to point to a non-init data region in xen_init_IRQ, which is itself an __init function, so this is safe.
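[ Editor's note: the pattern in miniature, as a sketch with invented names (assuming the usual section-annotation semantics): a pointer legitimately references .init.data only until start-of-day code repoints it, so the section-mismatch checker has to be told not to warn about it:

static __initdata int boot_table[16];	/* discarded after boot */
static int runtime_table[16];

/* __refdata: the reference into .init.data is only live before init memory is freed */
static int __refdata *table = boot_table;

static void __init switch_tables(void)
{
	table = runtime_table;	/* runs before .init.data goes away */
}
]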
Signed-off-by: Ian Campbell Tested-and-acked-by: Andrew Jones Signed-off-by: Konrad Rzeszutek Wilk --- drivers/xen/events.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/xen/events.c b/drivers/xen/events.c index 74681478100a..a31389036b15 100644 --- a/drivers/xen/events.c +++ b/drivers/xen/events.c @@ -114,7 +114,7 @@ struct cpu_evtchn_s { static __initdata struct cpu_evtchn_s init_evtchn_mask = { .bits[0 ... (NR_EVENT_CHANNELS/BITS_PER_LONG)-1] = ~0ul, }; -static struct cpu_evtchn_s *cpu_evtchn_mask_p = &init_evtchn_mask; +static struct cpu_evtchn_s __refdata *cpu_evtchn_mask_p = &init_evtchn_mask; static inline unsigned long *cpu_evtchn_mask(int cpu) { From 6b08cfebd3bd346d8a2fd68a2265fc7736849802 Mon Sep 17 00:00:00 2001 From: Ian Campbell Date: Fri, 11 Feb 2011 15:23:58 +0000 Subject: [PATCH 228/706] xen p2m: annotate variable which appears unused CC arch/x86/xen/p2m.o arch/x86/xen/p2m.c: In function 'm2p_remove_override': arch/x86/xen/p2m.c:460: warning: 'address' may be used uninitialized in this function arch/x86/xen/p2m.c: In function 'm2p_add_override': arch/x86/xen/p2m.c:426: warning: 'address' may be used uninitialized in this function In actual fact, address is initialised in one "if (!PageHighMem(page))" statement and used in a second, and so is always initialised before use. Signed-off-by: Ian Campbell Signed-off-by: Konrad Rzeszutek Wilk --- arch/x86/xen/p2m.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/x86/xen/p2m.c b/arch/x86/xen/p2m.c index fd12d7ce7ff9..89342e5fd082 100644 --- a/arch/x86/xen/p2m.c +++ b/arch/x86/xen/p2m.c @@ -421,7 +421,7 @@ int m2p_add_override(unsigned long mfn, struct page *page) { unsigned long flags; unsigned long pfn; - unsigned long address; + unsigned long uninitialized_var(address); unsigned level; pte_t *ptep = NULL; @@ -455,7 +455,7 @@ int m2p_remove_override(struct page *page) unsigned long flags; unsigned long mfn; unsigned long pfn; - unsigned long address; + unsigned long uninitialized_var(address); unsigned level; pte_t *ptep = NULL; From 44b46c3ef805793ab3a7730dc71c72d0f258ea8e Mon Sep 17 00:00:00 2001 From: Ian Campbell Date: Fri, 11 Feb 2011 16:37:41 +0000 Subject: [PATCH 229/706] xen: annotate functions which only call into __init at start of day Both xen_hvm_init_shared_info and xen_build_mfn_list_list can be called at resume time as well as at start of day, but only reference __init functions (extend_brk) at start of day. Hence annotate with __ref. WARNING: arch/x86/built-in.o(.text+0x4f1): Section mismatch in reference from the function xen_hvm_init_shared_info() to the function .init.text:extend_brk() The function xen_hvm_init_shared_info() references the function __init extend_brk(). This is often because xen_hvm_init_shared_info lacks a __init annotation or the annotation of extend_brk is wrong. xen_hvm_init_shared_info calls extend_brk() iff !shared_info_page and initialises shared_info_page with the result. This happens at start of day only. WARNING: arch/x86/built-in.o(.text+0x599b): Section mismatch in reference from the function xen_build_mfn_list_list() to the function .init.text:extend_brk() The function xen_build_mfn_list_list() references the function __init extend_brk(). This is often because xen_build_mfn_list_list lacks a __init annotation or the annotation of extend_brk is wrong.
(this warning occurs multiple times) xen_build_mfn_list_list only calls extend_brk() at boot time, while building the initial mfn list list. Signed-off-by: Ian Campbell Signed-off-by: Konrad Rzeszutek Wilk --- arch/x86/xen/enlighten.c | 2 +- arch/x86/xen/p2m.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index 50542efe45fb..28e6d42ce2b8 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -1292,7 +1292,7 @@ static int init_hvm_pv_info(int *major, int *minor) return 0; } -void xen_hvm_init_shared_info(void) +void __ref xen_hvm_init_shared_info(void) { int cpu; struct xen_add_to_physmap xatp; diff --git a/arch/x86/xen/p2m.c b/arch/x86/xen/p2m.c index 89342e5fd082..05cfc6abbe10 100644 --- a/arch/x86/xen/p2m.c +++ b/arch/x86/xen/p2m.c @@ -136,7 +136,7 @@ static void p2m_init(unsigned long *p2m) * - After resume we're called from within stop_machine, but the mfn * tree should alreay be completely allocated. */ -void xen_build_mfn_list_list(void) +void __ref xen_build_mfn_list_list(void) { unsigned long pfn; From 868baf07b1a259f5f3803c1dc2777b6c358f83cf Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 10 Feb 2011 21:26:13 -0500 Subject: [PATCH 230/706] ftrace: Fix memory leak with function graph and cpu hotplug When the function graph tracer starts, it needs to make a special stack for each task to save the real return values of the tasks. All running tasks have this stack created, as well as any new tasks. On CPU hotplug, the new idle task will allocate a stack as well when init_idle() is called. The problem is that CPU hotplug does not create a new idle_task. Instead it uses the idle task that existed when the cpu went down. ftrace_graph_init_task() will add a new ret_stack to the task that is given to it. Because a clone will make the task have a stack of its parent, it does not check whether the task's ret_stack is already NULL or not. When the CPU hotplug code starts a CPU up again, it will allocate a new stack even though one already existed for it. The solution is to treat the idle_task specially. In fact, the function_graph code already does, just not at init_idle(). Instead of using ftrace_graph_init_task() for the idle task (that function expects the task to be a clone), have a separate ftrace_graph_init_idle_task(). Also, we will create a per_cpu ret_stack that is used by the idle task. When we call ftrace_graph_init_idle_task(), it will check whether the idle task's ret_stack is NULL; if it is, it will assign it the per_cpu ret_stack.
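[ Editor's note: in outline, the allocate-once-per-CPU idea looks like the sketch below; the names are invented and the real implementation is in the diff that follows:

static DEFINE_PER_CPU(void *, idle_stack);

static void init_idle_stack(int cpu)
{
	void *stack = per_cpu(idle_stack, cpu);

	if (!stack) {	/* first time this CPU comes online */
		stack = kmalloc(1024, GFP_KERNEL);
		if (!stack)
			return;
		per_cpu(idle_stack, cpu) = stack;
	}
	/* later hotplug cycles reuse the same per-CPU allocation, so nothing leaks */
}
]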
Reported-by: Benjamin Herrenschmidt Suggested-by: Peter Zijlstra Cc: Stable Tree Signed-off-by: Steven Rostedt --- include/linux/ftrace.h | 2 ++ kernel/sched.c | 2 +- kernel/trace/ftrace.c | 52 ++++++++++++++++++++++++++++++++++++------ 3 files changed, 48 insertions(+), 8 deletions(-) diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index dcd6a7c3a435..ca29e03c1fac 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -428,6 +428,7 @@ extern void unregister_ftrace_graph(void); extern void ftrace_graph_init_task(struct task_struct *t); extern void ftrace_graph_exit_task(struct task_struct *t); +extern void ftrace_graph_init_idle_task(struct task_struct *t, int cpu); static inline int task_curr_ret_stack(struct task_struct *t) { @@ -451,6 +452,7 @@ static inline void unpause_graph_tracing(void) static inline void ftrace_graph_init_task(struct task_struct *t) { } static inline void ftrace_graph_exit_task(struct task_struct *t) { } +static inline void ftrace_graph_init_idle_task(struct task_struct *t, int cpu) { } static inline int register_ftrace_graph(trace_func_graph_ret_t retfunc, trace_func_graph_ent_t entryfunc) diff --git a/kernel/sched.c b/kernel/sched.c index 18d38e4ec7ba..fbe86cb04b61 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -5571,7 +5571,7 @@ void __cpuinit init_idle(struct task_struct *idle, int cpu) * The idle tasks have their own, simple scheduling class: */ idle->sched_class = &idle_sched_class; - ftrace_graph_init_task(idle); + ftrace_graph_init_idle_task(idle, cpu); } /* diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index f3dadae83883..888b611897d3 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -3328,7 +3328,7 @@ static int start_graph_tracing(void) /* The cpu_boot init_task->ret_stack will never be freed */ for_each_online_cpu(cpu) { if (!idle_task(cpu)->ret_stack) - ftrace_graph_init_task(idle_task(cpu)); + ftrace_graph_init_idle_task(idle_task(cpu), cpu); } do { @@ -3418,6 +3418,49 @@ void unregister_ftrace_graph(void) mutex_unlock(&ftrace_lock); } +static DEFINE_PER_CPU(struct ftrace_ret_stack *, idle_ret_stack); + +static void +graph_init_task(struct task_struct *t, struct ftrace_ret_stack *ret_stack) +{ + atomic_set(&t->tracing_graph_pause, 0); + atomic_set(&t->trace_overrun, 0); + t->ftrace_timestamp = 0; + /* make curr_ret_stack visable before we add the ret_stack */ + smp_wmb(); + t->ret_stack = ret_stack; +} + +/* + * Allocate a return stack for the idle task. May be the first + * time through, or it may be done by CPU hotplug online. + */ +void ftrace_graph_init_idle_task(struct task_struct *t, int cpu) +{ + t->curr_ret_stack = -1; + /* + * The idle task has no parent, it either has its own + * stack or no stack at all. 
+ */ + if (t->ret_stack) + WARN_ON(t->ret_stack != per_cpu(idle_ret_stack, cpu)); + + if (ftrace_graph_active) { + struct ftrace_ret_stack *ret_stack; + + ret_stack = per_cpu(idle_ret_stack, cpu); + if (!ret_stack) { + ret_stack = kmalloc(FTRACE_RETFUNC_DEPTH + * sizeof(struct ftrace_ret_stack), + GFP_KERNEL); + if (!ret_stack) + return; + per_cpu(idle_ret_stack, cpu) = ret_stack; + } + graph_init_task(t, ret_stack); + } +} + /* Allocate a return stack for newly created task */ void ftrace_graph_init_task(struct task_struct *t) { @@ -3433,12 +3476,7 @@ void ftrace_graph_init_task(struct task_struct *t) GFP_KERNEL); if (!ret_stack) return; - atomic_set(&t->tracing_graph_pause, 0); - atomic_set(&t->trace_overrun, 0); - t->ftrace_timestamp = 0; - /* make curr_ret_stack visable before we add the ret_stack */ - smp_wmb(); - t->ret_stack = ret_stack; + graph_init_task(t, ret_stack); } } From 0de4b34d466bae571b50f41c7296b85248205e35 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Mon, 14 Feb 2011 14:48:07 +0900 Subject: [PATCH 231/706] tracing/kprobe: Fix NULL pointer deref check Add NULL check for avoiding NULL pointer deref. This bug has been introduced by: 1ff511e35ed8: tracing/kprobes: Add bitfield type which causes a null pointer dereference bug when kprobe-tracer parses an argument without type. Reported-by: Arnaldo Carvalho de Melo Signed-off-by: Masami Hiramatsu Cc: 2nddept-manager@sdl.hitachi.co.jp Cc: Peter Zijlstra LKML-Reference: <20110214054807.8919.69740.stgit@ltc236.sdl.hitachi.co.jp> Signed-off-by: Ingo Molnar Reported-by: Arnaldo Carvalho de Melo --- kernel/trace/trace_kprobe.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index ccdc542022c3..8435b43b1782 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -935,7 +935,7 @@ static int parse_probe_arg(char *arg, struct trace_probe *tp, parg->offset = tp->size; tp->size += parg->type->size; ret = __parse_probe_arg(arg, parg->type, &parg->fetch, is_return); - if (ret >= 0) + if (ret >= 0 && t != NULL) ret = __parse_bitfield_probe_arg(t, parg->type, &parg->fetch); if (ret >= 0) { parg->fetch_size.fn = get_fetch_size_function(parg->type, From 60f6e65d7887c257392313755f95540ef5e7ea89 Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Mon, 17 Jan 2011 10:52:02 +0800 Subject: [PATCH 232/706] x86: Cleanup vector usage Cleanup the vector usage and make them continuous if possible. Signed-off-by: Shaohua Li Cc: Andi Kleen Cc: Eric Dumazet LKML-Reference: <1295232722.1949.707.camel@sli10-conroe> Signed-off-by: Ingo Molnar --- arch/x86/include/asm/irq_vectors.h | 52 ++++++++++++++++-------------- 1 file changed, 27 insertions(+), 25 deletions(-) diff --git a/arch/x86/include/asm/irq_vectors.h b/arch/x86/include/asm/irq_vectors.h index 6af0894dafb4..42f0d4a30f1b 100644 --- a/arch/x86/include/asm/irq_vectors.h +++ b/arch/x86/include/asm/irq_vectors.h @@ -1,6 +1,7 @@ #ifndef _ASM_X86_IRQ_VECTORS_H #define _ASM_X86_IRQ_VECTORS_H +#include /* * Linux IRQ vector layout. * @@ -16,8 +17,8 @@ * Vectors 0 ... 31 : system traps and exceptions - hardcoded events * Vectors 32 ... 127 : device interrupts * Vector 128 : legacy int80 syscall interface - * Vectors 129 ... 237 : device interrupts - * Vectors 238 ... 255 : special interrupts + * Vectors 129 ... 229 : device interrupts + * Vectors 230 ... 255 : special interrupts * * 64-bit x86 has per CPU IDT tables, 32-bit has one shared IDT table. 
* @@ -96,10 +97,25 @@ #define THRESHOLD_APIC_VECTOR 0xf9 #define REBOOT_VECTOR 0xf8 -/* f0-f7 used for spreading out TLB flushes: */ -#define INVALIDATE_TLB_VECTOR_END 0xf7 -#define INVALIDATE_TLB_VECTOR_START 0xf0 -#define NUM_INVALIDATE_TLB_VECTORS 8 +/* + * Generic system vector for platform specific use + */ +#define X86_PLATFORM_IPI_VECTOR 0xf7 + +/* + * IRQ work vector: + */ +#define IRQ_WORK_VECTOR 0xf6 + +#define UV_BAU_MESSAGE 0xf5 + +/* + * Self IPI vector for machine checks + */ +#define MCE_SELF_VECTOR 0xf4 + +/* Xen vector callback to receive events in a HVM domain */ +#define XEN_HVM_EVTCHN_CALLBACK 0xf3 /* * Local APIC timer IRQ vector is on a different priority level, @@ -108,25 +124,11 @@ */ #define LOCAL_TIMER_VECTOR 0xef -/* - * Generic system vector for platform specific use - */ -#define X86_PLATFORM_IPI_VECTOR 0xed - -/* - * IRQ work vector: - */ -#define IRQ_WORK_VECTOR 0xec - -#define UV_BAU_MESSAGE 0xea - -/* - * Self IPI vector for machine checks - */ -#define MCE_SELF_VECTOR 0xeb - -/* Xen vector callback to receive events in a HVM domain */ -#define XEN_HVM_EVTCHN_CALLBACK 0xe9 +/* f0-f7 used for spreading out TLB flushes: */ +#define NUM_INVALIDATE_TLB_VECTORS 8 +#define INVALIDATE_TLB_VECTOR_END 0xee +#define INVALIDATE_TLB_VECTOR_START \ + (INVALIDATE_TLB_VECTOR_END - NUM_INVALIDATE_TLB_VECTORS + 1) #define NR_VECTORS 256 From 3a09fb4570a1cce11472b8e5da3f6ee409f529d5 Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Mon, 17 Jan 2011 10:52:05 +0800 Subject: [PATCH 233/706] x86: Allocate 32 tlb_invalidate_interrupt handler stubs Add up to 32 invalidate_interrupt handlers. How many handlers are added depends on NUM_INVALIDATE_TLB_VECTORS. So if NUM_INVALIDATE_TLB_VECTORS is smaller than 32, we reduce code size. Signed-off-by: Shaohua Li Cc: Andi Kleen Cc: Eric Dumazet LKML-Reference: <1295232725.1949.708.camel@sli10-conroe> Signed-off-by: Ingo Molnar --- arch/x86/include/asm/entry_arch.h | 5 +- arch/x86/include/asm/hw_irq.h | 24 ++++++++++ arch/x86/kernel/entry_64.S | 5 +- arch/x86/kernel/irqinit.c | 79 +++++++++++++++++++++++++++---- 4 files changed, 103 insertions(+), 10 deletions(-) diff --git a/arch/x86/include/asm/entry_arch.h b/arch/x86/include/asm/entry_arch.h index 57650ab4a5f5..1cd6d26a0a8d 100644 --- a/arch/x86/include/asm/entry_arch.h +++ b/arch/x86/include/asm/entry_arch.h @@ -16,10 +16,13 @@ BUILD_INTERRUPT(call_function_single_interrupt,CALL_FUNCTION_SINGLE_VECTOR) BUILD_INTERRUPT(irq_move_cleanup_interrupt,IRQ_MOVE_CLEANUP_VECTOR) BUILD_INTERRUPT(reboot_interrupt,REBOOT_VECTOR) -.irpc idx, "01234567" +.irp idx,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15, \ + 16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31 +.if NUM_INVALIDATE_TLB_VECTORS > \idx BUILD_INTERRUPT3(invalidate_interrupt\idx, (INVALIDATE_TLB_VECTOR_START)+\idx, smp_invalidate_interrupt) +.endif .endr #endif diff --git a/arch/x86/include/asm/hw_irq.h b/arch/x86/include/asm/hw_irq.h index 0274ec5a7e62..bb9efe8706e2 100644 --- a/arch/x86/include/asm/hw_irq.h +++ b/arch/x86/include/asm/hw_irq.h @@ -45,6 +45,30 @@ extern void invalidate_interrupt4(void); extern void invalidate_interrupt5(void); extern void invalidate_interrupt6(void); extern void invalidate_interrupt7(void); +extern void invalidate_interrupt8(void); +extern void invalidate_interrupt9(void); +extern void invalidate_interrupt10(void); +extern void invalidate_interrupt11(void); +extern void invalidate_interrupt12(void); +extern void invalidate_interrupt13(void); +extern void invalidate_interrupt14(void); +extern void 
invalidate_interrupt15(void); +extern void invalidate_interrupt16(void); +extern void invalidate_interrupt17(void); +extern void invalidate_interrupt18(void); +extern void invalidate_interrupt19(void); +extern void invalidate_interrupt20(void); +extern void invalidate_interrupt21(void); +extern void invalidate_interrupt22(void); +extern void invalidate_interrupt23(void); +extern void invalidate_interrupt24(void); +extern void invalidate_interrupt25(void); +extern void invalidate_interrupt26(void); +extern void invalidate_interrupt27(void); +extern void invalidate_interrupt28(void); +extern void invalidate_interrupt29(void); +extern void invalidate_interrupt30(void); +extern void invalidate_interrupt31(void); extern void irq_move_cleanup_interrupt(void); extern void reboot_interrupt(void); diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index aed1ffbeb0c9..891268c645ea 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -975,9 +975,12 @@ apicinterrupt X86_PLATFORM_IPI_VECTOR \ x86_platform_ipi smp_x86_platform_ipi #ifdef CONFIG_SMP -.irpc idx, "01234567" +.irp idx,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15, \ + 16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31 +.if NUM_INVALIDATE_TLB_VECTORS > \idx apicinterrupt (INVALIDATE_TLB_VECTOR_START)+\idx \ invalidate_interrupt\idx smp_invalidate_interrupt +.endif .endr #endif diff --git a/arch/x86/kernel/irqinit.c b/arch/x86/kernel/irqinit.c index c752e973958d..7aad10a63e04 100644 --- a/arch/x86/kernel/irqinit.c +++ b/arch/x86/kernel/irqinit.c @@ -164,14 +164,77 @@ static void __init smp_intr_init(void) alloc_intr_gate(RESCHEDULE_VECTOR, reschedule_interrupt); /* IPIs for invalidation */ - alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+0, invalidate_interrupt0); - alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+1, invalidate_interrupt1); - alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+2, invalidate_interrupt2); - alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+3, invalidate_interrupt3); - alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+4, invalidate_interrupt4); - alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+5, invalidate_interrupt5); - alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+6, invalidate_interrupt6); - alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+7, invalidate_interrupt7); +#define ALLOC_INVTLB_VEC(NR) \ + alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+NR, \ + invalidate_interrupt##NR) + + switch (NUM_INVALIDATE_TLB_VECTORS) { + default: + ALLOC_INVTLB_VEC(31); + case 31: + ALLOC_INVTLB_VEC(30); + case 30: + ALLOC_INVTLB_VEC(29); + case 29: + ALLOC_INVTLB_VEC(28); + case 28: + ALLOC_INVTLB_VEC(27); + case 27: + ALLOC_INVTLB_VEC(26); + case 26: + ALLOC_INVTLB_VEC(25); + case 25: + ALLOC_INVTLB_VEC(24); + case 24: + ALLOC_INVTLB_VEC(23); + case 23: + ALLOC_INVTLB_VEC(22); + case 22: + ALLOC_INVTLB_VEC(21); + case 21: + ALLOC_INVTLB_VEC(20); + case 20: + ALLOC_INVTLB_VEC(19); + case 19: + ALLOC_INVTLB_VEC(18); + case 18: + ALLOC_INVTLB_VEC(17); + case 17: + ALLOC_INVTLB_VEC(16); + case 16: + ALLOC_INVTLB_VEC(15); + case 15: + ALLOC_INVTLB_VEC(14); + case 14: + ALLOC_INVTLB_VEC(13); + case 13: + ALLOC_INVTLB_VEC(12); + case 12: + ALLOC_INVTLB_VEC(11); + case 11: + ALLOC_INVTLB_VEC(10); + case 10: + ALLOC_INVTLB_VEC(9); + case 9: + ALLOC_INVTLB_VEC(8); + case 8: + ALLOC_INVTLB_VEC(7); + case 7: + ALLOC_INVTLB_VEC(6); + case 6: + ALLOC_INVTLB_VEC(5); + case 5: + ALLOC_INVTLB_VEC(4); + case 4: + ALLOC_INVTLB_VEC(3); + case 3: + ALLOC_INVTLB_VEC(2); + case 2: + ALLOC_INVTLB_VEC(1); + case 1: + ALLOC_INVTLB_VEC(0); + break; + } /* IPI 
for generic function call */ alloc_intr_gate(CALL_FUNCTION_VECTOR, call_function_interrupt); From 70e4a369733a21e3d16b059a6ccdad22a344bf57 Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Mon, 17 Jan 2011 10:52:07 +0800 Subject: [PATCH 234/706] x86: Scale up the number of TLB invalidate vectors with NR_CPUS, up to 32 Make the maximum number of TLB invalidate vectors depend on NR_CPUS linearly, with a cap of 32 vectors. We currently only have 8 vectors for TLB invalidation, and that is clearly inadequate. If we have a lot of CPUs, the CPUs need to share the 8 vectors, and tlbstate_lock is used to protect them. flush_tlb_page() is heavily used in page reclaim, which will cause a lot of lock contention on tlbstate_lock. Andi Kleen suggested increasing the number of vectors to 32, which should be good for current typical systems to reduce the tlbstate_lock contention. My test system has 4 sockets, 64G memory, and 64 CPUs. My workload creates 64 processes. Each process mmaps and reads a big empty sparse file. The total size of the files is 2*total_mem, so this will cause a lot of page reclaim. Below is the result I get from perf call-graph profiling: without the patch: ------------------ 24.25% usemem [kernel] [k] _raw_spin_lock | --- _raw_spin_lock | |--42.15%-- native_flush_tlb_others with the patch: ------------------ 14.96% usemem [kernel] [k] _raw_spin_lock | --- _raw_spin_lock |--13.89%-- native_flush_tlb_others So this heavily reduces the tlbstate_lock contention. Suggested-by: Andi Kleen Signed-off-by: Shaohua Li Cc: Eric Dumazet Cc: Arnaldo Carvalho de Melo Cc: Frederic Weisbecker Cc: Peter Zijlstra LKML-Reference: <1295232727.1949.709.camel@sli10-conroe> Signed-off-by: Ingo Molnar --- arch/x86/include/asm/irq_vectors.h | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/arch/x86/include/asm/irq_vectors.h b/arch/x86/include/asm/irq_vectors.h index 42f0d4a30f1b..4980f48bbbb7 100644 --- a/arch/x86/include/asm/irq_vectors.h +++ b/arch/x86/include/asm/irq_vectors.h @@ -17,8 +17,8 @@ * Vectors 0 ... 31 : system traps and exceptions - hardcoded events * Vectors 32 ... 127 : device interrupts * Vector 128 : legacy int80 syscall interface - * Vectors 129 ... 229 : device interrupts - * Vectors 230 ... 255 : special interrupts + * Vectors 129 ... INVALIDATE_TLB_VECTOR_START-1 : device interrupts + * Vectors INVALIDATE_TLB_VECTOR_START ... 255 : special interrupts * * 64-bit x86 has per CPU IDT tables, 32-bit has one shared IDT table. * @@ -124,8 +124,13 @@ */ #define LOCAL_TIMER_VECTOR 0xef -/* f0-f7 used for spreading out TLB flushes: */ -#define NUM_INVALIDATE_TLB_VECTORS 8 +/* up to 32 vectors used for spreading out TLB flushes: */ +#if NR_CPUS <= 32 +# define NUM_INVALIDATE_TLB_VECTORS NR_CPUS +#else +# define NUM_INVALIDATE_TLB_VECTORS 32 +#endif + #define INVALIDATE_TLB_VECTOR_END 0xee #define INVALIDATE_TLB_VECTOR_START \ (INVALIDATE_TLB_VECTOR_END - NUM_INVALIDATE_TLB_VECTORS + 1) From 7064d865af804b9b841e7b9a3e9b653e40c3e5ca Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Mon, 17 Jan 2011 10:52:10 +0800 Subject: [PATCH 235/706] x86: Avoid tlbstate lock if not enough cpus This one isn't related to the previous patch. If the number of online cpus is below NUM_INVALIDATE_TLB_VECTORS, we don't need the lock. The comment in the code claims we don't need the check, but a hot lock still needs an atomic operation and is expensive, so add the check here. Uses nr_cpu_ids here, as suggested by Eric Dumazet.
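[ Editor's note: the idea in outline (a sketch of the hunk below, not new code): when each CPU can own an invalidate vector exclusively, the flush state is never shared, so the lock can be skipped:

	/* one vector per CPU if nr_cpu_ids <= NUM_INVALIDATE_TLB_VECTORS */
	if (nr_cpu_ids > NUM_INVALIDATE_TLB_VECTORS)
		raw_spin_lock(&f->tlbstate_lock);
	/* ... publish f->flush_mm / f->flush_va and send the IPI ... */
	if (nr_cpu_ids > NUM_INVALIDATE_TLB_VECTORS)
		raw_spin_unlock(&f->tlbstate_lock);
]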
Signed-off-by: Shaohua Li Acked-by: Eric Dumazet Cc: Andi Kleen LKML-Reference: <1295232730.1949.710.camel@sli10-conroe> Signed-off-by: Ingo Molnar --- arch/x86/mm/tlb.c | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c index 6acc724d5d8f..55272d7c3b0b 100644 --- a/arch/x86/mm/tlb.c +++ b/arch/x86/mm/tlb.c @@ -179,12 +179,8 @@ static void flush_tlb_others_ipi(const struct cpumask *cpumask, sender = this_cpu_read(tlb_vector_offset); f = &flush_state[sender]; - /* - * Could avoid this lock when - * num_online_cpus() <= NUM_INVALIDATE_TLB_VECTORS, but it is - * probably not worth checking this for a cache-hot lock. - */ - raw_spin_lock(&f->tlbstate_lock); + if (nr_cpu_ids > NUM_INVALIDATE_TLB_VECTORS) + raw_spin_lock(&f->tlbstate_lock); f->flush_mm = mm; f->flush_va = va; @@ -202,7 +198,8 @@ static void flush_tlb_others_ipi(const struct cpumask *cpumask, f->flush_mm = NULL; f->flush_va = 0; - raw_spin_unlock(&f->tlbstate_lock); + if (nr_cpu_ids > NUM_INVALIDATE_TLB_VECTORS) + raw_spin_unlock(&f->tlbstate_lock); } void native_flush_tlb_others(const struct cpumask *cpumask, From 77eed821accf5dd962b1f13bed0680e217e49112 Mon Sep 17 00:00:00 2001 From: Kamal Mostafa Date: Thu, 3 Feb 2011 17:38:04 -0800 Subject: [PATCH 236/706] x86: Fix panic when handling "mem={invalid}" param Avoid removing all of memory and panicking when "mem={invalid}" is specified, e.g. mem=blahblah, mem=0, or mem=nopentium (on platforms other than x86_32). Signed-off-by: Kamal Mostafa BugLink: http://bugs.launchpad.net/bugs/553464 Cc: Yinghai Lu Cc: Len Brown Cc: Rafael J. Wysocki Cc: # .3x: as far back as it applies LKML-Reference: <1296783486-23033-1-git-send-email-kamal@canonical.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/e820.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c index 294f26da0c0c..55a59d889dbd 100644 --- a/arch/x86/kernel/e820.c +++ b/arch/x86/kernel/e820.c @@ -856,6 +856,9 @@ static int __init parse_memopt(char *p) userdef = 1; mem_size = memparse(p, &p); + /* don't remove all of memory when handling "mem={invalid}" param */ + if (mem_size == 0) + return -EINVAL; e820_remove_range(mem_size, ULLONG_MAX - mem_size, E820_RAM, 1); return 0; From 9a6d44b9adb777ca9549e88cd55bd8f2673c52a2 Mon Sep 17 00:00:00 2001 From: Kamal Mostafa Date: Thu, 3 Feb 2011 17:38:05 -0800 Subject: [PATCH 237/706] x86: Emit "mem=nopentium ignored" warning when not supported Emit a warning when "mem=nopentium" is specified on any arch other than x86_32 (the only arch that supports it). Signed-off-by: Kamal Mostafa BugLink: http://bugs.launchpad.net/bugs/553464 Cc: Yinghai Lu Cc: Len Brown Cc: Rafael J. Wysocki LKML-Reference: <1296783486-23033-2-git-send-email-kamal@canonical.com> Signed-off-by: Ingo Molnar Cc: --- arch/x86/kernel/e820.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c index 55a59d889dbd..0b5e2b546566 100644 --- a/arch/x86/kernel/e820.c +++ b/arch/x86/kernel/e820.c @@ -847,12 +847,15 @@ static int __init parse_memopt(char *p) if (!p) return -EINVAL; -#ifdef CONFIG_X86_32 if (!strcmp(p, "nopentium")) { +#ifdef CONFIG_X86_32 setup_clear_cpu_cap(X86_FEATURE_PSE); return 0; - } +#else + printk(KERN_WARNING "mem=nopentium ignored!
(only supported on x86_32)\n"); + return -EINVAL; #endif + } userdef = 1; mem_size = memparse(p, &p); From e5fea868e6c04343e501176a373d568c1c0094aa Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Tue, 8 Feb 2011 23:22:17 -0800 Subject: [PATCH 238/706] x86: Fix and clean up generic_processor_info() One of the error printouts in generic_processor_info() prints out the APIC version instead of the cpu index the warning text describes. Move version validation down, after we get the right cpu index. -v2: add comments about reason why we can have cpu=0 there. Signed-off-by: Yinghai Lu LKML-Reference: <4D5240A9.4080703@kernel.org> [ Cleaned up and made the BIOS bug printouts more consistent ] Signed-off-by: Ingo Molnar --- arch/x86/kernel/apic/apic.c | 39 +++++++++++++++++++------------------ 1 file changed, 20 insertions(+), 19 deletions(-) diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index 06c196d7e59c..628dcdb7afd5 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -1925,17 +1925,6 @@ void __cpuinit generic_processor_info(int apicid, int version) { int cpu; - /* - * Validate version - */ - if (version == 0x0) { - pr_warning("BIOS bug, APIC version is 0 for CPU#%d! " - "fixing up to 0x10. (tell your hw vendor)\n", - version); - version = 0x10; - } - apic_version[apicid] = version; - if (num_processors >= nr_cpu_ids) { int max = nr_cpu_ids; int thiscpu = max + disabled_cpus; @@ -1949,22 +1938,34 @@ void __cpuinit generic_processor_info(int apicid, int version) } num_processors++; - cpu = cpumask_next_zero(-1, cpu_present_mask); - - if (version != apic_version[boot_cpu_physical_apicid]) - WARN_ONCE(1, - "ACPI: apic version mismatch, bootcpu: %x cpu %d: %x\n", - apic_version[boot_cpu_physical_apicid], cpu, version); - - physid_set(apicid, phys_cpu_present_map); if (apicid == boot_cpu_physical_apicid) { /* * x86_bios_cpu_apicid is required to have processors listed * in same order as logical cpu numbers. Hence the first * entry is BSP, and so on. + * boot_cpu_init() already hold bit 0 in cpu_present_mask + * for BSP. */ cpu = 0; + } else + cpu = cpumask_next_zero(-1, cpu_present_mask); + + /* + * Validate version + */ + if (version == 0x0) { + pr_warning("BIOS bug: APIC version is 0 for CPU %d/0x%x, fixing up to 0x10\n", + cpu, apicid); + version = 0x10; } + apic_version[apicid] = version; + + if (version != apic_version[boot_cpu_physical_apicid]) { + pr_warning("BIOS bug: APIC version mismatch, boot CPU: %x, CPU %d: version %x\n", + apic_version[boot_cpu_physical_apicid], cpu, version); + } + + physid_set(apicid, phys_cpu_present_map); if (apicid > max_physical_apicid) max_physical_apicid = apicid; From 14392fd329eca9b59d51c0aa5d0acfb4965424d1 Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Mon, 7 Feb 2011 14:08:53 -0800 Subject: [PATCH 239/706] x86, numa: Add error handling for bad cpu-to-node mappings CONFIG_DEBUG_PER_CPU_MAPS may return NUMA_NO_NODE when an early_cpu_to_node() mapping hasn't been initialized. In such a case, it emits a warning and continues without an issue but callers may try to use the return value to index into an array. We can catch those errors and fail silently since a warning has already been emitted. No current user of numa_add_cpu() requires this error checking to avoid a crash, but it's better to be proactive in case a future user happens to have a bug and a user tries to diagnose it with CONFIG_DEBUG_PER_CPU_MAPS. 
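[ Editor's note: the guard being added, in outline (mirrors the hunks below): fail silently rather than indexing an array with NUMA_NO_NODE:

	int node = early_cpu_to_node(cpu);	/* may be NUMA_NO_NODE with CONFIG_DEBUG_PER_CPU_MAPS */

	if (node == NUMA_NO_NODE) {
		/* early_cpu_to_node() already emitted a warning and trace */
		return NULL;
	}
	mask = node_to_cpumask_map[node];	/* now a safe index */
]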
Reported-by: Jesper Juhl Signed-off-by: David Rientjes Cc: Tejun Heo LKML-Reference: Signed-off-by: Ingo Molnar --- arch/x86/mm/numa.c | 4 ++++ arch/x86/mm/numa_64.c | 4 ++++ 2 files changed, 8 insertions(+) diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c index bf60715bd1b7..9559d360fde7 100644 --- a/arch/x86/mm/numa.c +++ b/arch/x86/mm/numa.c @@ -219,6 +219,10 @@ struct cpumask __cpuinit *debug_cpumask_set_cpu(int cpu, int enable) struct cpumask *mask; char buf[64]; + if (node == NUMA_NO_NODE) { + /* early_cpu_to_node() already emits a warning and trace */ + return NULL; + } mask = node_to_cpumask_map[node]; if (!mask) { pr_err("node_to_cpumask_map[%i] NULL\n", node); diff --git a/arch/x86/mm/numa_64.c b/arch/x86/mm/numa_64.c index f548fbf75f44..3f9411ed3cdc 100644 --- a/arch/x86/mm/numa_64.c +++ b/arch/x86/mm/numa_64.c @@ -709,6 +709,10 @@ static void __cpuinit numa_set_cpumask(int cpu, int enable) struct cpumask *mask; int i; + if (node == NUMA_NO_NODE) { + /* early_cpu_to_node() already emits a warning and trace */ + return; + } for_each_online_node(i) { unsigned long addr; From 6b617e224dfac0b64ed70dacdac50be6eb78a6a1 Mon Sep 17 00:00:00 2001 From: Feng Tang Date: Tue, 15 Feb 2011 00:13:31 +0800 Subject: [PATCH 240/706] x86/platform: Add a wallclock_init func to x86_init.timers ops Some wall clock devices use MMIO-based HW registers; this new function gives them a chance to do some initialization work before their get/set_time services get called, which usually happens in the early kernel boot phase. Signed-off-by: Feng Tang Signed-off-by: Jacob Pan Signed-off-by: Alan Cox Signed-off-by: Thomas Gleixner --- arch/x86/include/asm/x86_init.h | 2 ++ arch/x86/kernel/setup.c | 2 ++ arch/x86/kernel/x86_init.c | 1 + 3 files changed, 5 insertions(+) diff --git a/arch/x86/include/asm/x86_init.h b/arch/x86/include/asm/x86_init.h index 64642ad019fb..643ebf2e2ad8 100644 --- a/arch/x86/include/asm/x86_init.h +++ b/arch/x86/include/asm/x86_init.h @@ -83,11 +83,13 @@ struct x86_init_paging { * boot cpu * @tsc_pre_init: platform function called before TSC init * @timer_init: initialize the platform timer (default PIT/HPET) + * @wallclock_init: init the wallclock device */ struct x86_init_timers { void (*setup_percpu_clockev)(void); void (*tsc_pre_init)(void); void (*timer_init)(void); + void (*wallclock_init)(void); }; /** diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 21c6746338af..68d535a77df0 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -1059,6 +1059,8 @@ void __init setup_arch(char **cmdline_p) #endif x86_init.oem.banner(); + x86_init.timers.wallclock_init(); + mcheck_init(); local_irq_save(flags); diff --git a/arch/x86/kernel/x86_init.c b/arch/x86/kernel/x86_init.c index ceb2911aa439..c11514e9128b 100644 --- a/arch/x86/kernel/x86_init.c +++ b/arch/x86/kernel/x86_init.c @@ -70,6 +70,7 @@ struct x86_init_ops x86_init __initdata = { .setup_percpu_clockev = setup_boot_APIC_clock, .tsc_pre_init = x86_init_noop, .timer_init = hpet_time_init, + .wallclock_init = x86_init_noop, }, .iommu = { From 168202c7bf89d7a2abaf8deaf4bbed18a1f7b3a3 Mon Sep 17 00:00:00 2001 From: Feng Tang Date: Tue, 15 Feb 2011 00:13:32 +0800 Subject: [PATCH 241/706] mrst/vrtc: Avoid using cmos rtc ops If we don't assign the Moorestown-specific wallclock init and ops functions, the rtc/persistent clock code will use the cmos rtc for access; this will crash Moorestown because the ioports are not present. Also, the vrtc driver should avoid using cmos access to check the UIP status.
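[ Editor's note: the fix follows the standard x86 platform-ops override pattern; in outline (a summary of the diff below, not additional code):

	/* Moorestown early setup replaces the CMOS-based defaults: */
	x86_init.timers.wallclock_init = mrst_rtc_init;	/* maps the vRTC MMIO page */
	x86_platform.get_wallclock = vrtc_get_time;	/* MMIO reads instead of CMOS ioports */
	x86_platform.set_wallclock = vrtc_set_mmss;	/* MMIO writes instead of CMOS ioports */
]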
[feng.tang@intel.com: use set_fixmap_offset_nocache() to simplify code] Signed-off-by: Jacob Pan Signed-off-by: Feng Tang Signed-off-by: Alan Cox Signed-off-by: Thomas Gleixner --- arch/x86/platform/mrst/mrst.c | 2 ++ arch/x86/platform/mrst/vrtc.c | 16 ++++------------ drivers/rtc/rtc-mrst.c | 13 ++++++++++++- 3 files changed, 18 insertions(+), 13 deletions(-) diff --git a/arch/x86/platform/mrst/mrst.c b/arch/x86/platform/mrst/mrst.c index fee0b4914e07..4c542c757cb4 100644 --- a/arch/x86/platform/mrst/mrst.c +++ b/arch/x86/platform/mrst/mrst.c @@ -31,6 +31,7 @@ #include #include #include +#include #include #include #include @@ -294,6 +295,7 @@ void __init x86_mrst_early_setup(void) x86_platform.calibrate_tsc = mrst_calibrate_tsc; x86_platform.i8042_detect = mrst_i8042_detect; + x86_init.timers.wallclock_init = mrst_rtc_init; x86_init.pci.init = pci_mrst_init; x86_init.pci.fixup_irqs = x86_init_noop; diff --git a/arch/x86/platform/mrst/vrtc.c b/arch/x86/platform/mrst/vrtc.c index 32cd7edd71a0..04cf645feb92 100644 --- a/arch/x86/platform/mrst/vrtc.c +++ b/arch/x86/platform/mrst/vrtc.c @@ -100,22 +100,14 @@ int vrtc_set_mmss(unsigned long nowtime) void __init mrst_rtc_init(void) { - unsigned long rtc_paddr; - void __iomem *virt_base; + unsigned long vrtc_paddr = sfi_mrtc_array[0].phys_addr; sfi_table_parse(SFI_SIG_MRTC, NULL, NULL, sfi_parse_mrtc); - if (!sfi_mrtc_num) + if (!sfi_mrtc_num || !vrtc_paddr) return; - rtc_paddr = sfi_mrtc_array[0].phys_addr; - - /* vRTC's register address may not be page aligned */ - set_fixmap_nocache(FIX_LNW_VRTC, rtc_paddr); - - virt_base = (void __iomem *)__fix_to_virt(FIX_LNW_VRTC); - virt_base += rtc_paddr & ~PAGE_MASK; - vrtc_virt_base = virt_base; - + vrtc_virt_base = (void __iomem *)set_fixmap_offset_nocache(FIX_LNW_VRTC, + vrtc_paddr); x86_platform.get_wallclock = vrtc_get_time; x86_platform.set_wallclock = vrtc_set_mmss; } diff --git a/drivers/rtc/rtc-mrst.c b/drivers/rtc/rtc-mrst.c index bcd0cf63eb16..28e02e7580f4 100644 --- a/drivers/rtc/rtc-mrst.c +++ b/drivers/rtc/rtc-mrst.c @@ -62,6 +62,17 @@ static inline int is_intr(u8 rtc_intr) return rtc_intr & RTC_IRQMASK; } +static inline unsigned char vrtc_is_updating(void) +{ + unsigned char uip; + unsigned long flags; + + spin_lock_irqsave(&rtc_lock, flags); + uip = (vrtc_cmos_read(RTC_FREQ_SELECT) & RTC_UIP); + spin_unlock_irqrestore(&rtc_lock, flags); + return uip; +} + /* * rtc_time's year contains the increment over 1900, but vRTC's YEAR * register can't be programmed to value larger than 0x64, so vRTC @@ -76,7 +87,7 @@ static int mrst_read_time(struct device *dev, struct rtc_time *time) { unsigned long flags; - if (rtc_is_updating()) + if (vrtc_is_updating()) mdelay(20); spin_lock_irqsave(&rtc_lock, flags); From 6ea96e7e4946f790330557e4b7c4c8a174c1c6d2 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sun, 6 Feb 2011 22:45:31 +0000 Subject: [PATCH 242/706] um: Remove stale irq_chip.end irq_chip.end became obsolete with the removal of __do_IRQ().
Signed-off-by: Thomas Gleixner Cc: Jeff Dike Cc: Peter Zijlstra Cc: Andrew Morton LKML-Reference: <20110206224515.135703209@linutronix.de> --- arch/um/kernel/irq.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/arch/um/kernel/irq.c b/arch/um/kernel/irq.c index 3f0ac9e0c966..c9167485431e 100644 --- a/arch/um/kernel/irq.c +++ b/arch/um/kernel/irq.c @@ -374,7 +374,6 @@ static struct irq_chip normal_irq_type = { .disable = dummy, .enable = dummy, .ack = dummy, - .end = dummy }; static struct irq_chip SIGVTALRM_irq_type = { @@ -384,7 +383,6 @@ static struct irq_chip SIGVTALRM_irq_type = { .disable = dummy, .enable = dummy, .ack = dummy, - .end = dummy }; void __init init_IRQ(void) From 1d119aa06fb2b2608151a162f15c480d46694b65 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sun, 6 Feb 2011 22:45:34 +0000 Subject: [PATCH 243/706] um: Convert irq_chips to new functions Signed-off-by: Thomas Gleixner Cc: Jeff Dike Cc: Andrew Morton LKML-Reference: <20110206224515.224027758@linutronix.de> --- arch/um/kernel/irq.c | 19 +++++++++---------- 1 file changed, 9 insertions(+), 10 deletions(-) diff --git a/arch/um/kernel/irq.c b/arch/um/kernel/irq.c index c9167485431e..f771b85833e3 100644 --- a/arch/um/kernel/irq.c +++ b/arch/um/kernel/irq.c @@ -360,10 +360,10 @@ EXPORT_SYMBOL(um_request_irq); EXPORT_SYMBOL(reactivate_fd); /* - * irq_chip must define (startup || enable) && - * (shutdown || disable) && end + * irq_chip must define at least enable/disable and ack when + * the edge handler is used. */ -static void dummy(unsigned int irq) +static void dummy(struct irq_data *d) { } @@ -371,18 +371,17 @@ static void dummy(unsigned int irq) static struct irq_chip normal_irq_type = { .name = "SIGIO", .release = free_irq_by_irq_and_dev, - .disable = dummy, - .enable = dummy, - .ack = dummy, + .irq_disable = dummy, + .irq_enable = dummy, + .irq_ack = dummy, }; static struct irq_chip SIGVTALRM_irq_type = { .name = "SIGVTALRM", .release = free_irq_by_irq_and_dev, - .shutdown = dummy, /* never called */ - .disable = dummy, - .enable = dummy, - .ack = dummy, + .irq_disable = dummy, + .irq_enable = dummy, + .irq_ack = dummy, }; void __init init_IRQ(void) From d5b4eea1c575b78448fd5dc54d250ff302ca22f9 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sun, 6 Feb 2011 22:45:36 +0000 Subject: [PATCH 244/706] um: Use proper accessors in show_interrupts() Signed-off-by: Thomas Gleixner Cc: Jeff Dike Cc: Andrew Morton LKML-Reference: <20110206224515.322707425@linutronix.de> --- arch/um/kernel/irq.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/arch/um/kernel/irq.c b/arch/um/kernel/irq.c index f771b85833e3..64cfea80cfe2 100644 --- a/arch/um/kernel/irq.c +++ b/arch/um/kernel/irq.c @@ -35,8 +35,10 @@ int show_interrupts(struct seq_file *p, void *v) } if (i < NR_IRQS) { - raw_spin_lock_irqsave(&irq_desc[i].lock, flags); - action = irq_desc[i].action; + struct irq_desc *desc = irq_to_desc(i); + + raw_spin_lock_irqsave(&desc->lock, flags); + action = desc->action; if (!action) goto skip; seq_printf(p, "%3d: ",i); @@ -46,7 +48,7 @@ int show_interrupts(struct seq_file *p, void *v) for_each_online_cpu(j) seq_printf(p, "%10u ", kstat_irqs_cpu(i, j)); #endif - seq_printf(p, " %14s", irq_desc[i].chip->name); + seq_printf(p, " %14s", get_irq_desc_chip(desc)->name); seq_printf(p, " %s", action->name); for (action=action->next; action; action = action->next) @@ -54,7 +56,7 @@ int show_interrupts(struct seq_file *p, void *v) seq_putc(p, '\n'); skip: - raw_spin_unlock_irqrestore(&irq_desc[i].lock, 
flags); + raw_spin_unlock_irqrestore(&desc->lock, flags); } else if (i == NR_IRQS) seq_putc(p, '\n'); From 53c39ce56d203d80ba8217a16bb024b25185fb7e Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sun, 6 Feb 2011 22:45:38 +0000 Subject: [PATCH 245/706] um: Select GENERIC_HARDIRQS_NO_DEPRECATED All irq_chips are converted and proper accessor functions are used. Signed-off-by: Thomas Gleixner Cc: Jeff Dike Cc: Andrew Morton LKML-Reference: <20110206224515.430825903@linutronix.de> --- arch/um/Kconfig.common | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/um/Kconfig.common b/arch/um/Kconfig.common index e351e14b4339..1e78940218c0 100644 --- a/arch/um/Kconfig.common +++ b/arch/um/Kconfig.common @@ -7,6 +7,7 @@ config UML bool default y select HAVE_GENERIC_HARDIRQS + select GENERIC_HARDIRQS_NO_DEPRECATED config MMU bool From 7d36b7bc9022f35f95cd85cdf441846298e8f9fb Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 16 Feb 2011 12:13:06 +0100 Subject: [PATCH 246/706] x86-64, NUMA: Make dummy node initialization path similar to non-dummy ones Dummy node initialization in initmem_init() didn't initialize the apicid-to-node mapping and set the cpu-to-node mapping directly by calling numa_set_node(), which is different from the non-dummy init paths. Update it such that they behave similarly. Initialize the apicid-to-node mapping and call numa_init_array(). The actual cpu-to-node mapping is handled by init_cpu_to_node() later. Signed-off-by: Tejun Heo Acked-by: Yinghai Lu Cc: Brian Gerst Cc: Cyrill Gorcunov Cc: Shaohui Zheng Cc: David Rientjes Cc: Ingo Molnar Cc: H. Peter Anvin --- arch/x86/mm/numa_64.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/arch/x86/mm/numa_64.c b/arch/x86/mm/numa_64.c index 43ad3273561a..b7d78d7a2473 100644 --- a/arch/x86/mm/numa_64.c +++ b/arch/x86/mm/numa_64.c @@ -624,11 +624,12 @@ void __init initmem_init(unsigned long start_pfn, unsigned long last_pfn, memnodemap[0] = 0; node_set_online(0); node_set(0, node_possible_map); - for (i = 0; i < nr_cpu_ids; i++) - numa_set_node(i, 0); + for (i = 0; i < MAX_LOCAL_APIC; i++) + set_apicid_to_node(i, NUMA_NO_NODE); memblock_x86_register_active_regions(0, start_pfn, last_pfn); init_memory_mapping_high(); setup_node_bootmem(0, start_pfn << PAGE_SHIFT, last_pfn << PAGE_SHIFT); + numa_init_array(); } unsigned long __init numa_free_all_bootmem(void) From 13081df5dd6eae1951a3c398fa17d3ed2037a78f Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 16 Feb 2011 12:13:06 +0100 Subject: [PATCH 247/706] x86-64, NUMA: Simplify hotplug node handling in acpi_numa_memory_affinity_init() Hotplug node handling in acpi_numa_memory_affinity_init() was unnecessarily complicated by storing the original nodes[] entry and restoring it afterwards. Simplify it by not modifying the nodes[] entry for hotplug nodes from the beginning. Signed-off-by: Tejun Heo Cc: Yinghai Lu Cc: Brian Gerst Cc: Cyrill Gorcunov Cc: Shaohui Zheng Cc: David Rientjes Cc: Ingo Molnar Cc: H.
Peter Anvin --- arch/x86/mm/srat_64.c | 31 +++++++++++++------------------ 1 file changed, 13 insertions(+), 18 deletions(-) diff --git a/arch/x86/mm/srat_64.c b/arch/x86/mm/srat_64.c index 23498f8b09a2..988b0b70ff39 100644 --- a/arch/x86/mm/srat_64.c +++ b/arch/x86/mm/srat_64.c @@ -251,7 +251,7 @@ update_nodes_add(int node, unsigned long start, unsigned long end) void __init acpi_numa_memory_affinity_init(struct acpi_srat_mem_affinity *ma) { - struct bootnode *nd, oldnode; + struct bootnode *nd; unsigned long start, end; int node, pxm; int i; @@ -289,28 +289,23 @@ acpi_numa_memory_affinity_init(struct acpi_srat_mem_affinity *ma) bad_srat(); return; } - nd = &nodes[node]; - oldnode = *nd; - if (!node_test_and_set(node, nodes_parsed)) { - nd->start = start; - nd->end = end; - } else { - if (start < nd->start) - nd->start = start; - if (nd->end < end) - nd->end = end; - } printk(KERN_INFO "SRAT: Node %u PXM %u %lx-%lx\n", node, pxm, start, end); - if (ma->flags & ACPI_SRAT_MEM_HOT_PLUGGABLE) { + if (!(ma->flags & ACPI_SRAT_MEM_HOT_PLUGGABLE)) { + nd = &nodes[node]; + if (!node_test_and_set(node, nodes_parsed)) { + nd->start = start; + nd->end = end; + } else { + if (start < nd->start) + nd->start = start; + if (nd->end < end) + nd->end = end; + } + } else update_nodes_add(node, start, end); - /* restore nodes[node] */ - *nd = oldnode; - if ((nd->start | nd->end) == 0) - node_clear(node, nodes_parsed); - } node_memblk_range[num_node_memblks].start = start; node_memblk_range[num_node_memblks].end = end; From 86ef4dbf1f736bb1a4d567e043e3dd81b8b7860c Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 16 Feb 2011 12:13:06 +0100 Subject: [PATCH 248/706] x86, NUMA: Drop @start/last_pfn from initmem_init() initmem_init() extensively accesses and modifies global data structures and the parameters aren't even followed depending on which path is being used. Drop @start/last_pfn and let it deal with @max_pfn directly. This is in preparation for further NUMA init cleanups. - v2: x86-32 initmem_init() weren't updated breaking 32bit builds. Fixed. Found by Yinghai. Signed-off-by: Tejun Heo Cc: Yinghai Lu Cc: Brian Gerst Cc: Cyrill Gorcunov Cc: Shaohui Zheng Cc: David Rientjes Cc: Ingo Molnar Cc: H. 
Peter Anvin --- arch/x86/include/asm/page_types.h | 3 +-- arch/x86/kernel/setup.c | 2 +- arch/x86/mm/init_32.c | 3 +-- arch/x86/mm/init_64.c | 5 ++--- arch/x86/mm/numa_32.c | 3 +-- arch/x86/mm/numa_64.c | 21 ++++++++------------- 6 files changed, 14 insertions(+), 23 deletions(-) diff --git a/arch/x86/include/asm/page_types.h b/arch/x86/include/asm/page_types.h index 731d211a1b20..eb9ed00355a8 100644 --- a/arch/x86/include/asm/page_types.h +++ b/arch/x86/include/asm/page_types.h @@ -56,8 +56,7 @@ extern unsigned long init_memory_mapping(unsigned long start, void init_memory_mapping_high(void); -extern void initmem_init(unsigned long start_pfn, unsigned long end_pfn, - int acpi, int k8); +extern void initmem_init(int acpi, int k8); extern void free_initmem(void); #endif /* !__ASSEMBLY__ */ diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index ac909ba297b9..756d640723f9 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -1003,7 +1003,7 @@ void __init setup_arch(char **cmdline_p) amd = !amd_numa_init(0, max_pfn); #endif - initmem_init(0, max_pfn, acpi, amd); + initmem_init(acpi, amd); memblock_find_dma_reserve(); dma32_reserve_bootmem(); diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index c821074b7f0b..16adb6665603 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c @@ -644,8 +644,7 @@ void __init find_low_pfn_range(void) } #ifndef CONFIG_NEED_MULTIPLE_NODES -void __init initmem_init(unsigned long start_pfn, unsigned long end_pfn, - int acpi, int k8) +void __init initmem_init(int acpi, int k8) { #ifdef CONFIG_HIGHMEM highstart_pfn = highend_pfn = max_pfn; diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index 194f2732ab77..04cc027e5437 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -603,10 +603,9 @@ kernel_physical_mapping_init(unsigned long start, } #ifndef CONFIG_NUMA -void __init initmem_init(unsigned long start_pfn, unsigned long end_pfn, - int acpi, int k8) +void __init initmem_init(int acpi, int k8) { - memblock_x86_register_active_regions(0, start_pfn, end_pfn); + memblock_x86_register_active_regions(0, 0, max_pfn); init_memory_mapping_high(); } #endif diff --git a/arch/x86/mm/numa_32.c b/arch/x86/mm/numa_32.c index 505bb04654b5..3249b374732f 100644 --- a/arch/x86/mm/numa_32.c +++ b/arch/x86/mm/numa_32.c @@ -352,8 +352,7 @@ static void init_remap_allocator(int nid) (ulong) node_remap_end_vaddr[nid]); } -void __init initmem_init(unsigned long start_pfn, unsigned long end_pfn, - int acpi, int k8) +void __init initmem_init(int acpi, int k8) { int nid; long kva_target_pfn; diff --git a/arch/x86/mm/numa_64.c b/arch/x86/mm/numa_64.c index b7d78d7a2473..d7e4aafd0759 100644 --- a/arch/x86/mm/numa_64.c +++ b/arch/x86/mm/numa_64.c @@ -579,8 +579,7 @@ static int __init numa_emulation(unsigned long start_pfn, } #endif /* CONFIG_NUMA_EMU */ -void __init initmem_init(unsigned long start_pfn, unsigned long last_pfn, - int acpi, int amd) +void __init initmem_init(int acpi, int amd) { int i; @@ -588,19 +587,16 @@ void __init initmem_init(unsigned long start_pfn, unsigned long last_pfn, nodes_clear(node_online_map); #ifdef CONFIG_NUMA_EMU - setup_physnodes(start_pfn << PAGE_SHIFT, last_pfn << PAGE_SHIFT, - acpi, amd); - if (cmdline && !numa_emulation(start_pfn, last_pfn, acpi, amd)) + setup_physnodes(0, max_pfn << PAGE_SHIFT, acpi, amd); + if (cmdline && !numa_emulation(0, max_pfn, acpi, amd)) return; - setup_physnodes(start_pfn << PAGE_SHIFT, last_pfn << PAGE_SHIFT, - acpi, amd); + setup_physnodes(0, max_pfn << 
PAGE_SHIFT, acpi, amd); nodes_clear(node_possible_map); nodes_clear(node_online_map); #endif #ifdef CONFIG_ACPI_NUMA - if (!numa_off && acpi && !acpi_scan_nodes(start_pfn << PAGE_SHIFT, - last_pfn << PAGE_SHIFT)) + if (!numa_off && acpi && !acpi_scan_nodes(0, max_pfn << PAGE_SHIFT)) return; nodes_clear(node_possible_map); nodes_clear(node_online_map); @@ -616,8 +612,7 @@ void __init initmem_init(unsigned long start_pfn, unsigned long last_pfn, numa_off ? "NUMA turned off" : "No NUMA configuration found"); printk(KERN_INFO "Faking a node at %016lx-%016lx\n", - start_pfn << PAGE_SHIFT, - last_pfn << PAGE_SHIFT); + 0LU, max_pfn << PAGE_SHIFT); /* setup dummy node covering all memory */ memnode_shift = 63; memnodemap = memnode.embedded_map; @@ -626,9 +621,9 @@ void __init initmem_init(unsigned long start_pfn, unsigned long last_pfn, node_set(0, node_possible_map); for (i = 0; i < MAX_LOCAL_APIC; i++) set_apicid_to_node(i, NUMA_NO_NODE); - memblock_x86_register_active_regions(0, start_pfn, last_pfn); + memblock_x86_register_active_regions(0, 0, max_pfn); init_memory_mapping_high(); - setup_node_bootmem(0, start_pfn << PAGE_SHIFT, last_pfn << PAGE_SHIFT); + setup_node_bootmem(0, 0, max_pfn << PAGE_SHIFT); numa_init_array(); } From 940fed2e79a15cf0d006c860d7811adbe5c19882 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 16 Feb 2011 12:13:06 +0100 Subject: [PATCH 249/706] x86-64, NUMA: Unify {acpi|amd}_{numa_init|scan_nodes}() arguments and return values The functions used during NUMA initialization - *_numa_init() and *_scan_nodes() - have different arguments and return values. Unify them such that they all take no argument and return 0 on success and -errno on failure. This is in preparation for further NUMA init cleanups. Signed-off-by: Tejun Heo Cc: Yinghai Lu Cc: Brian Gerst Cc: Cyrill Gorcunov Cc: Shaohui Zheng Cc: David Rientjes Cc: Ingo Molnar Cc: H. Peter Anvin --- arch/x86/include/asm/acpi.h | 2 +- arch/x86/include/asm/amd_nb.h | 2 +- arch/x86/kernel/setup.c | 4 ++-- arch/x86/mm/amdtopology_64.c | 18 +++++++++--------- arch/x86/mm/numa_64.c | 2 +- arch/x86/mm/srat_64.c | 4 ++-- drivers/acpi/numa.c | 9 ++++++--- 7 files changed, 22 insertions(+), 19 deletions(-) diff --git a/arch/x86/include/asm/acpi.h b/arch/x86/include/asm/acpi.h index 211ca3f7fd16..4e5dff9e0b39 100644 --- a/arch/x86/include/asm/acpi.h +++ b/arch/x86/include/asm/acpi.h @@ -187,7 +187,7 @@ struct bootnode; extern int acpi_numa; extern void acpi_get_nodes(struct bootnode *physnodes, unsigned long start, unsigned long end); -extern int acpi_scan_nodes(unsigned long start, unsigned long end); +extern int acpi_scan_nodes(void); #define NR_NODE_MEMBLKS (MAX_NUMNODES*2) #ifdef CONFIG_NUMA_EMU diff --git a/arch/x86/include/asm/amd_nb.h b/arch/x86/include/asm/amd_nb.h index 2b33c4df979f..dc3c6e34da1d 100644 --- a/arch/x86/include/asm/amd_nb.h +++ b/arch/x86/include/asm/amd_nb.h @@ -16,7 +16,7 @@ struct bootnode; extern int early_is_amd_nb(u32 value); extern int amd_cache_northbridges(void); extern void amd_flush_garts(void); -extern int amd_numa_init(unsigned long start_pfn, unsigned long end_pfn); +extern int amd_numa_init(void); extern int amd_scan_nodes(void); extern int amd_get_subcaches(int); extern int amd_set_subcaches(int, int); diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 756d640723f9..96810a3c6003 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -995,12 +995,12 @@ void __init setup_arch(char **cmdline_p) /* * Parse SRAT to discover nodes. 
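 * A minimal sketch of the convention being introduced here (not a
 * quote of any single call site): each *_numa_init() hook now
 * returns 0 on success and -errno on failure, so the boolean
 * success flags are derived by plain negation, as in the hunk
 * that follows:
 *
 *	acpi = !acpi_numa_init();
 *	if (!acpi)
 *		amd = !amd_numa_init();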
*/ - acpi = acpi_numa_init(); + acpi = !acpi_numa_init(); #endif #ifdef CONFIG_AMD_NUMA if (!acpi) - amd = !amd_numa_init(0, max_pfn); + amd = !amd_numa_init(); #endif initmem_init(acpi, amd); diff --git a/arch/x86/mm/amdtopology_64.c b/arch/x86/mm/amdtopology_64.c index 2523c3554de5..655ccffc6ee5 100644 --- a/arch/x86/mm/amdtopology_64.c +++ b/arch/x86/mm/amdtopology_64.c @@ -51,7 +51,7 @@ static __init int find_northbridge(void) return num; } - return -1; + return -ENOENT; } static __init void early_get_boot_cpu_id(void) @@ -69,17 +69,17 @@ static __init void early_get_boot_cpu_id(void) #endif } -int __init amd_numa_init(unsigned long start_pfn, unsigned long end_pfn) +int __init amd_numa_init(void) { - unsigned long start = PFN_PHYS(start_pfn); - unsigned long end = PFN_PHYS(end_pfn); + unsigned long start = PFN_PHYS(0); + unsigned long end = PFN_PHYS(max_pfn); unsigned numnodes; unsigned long prevbase; int i, nb, found = 0; u32 nodeid, reg; if (!early_pci_allowed()) - return -1; + return -EINVAL; nb = find_northbridge(); if (nb < 0) @@ -90,7 +90,7 @@ int __init amd_numa_init(unsigned long start_pfn, unsigned long end_pfn) reg = read_pci_config(0, nb, 0, 0x60); numnodes = ((reg >> 4) & 0xF) + 1; if (numnodes <= 1) - return -1; + return -ENOENT; pr_info("Number of physical nodes %d\n", numnodes); @@ -121,7 +121,7 @@ int __init amd_numa_init(unsigned long start_pfn, unsigned long end_pfn) if ((base >> 8) & 3 || (limit >> 8) & 3) { pr_err("Node %d using interleaving mode %lx/%lx\n", nodeid, (base >> 8) & 3, (limit >> 8) & 3); - return -1; + return -EINVAL; } if (node_isset(nodeid, nodes_parsed)) { pr_info("Node %d already present, skipping\n", @@ -160,7 +160,7 @@ int __init amd_numa_init(unsigned long start_pfn, unsigned long end_pfn) if (prevbase > base) { pr_err("Node map not sorted %lx,%lx\n", prevbase, base); - return -1; + return -EINVAL; } pr_info("Node %d MemBase %016lx Limit %016lx\n", @@ -177,7 +177,7 @@ int __init amd_numa_init(unsigned long start_pfn, unsigned long end_pfn) } if (!found) - return -1; + return -ENOENT; return 0; } diff --git a/arch/x86/mm/numa_64.c b/arch/x86/mm/numa_64.c index d7e4aafd0759..a083f515f004 100644 --- a/arch/x86/mm/numa_64.c +++ b/arch/x86/mm/numa_64.c @@ -596,7 +596,7 @@ void __init initmem_init(int acpi, int amd) #endif #ifdef CONFIG_ACPI_NUMA - if (!numa_off && acpi && !acpi_scan_nodes(0, max_pfn << PAGE_SHIFT)) + if (!numa_off && acpi && !acpi_scan_nodes()) return; nodes_clear(node_possible_map); nodes_clear(node_online_map); diff --git a/arch/x86/mm/srat_64.c b/arch/x86/mm/srat_64.c index 988b0b70ff39..4f9dbf066ca4 100644 --- a/arch/x86/mm/srat_64.c +++ b/arch/x86/mm/srat_64.c @@ -359,7 +359,7 @@ void __init acpi_get_nodes(struct bootnode *physnodes, unsigned long start, #endif /* CONFIG_NUMA_EMU */ /* Use the information discovered above to actually set up the nodes. 
*/ -int __init acpi_scan_nodes(unsigned long start, unsigned long end) +int __init acpi_scan_nodes(void) { int i; @@ -368,7 +368,7 @@ int __init acpi_scan_nodes(unsigned long start, unsigned long end) /* First clean up the node list */ for (i = 0; i < MAX_NUMNODES; i++) - cutoff_node(i, start, end); + cutoff_node(i, 0, max_pfn << PAGE_SHIFT); /* * Join together blocks on the same node, holes between diff --git a/drivers/acpi/numa.c b/drivers/acpi/numa.c index 5eb25eb3ea48..3b5c3189fd99 100644 --- a/drivers/acpi/numa.c +++ b/drivers/acpi/numa.c @@ -274,7 +274,7 @@ acpi_table_parse_srat(enum acpi_srat_type id, int __init acpi_numa_init(void) { - int ret = 0; + int cnt = 0; /* * Should not limit number with cpu num that is from NR_CPUS or nr_cpus= @@ -288,7 +288,7 @@ int __init acpi_numa_init(void) acpi_parse_x2apic_affinity, 0); acpi_table_parse_srat(ACPI_SRAT_TYPE_CPU_AFFINITY, acpi_parse_processor_affinity, 0); - ret = acpi_table_parse_srat(ACPI_SRAT_TYPE_MEMORY_AFFINITY, + cnt = acpi_table_parse_srat(ACPI_SRAT_TYPE_MEMORY_AFFINITY, acpi_parse_memory_affinity, NR_NODE_MEMBLKS); } @@ -297,7 +297,10 @@ int __init acpi_numa_init(void) acpi_table_parse(ACPI_SIG_SLIT, acpi_parse_slit); acpi_numa_arch_fixup(); - return ret; + + if (cnt <= 0) + return cnt ?: -ENOENT; + return 0; } int acpi_get_pxm(acpi_handle h) From a9aec56afac238e4ed3980bd10b22121b83866dd Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 16 Feb 2011 12:13:06 +0100 Subject: [PATCH 250/706] x86-64, NUMA: Wrap acpi_numa_init() so that failure can be indicated by return value Because of the way ACPI tables are parsed, the generic acpi_numa_init() couldn't return failure when error was detected by arch hooks. Instead, the failure state was recorded and later arch dependent init hook - acpi_scan_nodes() - would fail. Wrap acpi_numa_init() with x86_acpi_numa_init() so that failure can be indicated as return value immediately. This is in preparation for further NUMA init cleanups. Signed-off-by: Tejun Heo Cc: Yinghai Lu Cc: Brian Gerst Cc: Cyrill Gorcunov Cc: Shaohui Zheng Cc: David Rientjes Cc: Ingo Molnar Cc: H. Peter Anvin --- arch/x86/include/asm/acpi.h | 1 + arch/x86/kernel/setup.c | 2 +- arch/x86/mm/srat_64.c | 10 ++++++++++ 3 files changed, 12 insertions(+), 1 deletion(-) diff --git a/arch/x86/include/asm/acpi.h b/arch/x86/include/asm/acpi.h index 4e5dff9e0b39..06fb7865eff2 100644 --- a/arch/x86/include/asm/acpi.h +++ b/arch/x86/include/asm/acpi.h @@ -187,6 +187,7 @@ struct bootnode; extern int acpi_numa; extern void acpi_get_nodes(struct bootnode *physnodes, unsigned long start, unsigned long end); +extern int x86_acpi_numa_init(void); extern int acpi_scan_nodes(void); #define NR_NODE_MEMBLKS (MAX_NUMNODES*2) diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 96810a3c6003..c9a139c3056b 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -995,7 +995,7 @@ void __init setup_arch(char **cmdline_p) /* * Parse SRAT to discover nodes. */ - acpi = !acpi_numa_init(); + acpi = !x86_acpi_numa_init(); #endif #ifdef CONFIG_AMD_NUMA diff --git a/arch/x86/mm/srat_64.c b/arch/x86/mm/srat_64.c index 4f9dbf066ca4..56b92635d87c 100644 --- a/arch/x86/mm/srat_64.c +++ b/arch/x86/mm/srat_64.c @@ -358,6 +358,16 @@ void __init acpi_get_nodes(struct bootnode *physnodes, unsigned long start, } #endif /* CONFIG_NUMA_EMU */ +int __init x86_acpi_numa_init(void) +{ + int ret; + + ret = acpi_numa_init(); + if (ret < 0) + return ret; + return srat_disabled() ? 
-EINVAL : 0; +} + /* Use the information discovered above to actually set up the nodes. */ int __init acpi_scan_nodes(void) { From d8fc3afc49bb226c20e37f48a4ddd493cd092837 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 16 Feb 2011 12:13:06 +0100 Subject: [PATCH 251/706] x86, NUMA: Move *_numa_init() invocations into initmem_init() There's no reason for these to live in setup_arch(). Move them inside initmem_init(). - v2: x86-32 initmem_init() weren't updated breaking 32bit builds. Fixed. Found by Ankita. Signed-off-by: Tejun Heo Cc: Ankita Garg Cc: Yinghai Lu Cc: Brian Gerst Cc: Cyrill Gorcunov Cc: Shaohui Zheng Cc: David Rientjes Cc: Ingo Molnar Cc: H. Peter Anvin --- arch/x86/include/asm/page_types.h | 2 +- arch/x86/kernel/setup.c | 16 +--------------- arch/x86/mm/init_32.c | 2 +- arch/x86/mm/init_64.c | 2 +- arch/x86/mm/numa_32.c | 2 +- arch/x86/mm/numa_64.c | 16 +++++++++++++++- 6 files changed, 20 insertions(+), 20 deletions(-) diff --git a/arch/x86/include/asm/page_types.h b/arch/x86/include/asm/page_types.h index eb9ed00355a8..97e6007e4edd 100644 --- a/arch/x86/include/asm/page_types.h +++ b/arch/x86/include/asm/page_types.h @@ -56,7 +56,7 @@ extern unsigned long init_memory_mapping(unsigned long start, void init_memory_mapping_high(void); -extern void initmem_init(int acpi, int k8); +extern void initmem_init(void); extern void free_initmem(void); #endif /* !__ASSEMBLY__ */ diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index c9a139c3056b..46e684f85b36 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -719,8 +719,6 @@ early_param("reservelow", parse_reservelow); void __init setup_arch(char **cmdline_p) { - int acpi = 0; - int amd = 0; unsigned long flags; #ifdef CONFIG_X86_32 @@ -991,19 +989,7 @@ void __init setup_arch(char **cmdline_p) early_acpi_boot_init(); -#ifdef CONFIG_ACPI_NUMA - /* - * Parse SRAT to discover nodes. 
- */ - acpi = !x86_acpi_numa_init(); -#endif - -#ifdef CONFIG_AMD_NUMA - if (!acpi) - amd = !amd_numa_init(); -#endif - - initmem_init(acpi, amd); + initmem_init(); memblock_find_dma_reserve(); dma32_reserve_bootmem(); diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index 16adb6665603..5d43fa5141c6 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c @@ -644,7 +644,7 @@ void __init find_low_pfn_range(void) } #ifndef CONFIG_NEED_MULTIPLE_NODES -void __init initmem_init(int acpi, int k8) +void __init initmem_init(void) { #ifdef CONFIG_HIGHMEM highstart_pfn = highend_pfn = max_pfn; diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index 04cc027e5437..4f1f461fc1e9 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -603,7 +603,7 @@ kernel_physical_mapping_init(unsigned long start, } #ifndef CONFIG_NUMA -void __init initmem_init(int acpi, int k8) +void __init initmem_init(void) { memblock_x86_register_active_regions(0, 0, max_pfn); init_memory_mapping_high(); diff --git a/arch/x86/mm/numa_32.c b/arch/x86/mm/numa_32.c index 3249b374732f..bde3906420df 100644 --- a/arch/x86/mm/numa_32.c +++ b/arch/x86/mm/numa_32.c @@ -352,7 +352,7 @@ static void init_remap_allocator(int nid) (ulong) node_remap_end_vaddr[nid]); } -void __init initmem_init(int acpi, int k8) +void __init initmem_init(void) { int nid; long kva_target_pfn; diff --git a/arch/x86/mm/numa_64.c b/arch/x86/mm/numa_64.c index a083f515f004..656b0cffda63 100644 --- a/arch/x86/mm/numa_64.c +++ b/arch/x86/mm/numa_64.c @@ -13,6 +13,7 @@ #include #include #include +#include #include #include @@ -579,10 +580,23 @@ static int __init numa_emulation(unsigned long start_pfn, } #endif /* CONFIG_NUMA_EMU */ -void __init initmem_init(int acpi, int amd) +void __init initmem_init(void) { + int acpi = 0, amd = 0; int i; +#ifdef CONFIG_ACPI_NUMA + /* + * Parse SRAT to discover nodes. + */ + acpi = !x86_acpi_numa_init(); +#endif + +#ifdef CONFIG_AMD_NUMA + if (!acpi) + amd = !amd_numa_init(); +#endif + nodes_clear(node_possible_map); nodes_clear(node_online_map); From ffe77a4605fb2588f8666850ad3e3b196241658f Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 16 Feb 2011 12:13:06 +0100 Subject: [PATCH 252/706] x86-64, NUMA: Restructure initmem_init() Reorganize initmem_init() such that, * Different NUMA init methods are iterated in a consistent way. * Each iteration re-initializes all the parameters and different method can be tried after a failure. * Dummy init is handled the same as other methods. Apart from how retry after failure, this patch doesn't change the behavior. The call sequences are kept equivalent across the conversion. After the change, bad_srat() doesn't need to clear apic to node mapping or worry about numa_off. Simplified accordingly. Signed-off-by: Tejun Heo Cc: Yinghai Lu Cc: Brian Gerst Cc: Cyrill Gorcunov Cc: Shaohui Zheng Cc: David Rientjes Cc: Ingo Molnar Cc: H. Peter Anvin --- arch/x86/mm/numa_64.c | 94 +++++++++++++++++++++++-------------------- arch/x86/mm/srat_64.c | 4 +- 2 files changed, 52 insertions(+), 46 deletions(-) diff --git a/arch/x86/mm/numa_64.c b/arch/x86/mm/numa_64.c index 656b0cffda63..c984e3431cc7 100644 --- a/arch/x86/mm/numa_64.c +++ b/arch/x86/mm/numa_64.c @@ -580,65 +580,73 @@ static int __init numa_emulation(unsigned long start_pfn, } #endif /* CONFIG_NUMA_EMU */ -void __init initmem_init(void) +static int dummy_numa_init(void) { - int acpi = 0, amd = 0; - int i; + return 0; +} -#ifdef CONFIG_ACPI_NUMA - /* - * Parse SRAT to discover nodes. 
- */ - acpi = !x86_acpi_numa_init(); -#endif - -#ifdef CONFIG_AMD_NUMA - if (!acpi) - amd = !amd_numa_init(); -#endif - - nodes_clear(node_possible_map); - nodes_clear(node_online_map); - -#ifdef CONFIG_NUMA_EMU - setup_physnodes(0, max_pfn << PAGE_SHIFT, acpi, amd); - if (cmdline && !numa_emulation(0, max_pfn, acpi, amd)) - return; - setup_physnodes(0, max_pfn << PAGE_SHIFT, acpi, amd); - nodes_clear(node_possible_map); - nodes_clear(node_online_map); -#endif - -#ifdef CONFIG_ACPI_NUMA - if (!numa_off && acpi && !acpi_scan_nodes()) - return; - nodes_clear(node_possible_map); - nodes_clear(node_online_map); -#endif - -#ifdef CONFIG_AMD_NUMA - if (!numa_off && amd && !amd_scan_nodes()) - return; - nodes_clear(node_possible_map); - nodes_clear(node_online_map); -#endif +static int dummy_scan_nodes(void) +{ printk(KERN_INFO "%s\n", numa_off ? "NUMA turned off" : "No NUMA configuration found"); - printk(KERN_INFO "Faking a node at %016lx-%016lx\n", 0LU, max_pfn << PAGE_SHIFT); + /* setup dummy node covering all memory */ memnode_shift = 63; memnodemap = memnode.embedded_map; memnodemap[0] = 0; node_set_online(0); node_set(0, node_possible_map); - for (i = 0; i < MAX_LOCAL_APIC; i++) - set_apicid_to_node(i, NUMA_NO_NODE); memblock_x86_register_active_regions(0, 0, max_pfn); init_memory_mapping_high(); setup_node_bootmem(0, 0, max_pfn << PAGE_SHIFT); numa_init_array(); + + return 0; +} + +void __init initmem_init(void) +{ + int (*numa_init[])(void) = { [2] = dummy_numa_init }; + int (*scan_nodes[])(void) = { [2] = dummy_scan_nodes }; + int i, j; + + if (!numa_off) { +#ifdef CONFIG_ACPI_NUMA + numa_init[0] = x86_acpi_numa_init; + scan_nodes[0] = acpi_scan_nodes; +#endif +#ifdef CONFIG_AMD_NUMA + numa_init[1] = amd_numa_init; + scan_nodes[1] = amd_scan_nodes; +#endif + } + + for (i = 0; i < ARRAY_SIZE(numa_init); i++) { + if (!numa_init[i]) + continue; + + for (j = 0; j < MAX_LOCAL_APIC; j++) + set_apicid_to_node(j, NUMA_NO_NODE); + + nodes_clear(node_possible_map); + nodes_clear(node_online_map); + + if (numa_init[i]() < 0) + continue; +#ifdef CONFIG_NUMA_EMU + setup_physnodes(0, max_pfn << PAGE_SHIFT, i == 0, i == 1); + if (cmdline && !numa_emulation(0, max_pfn, i == 0, i == 1)) + return; + setup_physnodes(0, max_pfn << PAGE_SHIFT, i == 0, i == 1); + nodes_clear(node_possible_map); + nodes_clear(node_online_map); +#endif + if (!scan_nodes[i]()) + return; + } + BUG(); } unsigned long __init numa_free_all_bootmem(void) diff --git a/arch/x86/mm/srat_64.c b/arch/x86/mm/srat_64.c index 56b92635d87c..597e011cfb51 100644 --- a/arch/x86/mm/srat_64.c +++ b/arch/x86/mm/srat_64.c @@ -78,8 +78,6 @@ static __init void bad_srat(void) int i; printk(KERN_ERR "SRAT: SRAT not used.\n"); acpi_numa = -1; - for (i = 0; i < MAX_LOCAL_APIC; i++) - set_apicid_to_node(i, NUMA_NO_NODE); for (i = 0; i < MAX_NUMNODES; i++) { nodes[i].start = nodes[i].end = 0; nodes_add[i].start = nodes_add[i].end = 0; @@ -89,7 +87,7 @@ static __init void bad_srat(void) static __init inline int srat_disabled(void) { - return numa_off || acpi_numa < 0; + return acpi_numa < 0; } /* Callback for SLIT parsing */ From ec8cf29b1d39aeb6ef98bc589f0c9a33a8f94c49 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 16 Feb 2011 12:13:07 +0100 Subject: [PATCH 253/706] x86-64, NUMA: Use common {cpu|mem}_nodes_parsed ACPI and amd are using separate nodes_parsed masks. Add {cpu|mem}_nodes_parsed and use them in all NUMA init methods. Initialization of the masks and building node_possible_map are now handled commonly by initmem_init(). 
dummy_numa_init() is updated to set node 0 on both masks. While at it, move the info messages from scan to init. Signed-off-by: Tejun Heo Cc: Yinghai Lu Cc: Brian Gerst Cc: Cyrill Gorcunov Cc: Shaohui Zheng Cc: David Rientjes Cc: Ingo Molnar Cc: H. Peter Anvin --- arch/x86/include/asm/numa_64.h | 3 +++ arch/x86/mm/amdtopology_64.c | 10 ++++------ arch/x86/mm/numa_64.c | 25 ++++++++++++++++++------- arch/x86/mm/srat_64.c | 17 ++++++----------- 4 files changed, 31 insertions(+), 24 deletions(-) diff --git a/arch/x86/include/asm/numa_64.h b/arch/x86/include/asm/numa_64.h index 2819afa33632..de459365d52b 100644 --- a/arch/x86/include/asm/numa_64.h +++ b/arch/x86/include/asm/numa_64.h @@ -27,6 +27,9 @@ extern void setup_node_bootmem(int nodeid, unsigned long start, */ #define NODE_MIN_SIZE (4*1024*1024) +extern nodemask_t cpu_nodes_parsed __initdata; +extern nodemask_t mem_nodes_parsed __initdata; + extern int __cpuinit numa_cpu_node(int cpu); #ifdef CONFIG_NUMA_EMU diff --git a/arch/x86/mm/amdtopology_64.c b/arch/x86/mm/amdtopology_64.c index 655ccffc6ee5..4f822a247125 100644 --- a/arch/x86/mm/amdtopology_64.c +++ b/arch/x86/mm/amdtopology_64.c @@ -28,7 +28,6 @@ static struct bootnode __initdata nodes[8]; static unsigned char __initdata nodeids[8]; -static nodemask_t __initdata nodes_parsed = NODE_MASK_NONE; static __init int find_northbridge(void) { @@ -123,7 +122,7 @@ int __init amd_numa_init(void) nodeid, (base >> 8) & 3, (limit >> 8) & 3); return -EINVAL; } - if (node_isset(nodeid, nodes_parsed)) { + if (node_isset(nodeid, mem_nodes_parsed)) { pr_info("Node %d already present, skipping\n", nodeid); continue; @@ -173,7 +172,8 @@ int __init amd_numa_init(void) prevbase = base; - node_set(nodeid, nodes_parsed); + node_set(nodeid, mem_nodes_parsed); + node_set(nodeid, cpu_nodes_parsed); } if (!found) @@ -190,7 +190,7 @@ void __init amd_get_nodes(struct bootnode *physnodes) { int i; - for_each_node_mask(i, nodes_parsed) { + for_each_node_mask(i, mem_nodes_parsed) { physnodes[i].start = nodes[i].start; physnodes[i].end = nodes[i].end; } @@ -258,8 +258,6 @@ int __init amd_scan_nodes(void) unsigned int apicid_base; int i; - BUG_ON(nodes_empty(nodes_parsed)); - node_possible_map = nodes_parsed; memnode_shift = compute_hash_shift(nodes, 8, NULL); if (memnode_shift < 0) { pr_err("No NUMA node hash function found. Contact maintainer\n"); diff --git a/arch/x86/mm/numa_64.c b/arch/x86/mm/numa_64.c index c984e3431cc7..4404e1d649ac 100644 --- a/arch/x86/mm/numa_64.c +++ b/arch/x86/mm/numa_64.c @@ -25,6 +25,9 @@ struct pglist_data *node_data[MAX_NUMNODES] __read_mostly; EXPORT_SYMBOL(node_data); +nodemask_t cpu_nodes_parsed __initdata; +nodemask_t mem_nodes_parsed __initdata; + struct memnode memnode; static unsigned long __initdata nodemap_addr; @@ -581,23 +584,24 @@ static int __init numa_emulation(unsigned long start_pfn, #endif /* CONFIG_NUMA_EMU */ static int dummy_numa_init(void) -{ - return 0; -} - -static int dummy_scan_nodes(void) { printk(KERN_INFO "%s\n", numa_off ? 
"NUMA turned off" : "No NUMA configuration found"); printk(KERN_INFO "Faking a node at %016lx-%016lx\n", 0LU, max_pfn << PAGE_SHIFT); + node_set(0, cpu_nodes_parsed); + node_set(0, mem_nodes_parsed); + + return 0; +} + +static int dummy_scan_nodes(void) +{ /* setup dummy node covering all memory */ memnode_shift = 63; memnodemap = memnode.embedded_map; memnodemap[0] = 0; - node_set_online(0); - node_set(0, node_possible_map); memblock_x86_register_active_regions(0, 0, max_pfn); init_memory_mapping_high(); setup_node_bootmem(0, 0, max_pfn << PAGE_SHIFT); @@ -630,6 +634,8 @@ void __init initmem_init(void) for (j = 0; j < MAX_LOCAL_APIC; j++) set_apicid_to_node(j, NUMA_NO_NODE); + nodes_clear(cpu_nodes_parsed); + nodes_clear(mem_nodes_parsed); nodes_clear(node_possible_map); nodes_clear(node_online_map); @@ -643,6 +649,11 @@ void __init initmem_init(void) nodes_clear(node_possible_map); nodes_clear(node_online_map); #endif + /* Account for nodes with cpus and no memory */ + nodes_or(node_possible_map, mem_nodes_parsed, cpu_nodes_parsed); + if (WARN_ON(nodes_empty(node_possible_map))) + continue; + if (!scan_nodes[i]()) return; } diff --git a/arch/x86/mm/srat_64.c b/arch/x86/mm/srat_64.c index 597e011cfb51..33e72ec4fa4c 100644 --- a/arch/x86/mm/srat_64.c +++ b/arch/x86/mm/srat_64.c @@ -28,8 +28,6 @@ int acpi_numa __initdata; static struct acpi_table_slit *acpi_slit; -static nodemask_t nodes_parsed __initdata; -static nodemask_t cpu_nodes_parsed __initdata; static struct bootnode nodes[MAX_NUMNODES] __initdata; static struct bootnode nodes_add[MAX_NUMNODES]; @@ -293,7 +291,7 @@ acpi_numa_memory_affinity_init(struct acpi_srat_mem_affinity *ma) if (!(ma->flags & ACPI_SRAT_MEM_HOT_PLUGGABLE)) { nd = &nodes[node]; - if (!node_test_and_set(node, nodes_parsed)) { + if (!node_test_and_set(node, mem_nodes_parsed)) { nd->start = start; nd->end = end; } else { @@ -319,7 +317,7 @@ static int __init nodes_cover_memory(const struct bootnode *nodes) unsigned long pxmram, e820ram; pxmram = 0; - for_each_node_mask(i, nodes_parsed) { + for_each_node_mask(i, mem_nodes_parsed) { unsigned long s = nodes[i].start >> PAGE_SHIFT; unsigned long e = nodes[i].end >> PAGE_SHIFT; pxmram += e - s; @@ -348,7 +346,7 @@ void __init acpi_get_nodes(struct bootnode *physnodes, unsigned long start, { int i; - for_each_node_mask(i, nodes_parsed) { + for_each_node_mask(i, mem_nodes_parsed) { cutoff_node(i, start, end); physnodes[i].start = nodes[i].start; physnodes[i].end = nodes[i].end; @@ -449,9 +447,6 @@ int __init acpi_scan_nodes(void) init_memory_mapping_high(); - /* Account for nodes with cpus and no memory */ - nodes_or(node_possible_map, nodes_parsed, cpu_nodes_parsed); - /* Finally register nodes */ for_each_node_mask(i, node_possible_map) setup_node_bootmem(i, nodes[i].start, nodes[i].end); @@ -485,7 +480,7 @@ static int __init find_node_by_addr(unsigned long addr) int ret = NUMA_NO_NODE; int i; - for_each_node_mask(i, nodes_parsed) { + for_each_node_mask(i, mem_nodes_parsed) { /* * Find the real node that this emulated node appears on. 
For * the sake of simplicity, we only use a real node's starting @@ -545,10 +540,10 @@ void __init acpi_fake_nodes(const struct bootnode *fake_nodes, int num_nodes) __acpi_map_pxm_to_node(fake_node_to_pxm_map[i], i); memcpy(__apicid_to_node, fake_apicid_to_node, sizeof(__apicid_to_node)); - nodes_clear(nodes_parsed); + nodes_clear(mem_nodes_parsed); for (i = 0; i < num_nodes; i++) if (fake_nodes[i].start != fake_nodes[i].end) - node_set(i, nodes_parsed); + node_set(i, mem_nodes_parsed); } static int null_slit_node_compare(int a, int b) From 99df738cd28cc39054cd1a77685d4a94ed2193a4 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 16 Feb 2011 12:13:07 +0100 Subject: [PATCH 254/706] x86-64, NUMA: Remove local variable found from amd_numa_init() Use weight count on mem_nodes_parsed instead. Signed-off-by: Tejun Heo Cc: Yinghai Lu Cc: Brian Gerst Cc: Cyrill Gorcunov Cc: Shaohui Zheng Cc: David Rientjes Cc: Ingo Molnar Cc: H. Peter Anvin --- arch/x86/mm/amdtopology_64.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/arch/x86/mm/amdtopology_64.c b/arch/x86/mm/amdtopology_64.c index 4f822a247125..7d85cf7e0324 100644 --- a/arch/x86/mm/amdtopology_64.c +++ b/arch/x86/mm/amdtopology_64.c @@ -74,7 +74,7 @@ int __init amd_numa_init(void) unsigned long end = PFN_PHYS(max_pfn); unsigned numnodes; unsigned long prevbase; - int i, nb, found = 0; + int i, nb; u32 nodeid, reg; if (!early_pci_allowed()) @@ -165,8 +165,6 @@ int __init amd_numa_init(void) pr_info("Node %d MemBase %016lx Limit %016lx\n", nodeid, base, limit); - found++; - nodes[nodeid].start = base; nodes[nodeid].end = limit; @@ -176,7 +174,7 @@ int __init amd_numa_init(void) node_set(nodeid, cpu_nodes_parsed); } - if (!found) + if (!nodes_weight(mem_nodes_parsed)) return -ENOENT; return 0; } From 45fe6c78c4ccc384044d1b4877eebe7acf359e76 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 16 Feb 2011 12:13:07 +0100 Subject: [PATCH 255/706] x86-64, NUMA: Move apicid to numa mapping initialization from amd_scan_nodes() to amd_numa_init() This brings amd initialization behavior closer to that of acpi. Signed-off-by: Tejun Heo Cc: Yinghai Lu Cc: Brian Gerst Cc: Cyrill Gorcunov Cc: Shaohui Zheng Cc: David Rientjes Cc: Ingo Molnar Cc: H. Peter Anvin --- arch/x86/mm/amdtopology_64.c | 43 +++++++++++++++++++----------------- 1 file changed, 23 insertions(+), 20 deletions(-) diff --git a/arch/x86/mm/amdtopology_64.c b/arch/x86/mm/amdtopology_64.c index 7d85cf7e0324..b6029a6b129f 100644 --- a/arch/x86/mm/amdtopology_64.c +++ b/arch/x86/mm/amdtopology_64.c @@ -74,8 +74,9 @@ int __init amd_numa_init(void) unsigned long end = PFN_PHYS(max_pfn); unsigned numnodes; unsigned long prevbase; - int i, nb; + int i, j, nb; u32 nodeid, reg; + unsigned int bits, cores, apicid_base; if (!early_pci_allowed()) return -EINVAL; @@ -176,6 +177,26 @@ int __init amd_numa_init(void) if (!nodes_weight(mem_nodes_parsed)) return -ENOENT; + + /* + * We seem to have valid NUMA configuration. Map apicids to nodes + * using the coreid bits from early_identify_cpu. 
+ */ + bits = boot_cpu_data.x86_coreid_bits; + cores = 1 << bits; + apicid_base = 0; + + /* get the APIC ID of the BSP early for systems with apicid lifting */ + early_get_boot_cpu_id(); + if (boot_cpu_physical_apicid > 0) { + pr_info("BSP APIC ID: %02x\n", boot_cpu_physical_apicid); + apicid_base = boot_cpu_physical_apicid; + } + + for_each_node_mask(i, cpu_nodes_parsed) + for (j = apicid_base; j < cores + apicid_base; j++) + set_apicid_to_node((i << bits) + j, i); + return 0; } @@ -251,9 +272,6 @@ void __init amd_fake_nodes(const struct bootnode *nodes, int nr_nodes) int __init amd_scan_nodes(void) { - unsigned int bits; - unsigned int cores; - unsigned int apicid_base; int i; memnode_shift = compute_hash_shift(nodes, 8, NULL); @@ -264,28 +282,13 @@ int __init amd_scan_nodes(void) pr_info("Using node hash shift of %d\n", memnode_shift); /* use the coreid bits from early_identify_cpu */ - bits = boot_cpu_data.x86_coreid_bits; - cores = (1< 0) { - pr_info("BSP APIC ID: %02x\n", boot_cpu_physical_apicid); - apicid_base = boot_cpu_physical_apicid; - } - for_each_node_mask(i, node_possible_map) memblock_x86_register_active_regions(i, nodes[i].start >> PAGE_SHIFT, nodes[i].end >> PAGE_SHIFT); init_memory_mapping_high(); - for_each_node_mask(i, node_possible_map) { - int j; - - for (j = apicid_base; j < cores + apicid_base; j++) - set_apicid_to_node((i << bits) + j, i); + for_each_node_mask(i, node_possible_map) setup_node_bootmem(i, nodes[i].start, nodes[i].end); - } numa_init_array(); return 0; From 206e42087a037fa3adca8908fd318a0cb64d4dee Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 16 Feb 2011 12:13:07 +0100 Subject: [PATCH 256/706] x86-64, NUMA: Use common numa_nodes[] ACPI and amd are using separate nodes[] array. Add numa_nodes[] and use them in all NUMA init methods. cutoff_node() cleanup is moved from srat_64.c to numa_64.c and applied in initmem_init() regardless of init methods. Signed-off-by: Tejun Heo Cc: Yinghai Lu Cc: Brian Gerst Cc: Cyrill Gorcunov Cc: Shaohui Zheng Cc: David Rientjes Cc: Ingo Molnar Cc: H. 
Peter Anvin --- arch/x86/include/asm/numa_64.h | 1 + arch/x86/mm/amdtopology_64.c | 19 +++++++-------- arch/x86/mm/numa_64.c | 24 +++++++++++++++++++ arch/x86/mm/srat_64.c | 43 +++++++++------------------------- 4 files changed, 45 insertions(+), 42 deletions(-) diff --git a/arch/x86/include/asm/numa_64.h b/arch/x86/include/asm/numa_64.h index de459365d52b..d3a45147d09b 100644 --- a/arch/x86/include/asm/numa_64.h +++ b/arch/x86/include/asm/numa_64.h @@ -29,6 +29,7 @@ extern void setup_node_bootmem(int nodeid, unsigned long start, extern nodemask_t cpu_nodes_parsed __initdata; extern nodemask_t mem_nodes_parsed __initdata; +extern struct bootnode numa_nodes[MAX_NUMNODES] __initdata; extern int __cpuinit numa_cpu_node(int cpu); diff --git a/arch/x86/mm/amdtopology_64.c b/arch/x86/mm/amdtopology_64.c index b6029a6b129f..f049fa67ed73 100644 --- a/arch/x86/mm/amdtopology_64.c +++ b/arch/x86/mm/amdtopology_64.c @@ -26,7 +26,6 @@ #include #include -static struct bootnode __initdata nodes[8]; static unsigned char __initdata nodeids[8]; static __init int find_northbridge(void) @@ -166,8 +165,8 @@ int __init amd_numa_init(void) pr_info("Node %d MemBase %016lx Limit %016lx\n", nodeid, base, limit); - nodes[nodeid].start = base; - nodes[nodeid].end = limit; + numa_nodes[nodeid].start = base; + numa_nodes[nodeid].end = limit; prevbase = base; @@ -210,8 +209,8 @@ void __init amd_get_nodes(struct bootnode *physnodes) int i; for_each_node_mask(i, mem_nodes_parsed) { - physnodes[i].start = nodes[i].start; - physnodes[i].end = nodes[i].end; + physnodes[i].start = numa_nodes[i].start; + physnodes[i].end = numa_nodes[i].end; } } @@ -221,7 +220,7 @@ static int __init find_node_by_addr(unsigned long addr) int i; for (i = 0; i < 8; i++) - if (addr >= nodes[i].start && addr < nodes[i].end) { + if (addr >= numa_nodes[i].start && addr < numa_nodes[i].end) { ret = i; break; } @@ -274,7 +273,7 @@ int __init amd_scan_nodes(void) { int i; - memnode_shift = compute_hash_shift(nodes, 8, NULL); + memnode_shift = compute_hash_shift(numa_nodes, 8, NULL); if (memnode_shift < 0) { pr_err("No NUMA node hash function found. 
Contact maintainer\n"); return -1; @@ -284,11 +283,11 @@ int __init amd_scan_nodes(void) /* use the coreid bits from early_identify_cpu */ for_each_node_mask(i, node_possible_map) memblock_x86_register_active_regions(i, - nodes[i].start >> PAGE_SHIFT, - nodes[i].end >> PAGE_SHIFT); + numa_nodes[i].start >> PAGE_SHIFT, + numa_nodes[i].end >> PAGE_SHIFT); init_memory_mapping_high(); for_each_node_mask(i, node_possible_map) - setup_node_bootmem(i, nodes[i].start, nodes[i].end); + setup_node_bootmem(i, numa_nodes[i].start, numa_nodes[i].end); numa_init_array(); return 0; diff --git a/arch/x86/mm/numa_64.c b/arch/x86/mm/numa_64.c index 4404e1d649ac..a6b899f7ddd2 100644 --- a/arch/x86/mm/numa_64.c +++ b/arch/x86/mm/numa_64.c @@ -33,6 +33,8 @@ struct memnode memnode; static unsigned long __initdata nodemap_addr; static unsigned long __initdata nodemap_size; +struct bootnode numa_nodes[MAX_NUMNODES] __initdata; + /* * Given a shift value, try to populate memnodemap[] * Returns : @@ -182,6 +184,22 @@ static void * __init early_node_mem(int nodeid, unsigned long start, return NULL; } +static __init void cutoff_node(int i, unsigned long start, unsigned long end) +{ + struct bootnode *nd = &numa_nodes[i]; + + if (nd->start < start) { + nd->start = start; + if (nd->end < nd->start) + nd->start = nd->end; + } + if (nd->end > end) { + nd->end = end; + if (nd->start > nd->end) + nd->start = nd->end; + } +} + /* Initialize bootmem allocator for a node */ void __init setup_node_bootmem(int nodeid, unsigned long start, unsigned long end) @@ -638,9 +656,15 @@ void __init initmem_init(void) nodes_clear(mem_nodes_parsed); nodes_clear(node_possible_map); nodes_clear(node_online_map); + memset(numa_nodes, 0, sizeof(numa_nodes)); if (numa_init[i]() < 0) continue; + + /* clean up the node list */ + for (j = 0; j < MAX_NUMNODES; j++) + cutoff_node(j, 0, max_pfn << PAGE_SHIFT); + #ifdef CONFIG_NUMA_EMU setup_physnodes(0, max_pfn << PAGE_SHIFT, i == 0, i == 1); if (cmdline && !numa_emulation(0, max_pfn, i == 0, i == 1)) diff --git a/arch/x86/mm/srat_64.c b/arch/x86/mm/srat_64.c index 33e72ec4fa4c..bfa4a6af5cfe 100644 --- a/arch/x86/mm/srat_64.c +++ b/arch/x86/mm/srat_64.c @@ -28,7 +28,6 @@ int acpi_numa __initdata; static struct acpi_table_slit *acpi_slit; -static struct bootnode nodes[MAX_NUMNODES] __initdata; static struct bootnode nodes_add[MAX_NUMNODES]; static int num_node_memblks __initdata; @@ -55,29 +54,13 @@ static __init int conflicting_memblks(unsigned long start, unsigned long end) return -1; } -static __init void cutoff_node(int i, unsigned long start, unsigned long end) -{ - struct bootnode *nd = &nodes[i]; - - if (nd->start < start) { - nd->start = start; - if (nd->end < nd->start) - nd->start = nd->end; - } - if (nd->end > end) { - nd->end = end; - if (nd->start > nd->end) - nd->start = nd->end; - } -} - static __init void bad_srat(void) { int i; printk(KERN_ERR "SRAT: SRAT not used.\n"); acpi_numa = -1; for (i = 0; i < MAX_NUMNODES; i++) { - nodes[i].start = nodes[i].end = 0; + numa_nodes[i].start = numa_nodes[i].end = 0; nodes_add[i].start = nodes_add[i].end = 0; } remove_all_active_ranges(); @@ -276,12 +259,12 @@ acpi_numa_memory_affinity_init(struct acpi_srat_mem_affinity *ma) if (i == node) { printk(KERN_WARNING "SRAT: Warning: PXM %d (%lx-%lx) overlaps with itself (%Lx-%Lx)\n", - pxm, start, end, nodes[i].start, nodes[i].end); + pxm, start, end, numa_nodes[i].start, numa_nodes[i].end); } else if (i >= 0) { printk(KERN_ERR "SRAT: PXM %d (%lx-%lx) overlaps with PXM %d (%Lx-%Lx)\n", pxm, start, end, 
node_to_pxm(i), - nodes[i].start, nodes[i].end); + numa_nodes[i].start, numa_nodes[i].end); bad_srat(); return; } @@ -290,7 +273,7 @@ acpi_numa_memory_affinity_init(struct acpi_srat_mem_affinity *ma) start, end); if (!(ma->flags & ACPI_SRAT_MEM_HOT_PLUGGABLE)) { - nd = &nodes[node]; + nd = &numa_nodes[node]; if (!node_test_and_set(node, mem_nodes_parsed)) { nd->start = start; nd->end = end; @@ -347,9 +330,8 @@ void __init acpi_get_nodes(struct bootnode *physnodes, unsigned long start, int i; for_each_node_mask(i, mem_nodes_parsed) { - cutoff_node(i, start, end); - physnodes[i].start = nodes[i].start; - physnodes[i].end = nodes[i].end; + physnodes[i].start = numa_nodes[i].start; + physnodes[i].end = numa_nodes[i].end; } } #endif /* CONFIG_NUMA_EMU */ @@ -372,10 +354,6 @@ int __init acpi_scan_nodes(void) if (acpi_numa <= 0) return -1; - /* First clean up the node list */ - for (i = 0; i < MAX_NUMNODES; i++) - cutoff_node(i, 0, max_pfn << PAGE_SHIFT); - /* * Join together blocks on the same node, holes between * which don't overlap with memory on other nodes. @@ -440,7 +418,7 @@ int __init acpi_scan_nodes(void) /* for out of order entries in SRAT */ sort_node_map(); - if (!nodes_cover_memory(nodes)) { + if (!nodes_cover_memory(numa_nodes)) { bad_srat(); return -1; } @@ -449,12 +427,13 @@ int __init acpi_scan_nodes(void) /* Finally register nodes */ for_each_node_mask(i, node_possible_map) - setup_node_bootmem(i, nodes[i].start, nodes[i].end); + setup_node_bootmem(i, numa_nodes[i].start, numa_nodes[i].end); /* Try again in case setup_node_bootmem missed one due to missing bootmem */ for_each_node_mask(i, node_possible_map) if (!node_online(i)) - setup_node_bootmem(i, nodes[i].start, nodes[i].end); + setup_node_bootmem(i, numa_nodes[i].start, + numa_nodes[i].end); for (i = 0; i < nr_cpu_ids; i++) { int node = early_cpu_to_node(i); @@ -486,7 +465,7 @@ static int __init find_node_by_addr(unsigned long addr) * the sake of simplicity, we only use a real node's starting * address to determine which emulated node it appears on. */ - if (addr >= nodes[i].start && addr < nodes[i].end) { + if (addr >= numa_nodes[i].start && addr < numa_nodes[i].end) { ret = i; break; } From 19095548704ecd0f32fd5deba01d56430ad7a344 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 16 Feb 2011 12:13:07 +0100 Subject: [PATCH 257/706] x86-64, NUMA: Kill {acpi|amd}_get_nodes() With common numa_nodes[], common code in numa_64.c can access it directly. Copy directly and kill {acpi|amd}_get_nodes(). Signed-off-by: Tejun Heo Cc: Yinghai Lu Cc: Brian Gerst Cc: Cyrill Gorcunov Cc: Shaohui Zheng Cc: David Rientjes Cc: Ingo Molnar Cc: H. 
Peter Anvin --- arch/x86/include/asm/acpi.h | 2 -- arch/x86/include/asm/amd_nb.h | 1 - arch/x86/mm/amdtopology_64.c | 10 ---------- arch/x86/mm/numa_64.c | 23 ++++++++++------------- arch/x86/mm/srat_64.c | 13 ------------- 5 files changed, 10 insertions(+), 39 deletions(-) diff --git a/arch/x86/include/asm/acpi.h b/arch/x86/include/asm/acpi.h index 06fb7865eff2..446a5b9a1d5f 100644 --- a/arch/x86/include/asm/acpi.h +++ b/arch/x86/include/asm/acpi.h @@ -185,8 +185,6 @@ struct bootnode; #ifdef CONFIG_ACPI_NUMA extern int acpi_numa; -extern void acpi_get_nodes(struct bootnode *physnodes, unsigned long start, - unsigned long end); extern int x86_acpi_numa_init(void); extern int acpi_scan_nodes(void); #define NR_NODE_MEMBLKS (MAX_NUMNODES*2) diff --git a/arch/x86/include/asm/amd_nb.h b/arch/x86/include/asm/amd_nb.h index dc3c6e34da1d..246cdc6ca9ba 100644 --- a/arch/x86/include/asm/amd_nb.h +++ b/arch/x86/include/asm/amd_nb.h @@ -23,7 +23,6 @@ extern int amd_set_subcaches(int, int); #ifdef CONFIG_NUMA_EMU extern void amd_fake_nodes(const struct bootnode *nodes, int nr_nodes); -extern void amd_get_nodes(struct bootnode *nodes); #endif struct amd_northbridge { diff --git a/arch/x86/mm/amdtopology_64.c b/arch/x86/mm/amdtopology_64.c index f049fa67ed73..cf29527885f8 100644 --- a/arch/x86/mm/amdtopology_64.c +++ b/arch/x86/mm/amdtopology_64.c @@ -204,16 +204,6 @@ static s16 fake_apicid_to_node[MAX_LOCAL_APIC] __initdata = { [0 ... MAX_LOCAL_APIC-1] = NUMA_NO_NODE }; -void __init amd_get_nodes(struct bootnode *physnodes) -{ - int i; - - for_each_node_mask(i, mem_nodes_parsed) { - physnodes[i].start = numa_nodes[i].start; - physnodes[i].end = numa_nodes[i].end; - } -} - static int __init find_node_by_addr(unsigned long addr) { int ret = NUMA_NO_NODE; diff --git a/arch/x86/mm/numa_64.c b/arch/x86/mm/numa_64.c index a6b899f7ddd2..82ee3083b094 100644 --- a/arch/x86/mm/numa_64.c +++ b/arch/x86/mm/numa_64.c @@ -257,21 +257,18 @@ void __init numa_emu_cmdline(char *str) cmdline = str; } -static int __init setup_physnodes(unsigned long start, unsigned long end, - int acpi, int amd) +static int __init setup_physnodes(unsigned long start, unsigned long end) { int ret = 0; int i; memset(physnodes, 0, sizeof(physnodes)); -#ifdef CONFIG_ACPI_NUMA - if (acpi) - acpi_get_nodes(physnodes, start, end); -#endif -#ifdef CONFIG_AMD_NUMA - if (amd) - amd_get_nodes(physnodes); -#endif + + for_each_node_mask(i, mem_nodes_parsed) { + physnodes[i].start = numa_nodes[i].start; + physnodes[i].end = numa_nodes[i].end; + } + /* * Basic sanity checking on the physical node map: there may be errors * if the SRAT or AMD code incorrectly reported the topology or the mem= @@ -594,7 +591,7 @@ static int __init numa_emulation(unsigned long start_pfn, init_memory_mapping_high(); for_each_node_mask(i, node_possible_map) setup_node_bootmem(i, nodes[i].start, nodes[i].end); - setup_physnodes(addr, max_addr, acpi, amd); + setup_physnodes(addr, max_addr); fake_physnodes(acpi, amd, num_nodes); numa_init_array(); return 0; @@ -666,10 +663,10 @@ void __init initmem_init(void) cutoff_node(j, 0, max_pfn << PAGE_SHIFT); #ifdef CONFIG_NUMA_EMU - setup_physnodes(0, max_pfn << PAGE_SHIFT, i == 0, i == 1); + setup_physnodes(0, max_pfn << PAGE_SHIFT); if (cmdline && !numa_emulation(0, max_pfn, i == 0, i == 1)) return; - setup_physnodes(0, max_pfn << PAGE_SHIFT, i == 0, i == 1); + setup_physnodes(0, max_pfn << PAGE_SHIFT); nodes_clear(node_possible_map); nodes_clear(node_online_map); #endif diff --git a/arch/x86/mm/srat_64.c b/arch/x86/mm/srat_64.c index 
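 * A condensed sketch of the reordering below, with names taken from
 * this patch: the ::exit callbacks now run after tsk->cgroups has
 * been switched to &init_css_set, and each subsystem is handed both
 * cgroups, mirroring ::attach:
 *
 *	old_cgrp = rcu_dereference_raw(cg->subsys[i])->cgroup;
 *	cgrp = task_cgroup(tsk, i);
 *	ss->exit(ss, cgrp, old_cgrp, tsk);
 *
 * shown here only to make the new calling convention explicit.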
bfa4a6af5cfe..82b1087963a2 100644 --- a/arch/x86/mm/srat_64.c +++ b/arch/x86/mm/srat_64.c @@ -323,19 +323,6 @@ static int __init nodes_cover_memory(const struct bootnode *nodes) void __init acpi_numa_arch_fixup(void) {} -#ifdef CONFIG_NUMA_EMU -void __init acpi_get_nodes(struct bootnode *physnodes, unsigned long start, - unsigned long end) -{ - int i; - - for_each_node_mask(i, mem_nodes_parsed) { - physnodes[i].start = numa_nodes[i].start; - physnodes[i].end = numa_nodes[i].end; - } -} -#endif /* CONFIG_NUMA_EMU */ - int __init x86_acpi_numa_init(void) { int ret; From d41d5a01631af821d3a3447e6613a316f5ee6c25 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 7 Feb 2011 17:02:20 +0100 Subject: [PATCH 258/706] cgroup: Fix cgroup_subsys::exit callback Make the ::exit method act like ::attach, it is after all very nearly the same thing. The bug had no effect on correctness - fixing it is an optimization for the scheduler. Also, later perf-cgroups patches rely on it. Signed-off-by: Peter Zijlstra Acked-by: Paul Menage LKML-Reference: <1297160655.13327.92.camel@laptop> Signed-off-by: Ingo Molnar --- include/linux/cgroup.h | 3 ++- kernel/cgroup.c | 31 ++++++++++++++++++------------- kernel/sched.c | 6 ++---- 3 files changed, 22 insertions(+), 18 deletions(-) diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index ce104e33cd22..38117d937332 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -474,7 +474,8 @@ struct cgroup_subsys { struct cgroup *old_cgrp, struct task_struct *tsk, bool threadgroup); void (*fork)(struct cgroup_subsys *ss, struct task_struct *task); - void (*exit)(struct cgroup_subsys *ss, struct task_struct *task); + void (*exit)(struct cgroup_subsys *ss, struct cgroup *cgrp, + struct cgroup *old_cgrp, struct task_struct *task); int (*populate)(struct cgroup_subsys *ss, struct cgroup *cgrp); void (*post_clone)(struct cgroup_subsys *ss, struct cgroup *cgrp); diff --git a/kernel/cgroup.c b/kernel/cgroup.c index b24d7027b83c..f6495f33a355 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -4230,20 +4230,8 @@ void cgroup_post_fork(struct task_struct *child) */ void cgroup_exit(struct task_struct *tsk, int run_callbacks) { - int i; struct css_set *cg; - - if (run_callbacks && need_forkexit_callback) { - /* - * modular subsystems can't use callbacks, so no need to lock - * the subsys array - */ - for (i = 0; i < CGROUP_BUILTIN_SUBSYS_COUNT; i++) { - struct cgroup_subsys *ss = subsys[i]; - if (ss->exit) - ss->exit(ss, tsk); - } - } + int i; /* * Unlink from the css_set task list if necessary. 
@@ -4261,7 +4249,24 @@ void cgroup_exit(struct task_struct *tsk, int run_callbacks) task_lock(tsk); cg = tsk->cgroups; tsk->cgroups = &init_css_set; + + if (run_callbacks && need_forkexit_callback) { + /* + * modular subsystems can't use callbacks, so no need to lock + * the subsys array + */ + for (i = 0; i < CGROUP_BUILTIN_SUBSYS_COUNT; i++) { + struct cgroup_subsys *ss = subsys[i]; + if (ss->exit) { + struct cgroup *old_cgrp = + rcu_dereference_raw(cg->subsys[i])->cgroup; + struct cgroup *cgrp = task_cgroup(tsk, i); + ss->exit(ss, cgrp, old_cgrp, tsk); + } + } + } task_unlock(tsk); + if (cg) put_css_set_taskexit(cg); } diff --git a/kernel/sched.c b/kernel/sched.c index e142e92f38da..79e611cd83dd 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -606,9 +606,6 @@ static inline struct task_group *task_group(struct task_struct *p) struct task_group *tg; struct cgroup_subsys_state *css; - if (p->flags & PF_EXITING) - return &root_task_group; - css = task_subsys_state_check(p, cpu_cgroup_subsys_id, lockdep_is_held(&task_rq(p)->lock)); tg = container_of(css, struct task_group, css); @@ -8863,7 +8860,8 @@ cpu_cgroup_attach(struct cgroup_subsys *ss, struct cgroup *cgrp, } static void -cpu_cgroup_exit(struct cgroup_subsys *ss, struct task_struct *task) +cpu_cgroup_exit(struct cgroup_subsys *ss, struct cgroup *cgrp, + struct cgroup *old_cgrp, struct task_struct *task) { /* * cgroup_exit() is called in the copy_process() failure path. From e5d1367f17ba6a6fed5fd8b74e4d5720923e0c25 Mon Sep 17 00:00:00 2001 From: Stephane Eranian Date: Mon, 14 Feb 2011 11:20:01 +0200 Subject: [PATCH 259/706] perf: Add cgroup support This kernel patch adds the ability to filter monitoring based on container groups (cgroups). This is for use in per-cpu mode only. The cgroup to monitor is passed as a file descriptor in the pid argument to the syscall. The file descriptor must be opened to the cgroup name in the cgroup filesystem. For instance, if the cgroup name is foo and cgroupfs is mounted in /cgroup, then the file descriptor is opened to /cgroup/foo. Cgroup mode is activated by passing PERF_FLAG_PID_CGROUP in the flags argument to the syscall. 
For instance to measure in cgroup foo on CPU1 assuming cgroupfs is mounted under /cgroup: struct perf_event_attr attr; int cgroup_fd, fd; cgroup_fd = open("/cgroup/foo", O_RDONLY); fd = perf_event_open(&attr, cgroup_fd, 1, -1, PERF_FLAG_PID_CGROUP); close(cgroup_fd); Signed-off-by: Stephane Eranian [ added perf_cgroup_{exit,attach} ] Signed-off-by: Peter Zijlstra LKML-Reference: <4d590250.114ddf0a.689e.4482@mx.google.com> Signed-off-by: Ingo Molnar --- include/linux/cgroup.h | 1 + include/linux/cgroup_subsys.h | 4 + include/linux/perf_event.h | 33 +- init/Kconfig | 10 + kernel/cgroup.c | 23 ++ kernel/perf_event.c | 638 ++++++++++++++++++++++++++++++++-- 6 files changed, 671 insertions(+), 38 deletions(-) diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index 38117d937332..e654fa239916 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -627,6 +627,7 @@ bool css_is_ancestor(struct cgroup_subsys_state *cg, /* Get id and depth of css */ unsigned short css_id(struct cgroup_subsys_state *css); unsigned short css_depth(struct cgroup_subsys_state *css); +struct cgroup_subsys_state *cgroup_css_from_dir(struct file *f, int id); #else /* !CONFIG_CGROUPS */ diff --git a/include/linux/cgroup_subsys.h b/include/linux/cgroup_subsys.h index ccefff02b6cb..cdbfcb8780ec 100644 --- a/include/linux/cgroup_subsys.h +++ b/include/linux/cgroup_subsys.h @@ -65,4 +65,8 @@ SUBSYS(net_cls) SUBSYS(blkio) #endif +#ifdef CONFIG_CGROUP_PERF +SUBSYS(perf) +#endif + /* */ diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index dda5b0a3ff60..38c8b2554842 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -464,6 +464,7 @@ enum perf_callchain_context { #define PERF_FLAG_FD_NO_GROUP (1U << 0) #define PERF_FLAG_FD_OUTPUT (1U << 1) +#define PERF_FLAG_PID_CGROUP (1U << 2) /* pid=cgroup id, per-cpu mode only */ #ifdef __KERNEL__ /* @@ -471,6 +472,7 @@ enum perf_callchain_context { */ #ifdef CONFIG_PERF_EVENTS +# include # include # include #endif @@ -716,6 +718,22 @@ struct swevent_hlist { #define PERF_ATTACH_GROUP 0x02 #define PERF_ATTACH_TASK 0x04 +#ifdef CONFIG_CGROUP_PERF +/* + * perf_cgroup_info keeps track of time_enabled for a cgroup. + * This is a per-cpu dynamically allocated data structure. 
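+ * An illustrative reading of the two fields declared just below:
+ * "time" accumulates enabled time and "timestamp" records when it
+ * was last folded in, so an update step is roughly
+ *
+ *	now = perf_clock();
+ *	info->time += now - info->timestamp;
+ *	info->timestamp = now;
+ *
+ * which is what __update_cgrp_time() does later in this patch.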
+ */ +struct perf_cgroup_info { + u64 time; + u64 timestamp; +}; + +struct perf_cgroup { + struct cgroup_subsys_state css; + struct perf_cgroup_info *info; /* timing info, one per cpu */ +}; +#endif + /** * struct perf_event - performance event kernel representation: */ @@ -832,6 +850,11 @@ struct perf_event { struct event_filter *filter; #endif +#ifdef CONFIG_CGROUP_PERF + struct perf_cgroup *cgrp; /* cgroup event is attach to */ + int cgrp_defer_enabled; +#endif + #endif /* CONFIG_PERF_EVENTS */ }; @@ -886,6 +909,7 @@ struct perf_event_context { u64 generation; int pin_count; struct rcu_head rcu_head; + int nr_cgroups; /* cgroup events present */ }; /* @@ -905,6 +929,9 @@ struct perf_cpu_context { struct list_head rotation_list; int jiffies_interval; struct pmu *active_pmu; +#ifdef CONFIG_CGROUP_PERF + struct perf_cgroup *cgrp; +#endif }; struct perf_output_handle { @@ -1040,11 +1067,11 @@ have_event: __perf_sw_event(event_id, nr, nmi, regs, addr); } -extern atomic_t perf_task_events; +extern atomic_t perf_sched_events; static inline void perf_event_task_sched_in(struct task_struct *task) { - COND_STMT(&perf_task_events, __perf_event_task_sched_in(task)); + COND_STMT(&perf_sched_events, __perf_event_task_sched_in(task)); } static inline @@ -1052,7 +1079,7 @@ void perf_event_task_sched_out(struct task_struct *task, struct task_struct *nex { perf_sw_event(PERF_COUNT_SW_CONTEXT_SWITCHES, 1, 1, NULL, 0); - COND_STMT(&perf_task_events, __perf_event_task_sched_out(task, next)); + COND_STMT(&perf_sched_events, __perf_event_task_sched_out(task, next)); } extern void perf_event_mmap(struct vm_area_struct *vma); diff --git a/init/Kconfig b/init/Kconfig index be788c0957d4..20d6bd919b8d 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -683,6 +683,16 @@ config CGROUP_MEM_RES_CTLR_SWAP_ENABLED select this option (if, for some reason, they need to disable it then noswapaccount does the trick). +config CGROUP_PERF + bool "Enable perf_event per-cpu per-container group (cgroup) monitoring" + depends on PERF_EVENTS && CGROUPS + help + This option extends the per-cpu mode to restrict monitoring to + threads which belong to the cgroup specificied and run on the + designated cpu. + + Say N if unsure. + menuconfig CGROUP_SCHED bool "Group CPU scheduler" depends on EXPERIMENTAL diff --git a/kernel/cgroup.c b/kernel/cgroup.c index f6495f33a355..95362d15128c 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -4818,6 +4818,29 @@ css_get_next(struct cgroup_subsys *ss, int id, return ret; } +/* + * get corresponding css from file open on cgroupfs directory + */ +struct cgroup_subsys_state *cgroup_css_from_dir(struct file *f, int id) +{ + struct cgroup *cgrp; + struct inode *inode; + struct cgroup_subsys_state *css; + + inode = f->f_dentry->d_inode; + /* check in cgroup filesystem dir */ + if (inode->i_op != &cgroup_dir_inode_operations) + return ERR_PTR(-EBADF); + + if (id < 0 || id >= CGROUP_SUBSYS_COUNT) + return ERR_PTR(-EINVAL); + + /* get cgroup */ + cgrp = __d_cgrp(f->f_dentry); + css = cgrp->subsys[id]; + return css ? 
css : ERR_PTR(-ENOENT); +} + #ifdef CONFIG_CGROUP_DEBUG static struct cgroup_subsys_state *debug_create(struct cgroup_subsys *ss, struct cgroup *cont) diff --git a/kernel/perf_event.c b/kernel/perf_event.c index 3d3f282fa50e..65dcdc76d709 100644 --- a/kernel/perf_event.c +++ b/kernel/perf_event.c @@ -111,13 +111,23 @@ static int cpu_function_call(int cpu, int (*func) (void *info), void *info) return data.ret; } +#define PERF_FLAG_ALL (PERF_FLAG_FD_NO_GROUP |\ + PERF_FLAG_FD_OUTPUT |\ + PERF_FLAG_PID_CGROUP) + enum event_type_t { EVENT_FLEXIBLE = 0x1, EVENT_PINNED = 0x2, EVENT_ALL = EVENT_FLEXIBLE | EVENT_PINNED, }; -atomic_t perf_task_events __read_mostly; +/* + * perf_sched_events : >0 events exist + * perf_cgroup_events: >0 per-cpu cgroup events exist on this cpu + */ +atomic_t perf_sched_events __read_mostly; +static DEFINE_PER_CPU(atomic_t, perf_cgroup_events); + static atomic_t nr_mmap_events __read_mostly; static atomic_t nr_comm_events __read_mostly; static atomic_t nr_task_events __read_mostly; @@ -148,7 +158,11 @@ static void cpu_ctx_sched_out(struct perf_cpu_context *cpuctx, enum event_type_t event_type); static void cpu_ctx_sched_in(struct perf_cpu_context *cpuctx, - enum event_type_t event_type); + enum event_type_t event_type, + struct task_struct *task); + +static void update_context_time(struct perf_event_context *ctx); +static u64 perf_event_time(struct perf_event *event); void __weak perf_event_print_debug(void) { } @@ -162,6 +176,338 @@ static inline u64 perf_clock(void) return local_clock(); } +static inline struct perf_cpu_context * +__get_cpu_context(struct perf_event_context *ctx) +{ + return this_cpu_ptr(ctx->pmu->pmu_cpu_context); +} + +#ifdef CONFIG_CGROUP_PERF + +static inline struct perf_cgroup * +perf_cgroup_from_task(struct task_struct *task) +{ + return container_of(task_subsys_state(task, perf_subsys_id), + struct perf_cgroup, css); +} + +static inline bool +perf_cgroup_match(struct perf_event *event) +{ + struct perf_event_context *ctx = event->ctx; + struct perf_cpu_context *cpuctx = __get_cpu_context(ctx); + + return !event->cgrp || event->cgrp == cpuctx->cgrp; +} + +static inline void perf_get_cgroup(struct perf_event *event) +{ + css_get(&event->cgrp->css); +} + +static inline void perf_put_cgroup(struct perf_event *event) +{ + css_put(&event->cgrp->css); +} + +static inline void perf_detach_cgroup(struct perf_event *event) +{ + perf_put_cgroup(event); + event->cgrp = NULL; +} + +static inline int is_cgroup_event(struct perf_event *event) +{ + return event->cgrp != NULL; +} + +static inline u64 perf_cgroup_event_time(struct perf_event *event) +{ + struct perf_cgroup_info *t; + + t = per_cpu_ptr(event->cgrp->info, event->cpu); + return t->time; +} + +static inline void __update_cgrp_time(struct perf_cgroup *cgrp) +{ + struct perf_cgroup_info *info; + u64 now; + + now = perf_clock(); + + info = this_cpu_ptr(cgrp->info); + + info->time += now - info->timestamp; + info->timestamp = now; +} + +static inline void update_cgrp_time_from_cpuctx(struct perf_cpu_context *cpuctx) +{ + struct perf_cgroup *cgrp_out = cpuctx->cgrp; + if (cgrp_out) + __update_cgrp_time(cgrp_out); +} + +static inline void update_cgrp_time_from_event(struct perf_event *event) +{ + struct perf_cgroup *cgrp = perf_cgroup_from_task(current); + /* + * do not update time when cgroup is not active + */ + if (!event->cgrp || cgrp != event->cgrp) + return; + + __update_cgrp_time(event->cgrp); +} + +static inline void +perf_cgroup_set_timestamp(struct task_struct *task, u64 now) +{ + struct 
perf_cgroup *cgrp; + struct perf_cgroup_info *info; + + if (!task) + return; + + cgrp = perf_cgroup_from_task(task); + info = this_cpu_ptr(cgrp->info); + info->timestamp = now; +} + +#define PERF_CGROUP_SWOUT 0x1 /* cgroup switch out every event */ +#define PERF_CGROUP_SWIN 0x2 /* cgroup switch in events based on task */ + +/* + * reschedule events based on the cgroup constraint of task. + * + * mode SWOUT : schedule out everything + * mode SWIN : schedule in based on cgroup for next + */ +void perf_cgroup_switch(struct task_struct *task, int mode) +{ + struct perf_cpu_context *cpuctx; + struct pmu *pmu; + unsigned long flags; + + /* + * disable interrupts to avoid getting nr_cgroups + * changes via __perf_event_disable(). Also + * avoids preemption. + */ + local_irq_save(flags); + + /* + * we reschedule only in the presence of cgroup + * constrained events. + */ + rcu_read_lock(); + + list_for_each_entry_rcu(pmu, &pmus, entry) { + + cpuctx = this_cpu_ptr(pmu->pmu_cpu_context); + + perf_pmu_disable(cpuctx->ctx.pmu); + + /* + * perf_cgroup_events says at least one + * context on this CPU has cgroup events. + * + * ctx->nr_cgroups reports the number of cgroup + * events for a context. + */ + if (cpuctx->ctx.nr_cgroups > 0) { + + if (mode & PERF_CGROUP_SWOUT) { + cpu_ctx_sched_out(cpuctx, EVENT_ALL); + /* + * must not be done before ctxswout due + * to event_filter_match() in event_sched_out() + */ + cpuctx->cgrp = NULL; + } + + if (mode & PERF_CGROUP_SWIN) { + /* set cgrp before ctxsw in to + * allow event_filter_match() to not + * have to pass task around + */ + cpuctx->cgrp = perf_cgroup_from_task(task); + cpu_ctx_sched_in(cpuctx, EVENT_ALL, task); + } + } + + perf_pmu_enable(cpuctx->ctx.pmu); + } + + rcu_read_unlock(); + + local_irq_restore(flags); +} + +static inline void perf_cgroup_sched_out(struct task_struct *task) +{ + perf_cgroup_switch(task, PERF_CGROUP_SWOUT); +} + +static inline void perf_cgroup_sched_in(struct task_struct *task) +{ + perf_cgroup_switch(task, PERF_CGROUP_SWIN); +} + +static inline int perf_cgroup_connect(int fd, struct perf_event *event, + struct perf_event_attr *attr, + struct perf_event *group_leader) +{ + struct perf_cgroup *cgrp; + struct cgroup_subsys_state *css; + struct file *file; + int ret = 0, fput_needed; + + file = fget_light(fd, &fput_needed); + if (!file) + return -EBADF; + + css = cgroup_css_from_dir(file, perf_subsys_id); + if (IS_ERR(css)) + return PTR_ERR(css); + + cgrp = container_of(css, struct perf_cgroup, css); + event->cgrp = cgrp; + + /* + * all events in a group must monitor + * the same cgroup because a task belongs + * to only one perf cgroup at a time + */ + if (group_leader && group_leader->cgrp != cgrp) { + perf_detach_cgroup(event); + ret = -EINVAL; + } else { + /* must be done before we fput() the file */ + perf_get_cgroup(event); + } + fput_light(file, fput_needed); + return ret; +} + +static inline void +perf_cgroup_set_shadow_time(struct perf_event *event, u64 now) +{ + struct perf_cgroup_info *t; + t = per_cpu_ptr(event->cgrp->info, event->cpu); + event->shadow_ctx_time = now - t->timestamp; +} + +static inline void +perf_cgroup_defer_enabled(struct perf_event *event) +{ + /* + * when the current task's perf cgroup does not match + * the event's, we need to remember to call the + * perf_mark_enable() function the first time a task with + * a matching perf cgroup is scheduled in.
+ */ + if (is_cgroup_event(event) && !perf_cgroup_match(event)) + event->cgrp_defer_enabled = 1; +} + +static inline void +perf_cgroup_mark_enabled(struct perf_event *event, + struct perf_event_context *ctx) +{ + struct perf_event *sub; + u64 tstamp = perf_event_time(event); + + if (!event->cgrp_defer_enabled) + return; + + event->cgrp_defer_enabled = 0; + + event->tstamp_enabled = tstamp - event->total_time_enabled; + list_for_each_entry(sub, &event->sibling_list, group_entry) { + if (sub->state >= PERF_EVENT_STATE_INACTIVE) { + sub->tstamp_enabled = tstamp - sub->total_time_enabled; + sub->cgrp_defer_enabled = 0; + } + } +} +#else /* !CONFIG_CGROUP_PERF */ + +static inline bool +perf_cgroup_match(struct perf_event *event) +{ + return true; +} + +static inline void perf_detach_cgroup(struct perf_event *event) +{} + +static inline int is_cgroup_event(struct perf_event *event) +{ + return 0; +} + +static inline u64 perf_cgroup_event_cgrp_time(struct perf_event *event) +{ + return 0; +} + +static inline void update_cgrp_time_from_event(struct perf_event *event) +{ +} + +static inline void update_cgrp_time_from_cpuctx(struct perf_cpu_context *cpuctx) +{ +} + +static inline void perf_cgroup_sched_out(struct task_struct *task) +{ +} + +static inline void perf_cgroup_sched_in(struct task_struct *task) +{ +} + +static inline int perf_cgroup_connect(pid_t pid, struct perf_event *event, + struct perf_event_attr *attr, + struct perf_event *group_leader) +{ + return -EINVAL; +} + +static inline void +perf_cgroup_set_timestamp(struct task_struct *task, u64 now) +{ +} + +void +perf_cgroup_switch(struct task_struct *task, struct task_struct *next) +{ +} + +static inline void +perf_cgroup_set_shadow_time(struct perf_event *event, u64 now) +{ +} + +static inline u64 perf_cgroup_event_time(struct perf_event *event) +{ + return 0; +} + +static inline void +perf_cgroup_defer_enabled(struct perf_event *event) +{ +} + +static inline void +perf_cgroup_mark_enabled(struct perf_event *event, + struct perf_event_context *ctx) +{ +} +#endif + void perf_pmu_disable(struct pmu *pmu) { int *count = this_cpu_ptr(pmu->pmu_disable_count); @@ -343,6 +689,10 @@ static void update_context_time(struct perf_event_context *ctx) static u64 perf_event_time(struct perf_event *event) { struct perf_event_context *ctx = event->ctx; + + if (is_cgroup_event(event)) + return perf_cgroup_event_time(event); + return ctx ? ctx->time : 0; } @@ -357,9 +707,20 @@ static void update_event_times(struct perf_event *event) if (event->state < PERF_EVENT_STATE_INACTIVE || event->group_leader->state < PERF_EVENT_STATE_INACTIVE) return; - - if (ctx->is_active) + /* + * in cgroup mode, time_enabled represents + * the time the event was enabled AND active + * tasks were in the monitored cgroup. This is + * independent of the activity of the context as + * there may be a mix of cgroup and non-cgroup events. + * + * That is why we treat cgroup events differently + * here. 
+ */ + if (is_cgroup_event(event)) run_end = perf_event_time(event); + else if (ctx->is_active) + run_end = ctx->time; else run_end = event->tstamp_stopped; @@ -371,6 +732,7 @@ static void update_event_times(struct perf_event *event) run_end = perf_event_time(event); event->total_time_running = run_end - event->tstamp_running; + } /* @@ -419,6 +781,17 @@ list_add_event(struct perf_event *event, struct perf_event_context *ctx) list_add_tail(&event->group_entry, list); } + if (is_cgroup_event(event)) { + ctx->nr_cgroups++; + /* + * one more event: + * - that has cgroup constraint on event->cpu + * - that may need work on context switch + */ + atomic_inc(&per_cpu(perf_cgroup_events, event->cpu)); + jump_label_inc(&perf_sched_events); + } + list_add_rcu(&event->event_entry, &ctx->event_list); if (!ctx->nr_events) perf_pmu_rotate_start(ctx->pmu); @@ -545,6 +918,12 @@ list_del_event(struct perf_event *event, struct perf_event_context *ctx) event->attach_state &= ~PERF_ATTACH_CONTEXT; + if (is_cgroup_event(event)) { + ctx->nr_cgroups--; + atomic_dec(&per_cpu(perf_cgroup_events, event->cpu)); + jump_label_dec(&perf_sched_events); + } + ctx->nr_events--; if (event->attr.inherit_stat) ctx->nr_stat--; @@ -616,7 +995,8 @@ out: static inline int event_filter_match(struct perf_event *event) { - return event->cpu == -1 || event->cpu == smp_processor_id(); + return (event->cpu == -1 || event->cpu == smp_processor_id()) + && perf_cgroup_match(event); } static void @@ -634,7 +1014,7 @@ event_sched_out(struct perf_event *event, */ if (event->state == PERF_EVENT_STATE_INACTIVE && !event_filter_match(event)) { - delta = ctx->time - event->tstamp_stopped; + delta = tstamp - event->tstamp_stopped; event->tstamp_running += delta; event->tstamp_stopped = tstamp; } @@ -678,12 +1058,6 @@ group_sched_out(struct perf_event *group_event, cpuctx->exclusive = 0; } -static inline struct perf_cpu_context * -__get_cpu_context(struct perf_event_context *ctx) -{ - return this_cpu_ptr(ctx->pmu->pmu_cpu_context); -} - /* * Cross CPU call to remove a performance event * @@ -783,6 +1157,7 @@ static int __perf_event_disable(void *info) */ if (event->state >= PERF_EVENT_STATE_INACTIVE) { update_context_time(ctx); + update_cgrp_time_from_event(event); update_group_times(event); if (event == event->group_leader) group_sched_out(event, cpuctx, ctx); @@ -851,6 +1226,41 @@ retry: raw_spin_unlock_irq(&ctx->lock); } +static void perf_set_shadow_time(struct perf_event *event, + struct perf_event_context *ctx, + u64 tstamp) +{ + /* + * use the correct time source for the time snapshot + * + * We could get by without this by leveraging the + * fact that to get to this function, the caller + * has most likely already called update_context_time() + * and update_cgrp_time_xx() and thus both timestamps + * are identical (or very close). Given that tstamp is + * already adjusted for cgroup, we could say that: + * tstamp - ctx->timestamp + * is equivalent to + * tstamp - cgrp->timestamp. + * + * Then, in perf_output_read(), the calculation would + * work with no changes because: + * - event is guaranteed scheduled in + * - no scheduled out in between + * - thus the timestamp would be the same + * + * But this is a bit hairy. + * + * So instead, we have an explicit cgroup call to remain + * within the same time source all along. We believe it + * is cleaner and simpler to understand.
+ */ + if (is_cgroup_event(event)) + perf_cgroup_set_shadow_time(event, tstamp); + else + event->shadow_ctx_time = tstamp - ctx->timestamp; +} + #define MAX_INTERRUPTS (~0ULL) static void perf_log_throttle(struct perf_event *event, int enable); @@ -891,7 +1301,7 @@ event_sched_in(struct perf_event *event, event->tstamp_running += tstamp - event->tstamp_stopped; - event->shadow_ctx_time = tstamp - ctx->timestamp; + perf_set_shadow_time(event, ctx, tstamp); if (!is_software_event(event)) cpuctx->active_oncpu++; @@ -1012,7 +1422,8 @@ static void add_event_to_ctx(struct perf_event *event, event->tstamp_stopped = tstamp; } -static void perf_event_context_sched_in(struct perf_event_context *ctx); +static void perf_event_context_sched_in(struct perf_event_context *ctx, + struct task_struct *tsk); /* * Cross CPU call to install and enable a performance event @@ -1033,11 +1444,17 @@ static int __perf_install_in_context(void *info) * which do context switches with IRQs enabled. */ if (ctx->task && !cpuctx->task_ctx) - perf_event_context_sched_in(ctx); + perf_event_context_sched_in(ctx, ctx->task); raw_spin_lock(&ctx->lock); ctx->is_active = 1; update_context_time(ctx); + /* + * update cgrp time only if current cgrp + * matches event->cgrp. Must be done before + * calling add_event_to_ctx() + */ + update_cgrp_time_from_event(event); add_event_to_ctx(event, ctx); @@ -1175,10 +1592,19 @@ static int __perf_event_enable(void *info) if (event->state >= PERF_EVENT_STATE_INACTIVE) goto unlock; + + /* + * set current task's cgroup time reference point + */ + perf_cgroup_set_timestamp(current, perf_clock()); + __perf_event_mark_enabled(event, ctx); - if (!event_filter_match(event)) + if (!event_filter_match(event)) { + if (is_cgroup_event(event)) + perf_cgroup_defer_enabled(event); goto unlock; + } /* * If the event is in a group and isn't the group leader, @@ -1307,6 +1733,7 @@ static void ctx_sched_out(struct perf_event_context *ctx, if (likely(!ctx->nr_events)) goto out; update_context_time(ctx); + update_cgrp_time_from_cpuctx(cpuctx); if (!ctx->nr_active) goto out; @@ -1496,6 +1923,14 @@ void __perf_event_task_sched_out(struct task_struct *task, for_each_task_context_nr(ctxn) perf_event_context_sched_out(task, ctxn, next); + + /* + * if cgroup events exist on this CPU, then we need + * to check if we have to switch out PMU state. 
+ * cgroup events are system-wide mode only + */ + if (atomic_read(&__get_cpu_var(perf_cgroup_events))) + perf_cgroup_sched_out(task); } static void task_ctx_sched_out(struct perf_event_context *ctx, @@ -1534,6 +1969,10 @@ ctx_pinned_sched_in(struct perf_event_context *ctx, if (!event_filter_match(event)) continue; + /* may need to reset tstamp_enabled */ + if (is_cgroup_event(event)) + perf_cgroup_mark_enabled(event, ctx); + if (group_can_go_on(event, cpuctx, 1)) group_sched_in(event, cpuctx, ctx); @@ -1566,6 +2005,10 @@ ctx_flexible_sched_in(struct perf_event_context *ctx, if (!event_filter_match(event)) continue; + /* may need to reset tstamp_enabled */ + if (is_cgroup_event(event)) + perf_cgroup_mark_enabled(event, ctx); + if (group_can_go_on(event, cpuctx, can_add_hw)) { if (group_sched_in(event, cpuctx, ctx)) can_add_hw = 0; @@ -1576,15 +2019,19 @@ ctx_flexible_sched_in(struct perf_event_context *ctx, static void ctx_sched_in(struct perf_event_context *ctx, struct perf_cpu_context *cpuctx, - enum event_type_t event_type) + enum event_type_t event_type, + struct task_struct *task) { + u64 now; + raw_spin_lock(&ctx->lock); ctx->is_active = 1; if (likely(!ctx->nr_events)) goto out; - ctx->timestamp = perf_clock(); - + now = perf_clock(); + ctx->timestamp = now; + perf_cgroup_set_timestamp(task, now); /* * First go through the list and put on any pinned groups * in order to give them the best chance of going on. @@ -1601,11 +2048,12 @@ out: } static void cpu_ctx_sched_in(struct perf_cpu_context *cpuctx, - enum event_type_t event_type) + enum event_type_t event_type, + struct task_struct *task) { struct perf_event_context *ctx = &cpuctx->ctx; - ctx_sched_in(ctx, cpuctx, event_type); + ctx_sched_in(ctx, cpuctx, event_type, task); } static void task_ctx_sched_in(struct perf_event_context *ctx, @@ -1617,11 +2065,12 @@ static void task_ctx_sched_in(struct perf_event_context *ctx, if (cpuctx->task_ctx == ctx) return; - ctx_sched_in(ctx, cpuctx, event_type); + ctx_sched_in(ctx, cpuctx, event_type, NULL); cpuctx->task_ctx = ctx; } -static void perf_event_context_sched_in(struct perf_event_context *ctx) +static void perf_event_context_sched_in(struct perf_event_context *ctx, + struct task_struct *task) { struct perf_cpu_context *cpuctx; @@ -1637,9 +2086,9 @@ static void perf_event_context_sched_in(struct perf_event_context *ctx) */ cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE); - ctx_sched_in(ctx, cpuctx, EVENT_PINNED); - cpu_ctx_sched_in(cpuctx, EVENT_FLEXIBLE); - ctx_sched_in(ctx, cpuctx, EVENT_FLEXIBLE); + ctx_sched_in(ctx, cpuctx, EVENT_PINNED, task); + cpu_ctx_sched_in(cpuctx, EVENT_FLEXIBLE, task); + ctx_sched_in(ctx, cpuctx, EVENT_FLEXIBLE, task); cpuctx->task_ctx = ctx; @@ -1672,8 +2121,15 @@ void __perf_event_task_sched_in(struct task_struct *task) if (likely(!ctx)) continue; - perf_event_context_sched_in(ctx); + perf_event_context_sched_in(ctx, task); } + /* + * if cgroup events exist on this CPU, then we need + * to check if we have to switch in PMU state.
+ * cgroup events are system-wide mode only + */ + if (atomic_read(&__get_cpu_var(perf_cgroup_events))) + perf_cgroup_sched_in(task); } static u64 perf_calculate_period(struct perf_event *event, u64 nsec, u64 count) @@ -1873,7 +2329,7 @@ static void perf_rotate_context(struct perf_cpu_context *cpuctx) if (ctx) rotate_ctx(ctx); - cpu_ctx_sched_in(cpuctx, EVENT_FLEXIBLE); + cpu_ctx_sched_in(cpuctx, EVENT_FLEXIBLE, current); if (ctx) task_ctx_sched_in(ctx, EVENT_FLEXIBLE); @@ -1952,7 +2408,7 @@ static void perf_event_enable_on_exec(struct perf_event_context *ctx) raw_spin_unlock(&ctx->lock); - perf_event_context_sched_in(ctx); + perf_event_context_sched_in(ctx, ctx->task); out: local_irq_restore(flags); } @@ -1977,8 +2433,10 @@ static void __perf_event_read(void *info) return; raw_spin_lock(&ctx->lock); - if (ctx->is_active) + if (ctx->is_active) { update_context_time(ctx); + update_cgrp_time_from_event(event); + } update_event_times(event); if (event->state == PERF_EVENT_STATE_ACTIVE) event->pmu->read(event); @@ -2009,8 +2467,10 @@ static u64 perf_event_read(struct perf_event *event) * (e.g., thread is blocked), in that case * we cannot update context time */ - if (ctx->is_active) + if (ctx->is_active) { update_context_time(ctx); + update_cgrp_time_from_event(event); + } update_event_times(event); raw_spin_unlock_irqrestore(&ctx->lock, flags); } @@ -2395,7 +2855,7 @@ static void free_event(struct perf_event *event) if (!event->parent) { if (event->attach_state & PERF_ATTACH_TASK) - jump_label_dec(&perf_task_events); + jump_label_dec(&perf_sched_events); if (event->attr.mmap || event->attr.mmap_data) atomic_dec(&nr_mmap_events); if (event->attr.comm) @@ -2411,6 +2871,9 @@ static void free_event(struct perf_event *event) event->buffer = NULL; } + if (is_cgroup_event(event)) + perf_detach_cgroup(event); + if (event->destroy) event->destroy(event); @@ -5300,6 +5763,7 @@ static void task_clock_event_read(struct perf_event *event) if (!in_nmi()) { update_context_time(event->ctx); + update_cgrp_time_from_event(event); time = event->ctx->time; } else { u64 now = perf_clock(); @@ -5725,7 +6189,7 @@ done: if (!event->parent) { if (event->attach_state & PERF_ATTACH_TASK) - jump_label_inc(&perf_task_events); + jump_label_inc(&perf_sched_events); if (event->attr.mmap || event->attr.mmap_data) atomic_inc(&nr_mmap_events); if (event->attr.comm) @@ -5900,7 +6364,7 @@ SYSCALL_DEFINE5(perf_event_open, int err; /* for future expandability... */ - if (flags & ~(PERF_FLAG_FD_NO_GROUP | PERF_FLAG_FD_OUTPUT)) + if (flags & ~PERF_FLAG_ALL) return -EINVAL; err = perf_copy_attr(attr_uptr, &attr); @@ -5917,6 +6381,15 @@ SYSCALL_DEFINE5(perf_event_open, return -EINVAL; } + /* + * In cgroup mode, the pid argument is used to pass the fd + * opened to the cgroup directory in cgroupfs. The cpu argument + * designates the cpu on which to monitor threads from that + * cgroup.
+ */ + if ((flags & PERF_FLAG_PID_CGROUP) && (pid == -1 || cpu == -1)) + return -EINVAL; + event_fd = get_unused_fd_flags(O_RDWR); if (event_fd < 0) return event_fd; @@ -5934,7 +6407,7 @@ SYSCALL_DEFINE5(perf_event_open, group_leader = NULL; } - if (pid != -1) { + if (pid != -1 && !(flags & PERF_FLAG_PID_CGROUP)) { task = find_lively_task_by_vpid(pid); if (IS_ERR(task)) { err = PTR_ERR(task); @@ -5948,6 +6421,12 @@ SYSCALL_DEFINE5(perf_event_open, goto err_task; } + if (flags & PERF_FLAG_PID_CGROUP) { + err = perf_cgroup_connect(pid, event, &attr, group_leader); + if (err) + goto err_alloc; + } + /* * Special case software events and allow them to be part of * any hardware group. @@ -6808,3 +7287,92 @@ unlock: return ret; } device_initcall(perf_event_sysfs_init); + +#ifdef CONFIG_CGROUP_PERF +static struct cgroup_subsys_state *perf_cgroup_create( + struct cgroup_subsys *ss, struct cgroup *cont) +{ + struct perf_cgroup *jc; + struct perf_cgroup_info *t; + int c; + + jc = kmalloc(sizeof(*jc), GFP_KERNEL); + if (!jc) + return ERR_PTR(-ENOMEM); + + memset(jc, 0, sizeof(*jc)); + + jc->info = alloc_percpu(struct perf_cgroup_info); + if (!jc->info) { + kfree(jc); + return ERR_PTR(-ENOMEM); + } + + for_each_possible_cpu(c) { + t = per_cpu_ptr(jc->info, c); + t->time = 0; + t->timestamp = 0; + } + return &jc->css; +} + +static void perf_cgroup_destroy(struct cgroup_subsys *ss, + struct cgroup *cont) +{ + struct perf_cgroup *jc; + jc = container_of(cgroup_subsys_state(cont, perf_subsys_id), + struct perf_cgroup, css); + free_percpu(jc->info); + kfree(jc); +} + +static int __perf_cgroup_move(void *info) +{ + struct task_struct *task = info; + perf_cgroup_switch(task, PERF_CGROUP_SWOUT | PERF_CGROUP_SWIN); + return 0; +} + +static void perf_cgroup_move(struct task_struct *task) +{ + task_function_call(task, __perf_cgroup_move, task); +} + +static void perf_cgroup_attach(struct cgroup_subsys *ss, struct cgroup *cgrp, + struct cgroup *old_cgrp, struct task_struct *task, + bool threadgroup) +{ + perf_cgroup_move(task); + if (threadgroup) { + struct task_struct *c; + rcu_read_lock(); + list_for_each_entry_rcu(c, &task->thread_group, thread_group) { + perf_cgroup_move(c); + } + rcu_read_unlock(); + } +} + +static void perf_cgroup_exit(struct cgroup_subsys *ss, struct cgroup *cgrp, + struct cgroup *old_cgrp, struct task_struct *task) +{ + /* + * cgroup_exit() is called in the copy_process() failure path. + * Ignore this case since the task hasn't run yet; this avoids + * trying to poke a half freed task state from generic code. + */ + if (!(task->flags & PF_EXITING)) + return; + + perf_cgroup_move(task); +} + +struct cgroup_subsys perf_subsys = { + .name = "perf_event", + .subsys_id = perf_subsys_id, + .create = perf_cgroup_create, + .destroy = perf_cgroup_destroy, + .exit = perf_cgroup_exit, + .attach = perf_cgroup_attach, +}; +#endif /* CONFIG_CGROUP_PERF */ From 023695d96ee06f36cf5014e286edcd623e9fb847 Mon Sep 17 00:00:00 2001 From: Stephane Eranian Date: Mon, 14 Feb 2011 11:20:01 +0200 Subject: [PATCH 260/706] perf tool: Add cgroup support This patch adds the ability to filter monitoring based on container groups (cgroups) for both perf stat and perf record. It is possible to monitor multiple cgroups in parallel. There is one cgroup per event. The cgroups to monitor are passed via a new -G option followed by a comma-separated list of cgroup names. The cgroup filesystem has to be mounted. Given a cgroup name, the perf tool finds the corresponding directory in the cgroup filesystem and opens it.
It then passes that file descriptor to the kernel. Example: $ perf stat -B -a -e cycles:u,cycles:u,cycles:u -G test1,,test2 -- sleep 1 Performance counter stats for 'sleep 1': 2,368,667,414 cycles test1 2,369,661,459 cycles cycles test2 1.001856890 seconds time elapsed Signed-off-by: Stephane Eranian Signed-off-by: Peter Zijlstra LKML-Reference: <4d590290.825bdf0a.7d0a.4890@mx.google.com> Signed-off-by: Ingo Molnar --- tools/perf/Documentation/perf-record.txt | 11 ++ tools/perf/Documentation/perf-stat.txt | 11 ++ tools/perf/Makefile | 2 + tools/perf/builtin-record.c | 9 ++ tools/perf/builtin-stat.c | 40 ++++- tools/perf/util/cgroup.c | 178 +++++++++++++++++++++++ tools/perf/util/cgroup.h | 17 +++ tools/perf/util/evsel.c | 16 +- tools/perf/util/evsel.h | 2 + 9 files changed, 276 insertions(+), 10 deletions(-) create mode 100644 tools/perf/util/cgroup.c create mode 100644 tools/perf/util/cgroup.h diff --git a/tools/perf/Documentation/perf-record.txt b/tools/perf/Documentation/perf-record.txt index e032716c839b..5a520f825295 100644 --- a/tools/perf/Documentation/perf-record.txt +++ b/tools/perf/Documentation/perf-record.txt @@ -137,6 +137,17 @@ Do not update the builid cache. This saves some overhead in situations where the information in the perf.data file (which includes buildids) is sufficient. +-G name,...:: +--cgroup name,...:: +monitor only in the container (cgroup) called "name". This option is available only +in per-cpu mode. The cgroup filesystem must be mounted. All threads belonging to +container "name" are monitored when they run on the monitored CPUs. Multiple cgroups +can be provided. Each cgroup is applied to the corresponding event, i.e., first cgroup +to first event, second cgroup to second event and so on. It is possible to provide +an empty cgroup (monitor all the time) using, e.g., -G foo,,bar. Cgroups must have +corresponding events, i.e., they always refer to events defined earlier on the command +line. + SEE ALSO -------- linkperf:perf-stat[1], linkperf:perf-list[1] diff --git a/tools/perf/Documentation/perf-stat.txt b/tools/perf/Documentation/perf-stat.txt index b6da7affbbee..918cc38ee6d1 100644 --- a/tools/perf/Documentation/perf-stat.txt +++ b/tools/perf/Documentation/perf-stat.txt @@ -83,6 +83,17 @@ This option is only valid in system-wide mode. print counts using a CSV-style output to make it easy to import directly into spreadsheets. Columns are separated by the string specified in SEP. +-G name:: +--cgroup name:: +monitor only in the container (cgroup) called "name". This option is available only +in per-cpu mode. The cgroup filesystem must be mounted. All threads belonging to +container "name" are monitored when they run on the monitored CPUs. Multiple cgroups +can be provided. Each cgroup is applied to the corresponding event, i.e., first cgroup +to first event, second cgroup to second event and so on. It is possible to provide +an empty cgroup (monitor all the time) using, e.g., -G foo,,bar. Cgroups must have +corresponding events, i.e., they always refer to events defined earlier on the command +line. 
+ EXAMPLES -------- diff --git a/tools/perf/Makefile b/tools/perf/Makefile index 94f73abdc56a..bc4d9bf8a556 100644 --- a/tools/perf/Makefile +++ b/tools/perf/Makefile @@ -442,6 +442,7 @@ LIB_H += util/pstack.h LIB_H += util/cpumap.h LIB_H += util/top.h LIB_H += $(ARCH_INCLUDE) +LIB_H += util/cgroup.h LIB_OBJS += $(OUTPUT)util/abspath.o LIB_OBJS += $(OUTPUT)util/alias.o @@ -496,6 +497,7 @@ LIB_OBJS += $(OUTPUT)util/probe-event.o LIB_OBJS += $(OUTPUT)util/util.o LIB_OBJS += $(OUTPUT)util/xyarray.o LIB_OBJS += $(OUTPUT)util/cpumap.o +LIB_OBJS += $(OUTPUT)util/cgroup.o BUILTIN_OBJS += $(OUTPUT)builtin-annotate.o diff --git a/tools/perf/builtin-record.c b/tools/perf/builtin-record.c index 12e0e41696d9..a4aaadcb4c8b 100644 --- a/tools/perf/builtin-record.c +++ b/tools/perf/builtin-record.c @@ -807,6 +807,9 @@ const struct option record_options[] = { "do not update the buildid cache"), OPT_BOOLEAN('B', "no-buildid", &no_buildid, "do not collect buildids in perf.data"), + OPT_CALLBACK('G', "cgroup", &evsel_list, "name", + "monitor event in cgroup name only", + parse_cgroups), OPT_END() }; @@ -835,6 +838,12 @@ int cmd_record(int argc, const char **argv, const char *prefix __used) write_mode = WRITE_FORCE; } + if (nr_cgroups && !system_wide) { + fprintf(stderr, "cgroup monitoring only available in" + " system-wide mode\n"); + usage_with_options(record_usage, record_options); + } + symbol__init(); if (no_buildid_cache || no_buildid) diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c index 806a9998fcd5..21c025222496 100644 --- a/tools/perf/builtin-stat.c +++ b/tools/perf/builtin-stat.c @@ -390,6 +390,9 @@ static void nsec_printout(int cpu, struct perf_evsel *evsel, double avg) fprintf(stderr, fmt, cpustr, msecs, csv_sep, event_name(evsel)); + if (evsel->cgrp) + fprintf(stderr, "%s%s", csv_sep, evsel->cgrp->name); + if (csv_output) return; @@ -420,6 +423,9 @@ static void abs_printout(int cpu, struct perf_evsel *evsel, double avg) fprintf(stderr, fmt, cpustr, avg, csv_sep, event_name(evsel)); + if (evsel->cgrp) + fprintf(stderr, "%s%s", csv_sep, evsel->cgrp->name); + if (csv_output) return; @@ -460,9 +466,17 @@ static void print_counter_aggr(struct perf_evsel *counter) int scaled = counter->counts->scaled; if (scaled == -1) { - fprintf(stderr, "%*s%s%-24s\n", + fprintf(stderr, "%*s%s%*s", csv_output ? 0 : 18, - "", csv_sep, event_name(counter)); + "", + csv_sep, + csv_output ? 0 : -24, + event_name(counter)); + + if (counter->cgrp) + fprintf(stderr, "%s%s", csv_sep, counter->cgrp->name); + + fputc('\n', stderr); return; } @@ -487,7 +501,6 @@ static void print_counter_aggr(struct perf_evsel *counter) fprintf(stderr, " (scaled from %.2f%%)", 100 * avg_running / avg_enabled); } - fprintf(stderr, "\n"); } @@ -505,14 +518,18 @@ static void print_counter(struct perf_evsel *counter) ena = counter->counts->cpu[cpu].ena; run = counter->counts->cpu[cpu].run; if (run == 0 || ena == 0) { - fprintf(stderr, "CPU%*d%s%*s%s%-24s", + fprintf(stderr, "CPU%*d%s%*s%s%*s", csv_output ? 0 : -4, evsel_list->cpus->map[cpu], csv_sep, csv_output ? 0 : 18, "", csv_sep, + csv_output ? 
0 : -24, event_name(counter)); - fprintf(stderr, "\n"); + if (counter->cgrp) + fprintf(stderr, "%s%s", csv_sep, counter->cgrp->name); + + fputc('\n', stderr); continue; } @@ -529,7 +546,7 @@ static void print_counter(struct perf_evsel *counter) 100.0 * run / ena); } } - fprintf(stderr, "\n"); + fputc('\n', stderr); } } @@ -642,6 +659,9 @@ static const struct option options[] = { "disable CPU count aggregation"), OPT_STRING('x', "field-separator", &csv_sep, "separator", "print counts with custom separator"), + OPT_CALLBACK('G', "cgroup", &evsel_list, "name", + "monitor event in cgroup name only", + parse_cgroups), OPT_END() }; @@ -682,9 +702,13 @@ int cmd_stat(int argc, const char **argv, const char *prefix __used) if (run_count <= 0) usage_with_options(stat_usage, options); - /* no_aggr is for system-wide only */ - if (no_aggr && !system_wide) + /* no_aggr, cgroup are for system-wide only */ + if ((no_aggr || nr_cgroups) && !system_wide) { + fprintf(stderr, "both cgroup and no-aggregation " + "modes only available in system-wide mode\n"); + usage_with_options(stat_usage, options); + } /* Set attrs and nr_counters if no event is selected and !null_run */ if (!null_run && !evsel_list->nr_entries) { diff --git a/tools/perf/util/cgroup.c b/tools/perf/util/cgroup.c new file mode 100644 index 000000000000..9fea75535221 --- /dev/null +++ b/tools/perf/util/cgroup.c @@ -0,0 +1,178 @@ +#include "util.h" +#include "../perf.h" +#include "parse-options.h" +#include "evsel.h" +#include "cgroup.h" +#include "debugfs.h" /* MAX_PATH, STR() */ +#include "evlist.h" + +int nr_cgroups; + +static int +cgroupfs_find_mountpoint(char *buf, size_t maxlen) +{ + FILE *fp; + char mountpoint[MAX_PATH+1], tokens[MAX_PATH+1], type[MAX_PATH+1]; + char *token, *saved_ptr; + int found = 0; + + fp = fopen("/proc/mounts", "r"); + if (!fp) + return -1; + + /* + * in order to handle split hierarchy, we need to scan /proc/mounts + * and inspect every cgroupfs mount point to find one that has + * perf_event subsystem + */ + while (fscanf(fp, "%*s %"STR(MAX_PATH)"s %"STR(MAX_PATH)"s %" + STR(MAX_PATH)"s %*d %*d\n", + mountpoint, type, tokens) == 3) { + + if (!strcmp(type, "cgroup")) { + + token = strtok_r(tokens, ",", &saved_ptr); + + while (token != NULL) { + if (!strcmp(token, "perf_event")) { + found = 1; + break; + } + token = strtok_r(NULL, ",", &saved_ptr); + } + } + if (found) + break; + } + fclose(fp); + if (!found) + return -1; + + if (strlen(mountpoint) < maxlen) { + strcpy(buf, mountpoint); + return 0; + } + return -1; +} + +static int open_cgroup(char *name) +{ + char path[MAX_PATH+1]; + char mnt[MAX_PATH+1]; + int fd; + + + if (cgroupfs_find_mountpoint(mnt, MAX_PATH+1)) + return -1; + + snprintf(path, MAX_PATH, "%s/%s", mnt, name); + + fd = open(path, O_RDONLY); + if (fd == -1) + fprintf(stderr, "no access to cgroup %s\n", path); + + return fd; +} + +static int add_cgroup(struct perf_evlist *evlist, char *str) +{ + struct perf_evsel *counter; + struct cgroup_sel *cgrp = NULL; + int n; + /* + * check if cgrp is already defined, if so we reuse it + */ + list_for_each_entry(counter, &evlist->entries, node) { + cgrp = counter->cgrp; + if (!cgrp) + continue; + if (!strcmp(cgrp->name, str)) + break; + + cgrp = NULL; + } + + if (!cgrp) { + cgrp = zalloc(sizeof(*cgrp)); + if (!cgrp) + return -1; + + cgrp->name = str; + + cgrp->fd = open_cgroup(str); + if (cgrp->fd == -1) { + free(cgrp); + return -1; + } + } + + /* + * find corresponding event + * if add cgroup N, then need to find event N + */ + n = 0; + 
list_for_each_entry(counter, &evlist->entries, node) { + if (n == nr_cgroups) + goto found; + n++; + } + if (cgrp->refcnt == 0) + free(cgrp); + + return -1; +found: + cgrp->refcnt++; + counter->cgrp = cgrp; + return 0; +} + +void close_cgroup(struct cgroup_sel *cgrp) +{ + if (!cgrp) + return; + + /* XXX: not reentrant */ + if (--cgrp->refcnt == 0) { + close(cgrp->fd); + free(cgrp->name); + free(cgrp); + } +} + +int parse_cgroups(const struct option *opt __used, const char *str, + int unset __used) +{ + struct perf_evlist *evlist = *(struct perf_evlist **)opt->value; + const char *p, *e, *eos = str + strlen(str); + char *s; + int ret; + + if (list_empty(&evlist->entries)) { + fprintf(stderr, "must define events before cgroups\n"); + return -1; + } + + for (;;) { + p = strchr(str, ','); + e = p ? p : eos; + + /* allow empty cgroups, i.e., skip */ + if (e - str) { + /* termination added */ + s = strndup(str, e - str); + if (!s) + return -1; + ret = add_cgroup(evlist, s); + if (ret) { + free(s); + return -1; + } + } + /* nr_cgroups is increased even for empty cgroups */ + nr_cgroups++; + if (!p) + break; + str = p+1; + } + return 0; +} diff --git a/tools/perf/util/cgroup.h b/tools/perf/util/cgroup.h new file mode 100644 index 000000000000..89acd6debdc5 --- /dev/null +++ b/tools/perf/util/cgroup.h @@ -0,0 +1,17 @@ +#ifndef __CGROUP_H__ +#define __CGROUP_H__ + +struct option; + +struct cgroup_sel { + char *name; + int fd; + int refcnt; +}; + + +extern int nr_cgroups; /* number of explicit cgroups defined */ +extern void close_cgroup(struct cgroup_sel *cgrp); +extern int parse_cgroups(const struct option *opt, const char *str, int unset); + +#endif /* __CGROUP_H__ */ diff --git a/tools/perf/util/evsel.c b/tools/perf/util/evsel.c index 211063eed474..c974e08d07ab 100644 --- a/tools/perf/util/evsel.c +++ b/tools/perf/util/evsel.c @@ -85,6 +85,7 @@ void perf_evsel__exit(struct perf_evsel *evsel) void perf_evsel__delete(struct perf_evsel *evsel) { perf_evsel__exit(evsel); + close_cgroup(evsel->cgrp); free(evsel); } @@ -163,21 +164,32 @@ static int __perf_evsel__open(struct perf_evsel *evsel, struct cpu_map *cpus, struct thread_map *threads, bool group, bool inherit) { int cpu, thread; + unsigned long flags = 0; + int pid = -1; if (evsel->fd == NULL && perf_evsel__alloc_fd(evsel, cpus->nr, threads->nr) < 0) return -1; + if (evsel->cgrp) { + flags = PERF_FLAG_PID_CGROUP; + pid = evsel->cgrp->fd; + } + for (cpu = 0; cpu < cpus->nr; cpu++) { int group_fd = -1; evsel->attr.inherit = (cpus->map[cpu] < 0) && inherit; for (thread = 0; thread < threads->nr; thread++) { + + if (!evsel->cgrp) + pid = threads->map[thread]; + FD(evsel, cpu, thread) = sys_perf_event_open(&evsel->attr, - threads->map[thread], + pid, cpus->map[cpu], - group_fd, 0); + group_fd, flags); if (FD(evsel, cpu, thread) < 0) goto out_close; diff --git a/tools/perf/util/evsel.h b/tools/perf/util/evsel.h index eecdc3aabc14..1d3d5a3dbe60 100644 --- a/tools/perf/util/evsel.h +++ b/tools/perf/util/evsel.h @@ -6,6 +6,7 @@ #include "../../../include/linux/perf_event.h" #include "types.h" #include "xyarray.h" +#include "cgroup.h" struct perf_counts_values { union { @@ -45,6 +46,7 @@ struct perf_evsel { struct perf_counts *counts; int idx; void *priv; + struct cgroup_sel *cgrp; }; struct cpu_map; From d45dd923fcc620c948bd1eda16cc61426ac31646 Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Wed, 2 Feb 2011 17:40:56 +0100 Subject: [PATCH 261/706] perf, x86: Use helper function in x86_pmu_enable_all() Use helper function in x86_pmu_enable_all() to
minimize access to x86_pmu.eventsel in the fast path. The counter's msr address is now calculated using struct hw_perf_event. Later we add code that calculates the msr addresses with a table lookup which shouldn't be done in the fast path. Signed-off-by: Robert Richter Signed-off-by: Peter Zijlstra LKML-Reference: <1296664860-10886-2-git-send-email-robert.richter@amd.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event.c | 19 ++++++++----------- 1 file changed, 8 insertions(+), 11 deletions(-) diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index 4d98789b0664..70d6d8fc2411 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -642,21 +642,24 @@ static void x86_pmu_disable(struct pmu *pmu) x86_pmu.disable_all(); } +static inline void __x86_pmu_enable_event(struct hw_perf_event *hwc, + u64 enable_mask) +{ + wrmsrl(hwc->config_base + hwc->idx, hwc->config | enable_mask); +} + static void x86_pmu_enable_all(int added) { struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); int idx; for (idx = 0; idx < x86_pmu.num_counters; idx++) { - struct perf_event *event = cpuc->events[idx]; - u64 val; + struct hw_perf_event *hwc = &cpuc->events[idx]->hw; if (!test_bit(idx, cpuc->active_mask)) continue; - val = event->hw.config; - val |= ARCH_PERFMON_EVENTSEL_ENABLE; - wrmsrl(x86_pmu.eventsel + idx, val); + __x86_pmu_enable_event(hwc, ARCH_PERFMON_EVENTSEL_ENABLE); } } @@ -915,12 +918,6 @@ static void x86_pmu_enable(struct pmu *pmu) x86_pmu.enable_all(added); } -static inline void __x86_pmu_enable_event(struct hw_perf_event *hwc, - u64 enable_mask) -{ - wrmsrl(hwc->config_base + hwc->idx, hwc->config | enable_mask); -} - static inline void x86_pmu_disable_event(struct perf_event *event) { struct hw_perf_event *hwc = &event->hw; From 41bf498949a263fa0b2d32524b89d696ac330e94 Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Wed, 2 Feb 2011 17:40:57 +0100 Subject: [PATCH 262/706] perf, x86: Calculate perfctr msr addresses in helper functions This patch adds helper functions to calculate perfctr msr addresses. We need this to later add support for AMD family 15h cpus. For this we have to change the algorithms to generate the perfctr's msr addresses. 
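The idea behind the helpers introduced here (x86_pmu_config_addr()/x86_pmu_event_addr() in the diff below) can be shown with a minimal stand-alone sketch, not kernel code; the MSR_K7_EVNTSEL0/MSR_K7_PERFCTR0 base values 0xc0010000/0xc0010004 are the documented AMD defaults and are only assumed here for the printout:

#include <stdio.h>

#define MSR_K7_EVNTSEL0	0xc0010000u	/* event select (config) base */
#define MSR_K7_PERFCTR0	0xc0010004u	/* counter base */

/* every MSR access goes through these helpers, so only this
 * index-to-address mapping has to change for a new CPU family */
static unsigned int x86_pmu_config_addr(int index)
{
	return MSR_K7_EVNTSEL0 + index;
}

static unsigned int x86_pmu_event_addr(int index)
{
	return MSR_K7_PERFCTR0 + index;
}

int main(void)
{
	int i;

	for (i = 0; i < 4; i++)
		printf("ctl%d=%#x ctr%d=%#x\n", i,
		       x86_pmu_config_addr(i), i, x86_pmu_event_addr(i));
	return 0;
}

Because all rdmsr/wrmsr call sites go through the helpers, swapping in a different address calculation later (as family 15h requires) touches only these two functions, not the fast path.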
Signed-off-by: Robert Richter Signed-off-by: Peter Zijlstra LKML-Reference: <1296664860-10886-3-git-send-email-robert.richter@amd.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event.c | 36 ++++++++++++++++---------- arch/x86/kernel/cpu/perf_event_intel.c | 4 +-- 2 files changed, 25 insertions(+), 15 deletions(-) diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index 70d6d8fc2411..ee40c1ad0ebc 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -321,6 +321,16 @@ again: return new_raw_count; } +static inline unsigned int x86_pmu_config_addr(int index) +{ + return x86_pmu.eventsel + index; +} + +static inline unsigned int x86_pmu_event_addr(int index) +{ + return x86_pmu.perfctr + index; +} + static atomic_t active_events; static DEFINE_MUTEX(pmc_reserve_mutex); @@ -331,12 +341,12 @@ static bool reserve_pmc_hardware(void) int i; for (i = 0; i < x86_pmu.num_counters; i++) { - if (!reserve_perfctr_nmi(x86_pmu.perfctr + i)) + if (!reserve_perfctr_nmi(x86_pmu_event_addr(i))) goto perfctr_fail; } for (i = 0; i < x86_pmu.num_counters; i++) { - if (!reserve_evntsel_nmi(x86_pmu.eventsel + i)) + if (!reserve_evntsel_nmi(x86_pmu_config_addr(i))) goto eventsel_fail; } @@ -344,13 +354,13 @@ static bool reserve_pmc_hardware(void) eventsel_fail: for (i--; i >= 0; i--) - release_evntsel_nmi(x86_pmu.eventsel + i); + release_evntsel_nmi(x86_pmu_config_addr(i)); i = x86_pmu.num_counters; perfctr_fail: for (i--; i >= 0; i--) - release_perfctr_nmi(x86_pmu.perfctr + i); + release_perfctr_nmi(x86_pmu_event_addr(i)); return false; } @@ -360,8 +370,8 @@ static void release_pmc_hardware(void) int i; for (i = 0; i < x86_pmu.num_counters; i++) { - release_perfctr_nmi(x86_pmu.perfctr + i); - release_evntsel_nmi(x86_pmu.eventsel + i); + release_perfctr_nmi(x86_pmu_event_addr(i)); + release_evntsel_nmi(x86_pmu_config_addr(i)); } } @@ -382,7 +392,7 @@ static bool check_hw_exists(void) * complain and bail. */ for (i = 0; i < x86_pmu.num_counters; i++) { - reg = x86_pmu.eventsel + i; + reg = x86_pmu_config_addr(i); ret = rdmsrl_safe(reg, &val); if (ret) goto msr_fail; @@ -407,8 +417,8 @@ static bool check_hw_exists(void) * that don't trap on the MSR access and always return 0s. 
*/ val = 0xabcdUL; - ret = checking_wrmsrl(x86_pmu.perfctr, val); - ret |= rdmsrl_safe(x86_pmu.perfctr, &val_new); + ret = checking_wrmsrl(x86_pmu_event_addr(0), val); + ret |= rdmsrl_safe(x86_pmu_event_addr(0), &val_new); if (ret || val != val_new) goto msr_fail; @@ -617,11 +627,11 @@ static void x86_pmu_disable_all(void) if (!test_bit(idx, cpuc->active_mask)) continue; - rdmsrl(x86_pmu.eventsel + idx, val); + rdmsrl(x86_pmu_config_addr(idx), val); if (!(val & ARCH_PERFMON_EVENTSEL_ENABLE)) continue; val &= ~ARCH_PERFMON_EVENTSEL_ENABLE; - wrmsrl(x86_pmu.eventsel + idx, val); + wrmsrl(x86_pmu_config_addr(idx), val); } } @@ -1110,8 +1120,8 @@ void perf_event_print_debug(void) pr_info("CPU#%d: active: %016llx\n", cpu, *(u64 *)cpuc->active_mask); for (idx = 0; idx < x86_pmu.num_counters; idx++) { - rdmsrl(x86_pmu.eventsel + idx, pmc_ctrl); - rdmsrl(x86_pmu.perfctr + idx, pmc_count); + rdmsrl(x86_pmu_config_addr(idx), pmc_ctrl); + rdmsrl(x86_pmu_event_addr(idx), pmc_count); prev_left = per_cpu(pmc_prev_left[idx], cpu); diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c index 008835c1d79c..084b38362db7 100644 --- a/arch/x86/kernel/cpu/perf_event_intel.c +++ b/arch/x86/kernel/cpu/perf_event_intel.c @@ -691,8 +691,8 @@ static void intel_pmu_reset(void) printk("clearing PMU state on CPU#%d\n", smp_processor_id()); for (idx = 0; idx < x86_pmu.num_counters; idx++) { - checking_wrmsrl(x86_pmu.eventsel + idx, 0ull); - checking_wrmsrl(x86_pmu.perfctr + idx, 0ull); + checking_wrmsrl(x86_pmu_config_addr(idx), 0ull); + checking_wrmsrl(x86_pmu_event_addr(idx), 0ull); } for (idx = 0; idx < x86_pmu.num_counters_fixed; idx++) checking_wrmsrl(MSR_ARCH_PERFMON_FIXED_CTR0 + idx, 0ull); From 69d8e1e8ac0a7d829f1c0fd5bd07eb3022d9a1a0 Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Wed, 2 Feb 2011 17:40:58 +0100 Subject: [PATCH 263/706] perf, x86: Add new AMD family 15h msrs to perfctr reservation code This patch allows the reservation of perfctrs with new msr addresses introduced for AMD cpu family 15h (0xc0010200/0xc0010201, etc). 
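The family 15h addresses named in the commit message interleave control and counter registers (0xc0010200 = CTL0, 0xc0010201 = CTR0, 0xc0010202 = CTL1, and so on), which is why the reservation code below halves the distance to the base. A minimal sketch of that bit-offset calculation, assuming those values for MSR_F15H_PERF_CTL/MSR_F15H_PERF_CTR:

#include <stdio.h>

#define MSR_F15H_PERF_CTL	0xc0010200u	/* CTL0,CTR0,CTL1,CTR1,... */
#define MSR_F15H_PERF_CTR	0xc0010201u

/* consecutive counters are two MSRs apart, so halve the distance */
static unsigned int f15h_ctl_to_bit(unsigned int msr)
{
	return (msr - MSR_F15H_PERF_CTL) >> 1;
}

int main(void)
{
	unsigned int i;

	for (i = 0; i < 6; i++)
		printf("CTL at %#x -> bit %u\n",
		       MSR_F15H_PERF_CTL + 2 * i,
		       f15h_ctl_to_bit(MSR_F15H_PERF_CTL + 2 * i));
	return 0;
}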
Signed-off-by: Robert Richter Signed-off-by: Peter Zijlstra LKML-Reference: <1296664860-10886-4-git-send-email-robert.richter@amd.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perfctr-watchdog.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/arch/x86/kernel/cpu/perfctr-watchdog.c b/arch/x86/kernel/cpu/perfctr-watchdog.c index d5a236615501..966512b2cacf 100644 --- a/arch/x86/kernel/cpu/perfctr-watchdog.c +++ b/arch/x86/kernel/cpu/perfctr-watchdog.c @@ -46,6 +46,8 @@ static inline unsigned int nmi_perfctr_msr_to_bit(unsigned int msr) /* returns the bit offset of the performance counter register */ switch (boot_cpu_data.x86_vendor) { case X86_VENDOR_AMD: + if (msr >= MSR_F15H_PERF_CTR) + return (msr - MSR_F15H_PERF_CTR) >> 1; return msr - MSR_K7_PERFCTR0; case X86_VENDOR_INTEL: if (cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON)) @@ -70,6 +72,8 @@ static inline unsigned int nmi_evntsel_msr_to_bit(unsigned int msr) /* returns the bit offset of the event selection register */ switch (boot_cpu_data.x86_vendor) { case X86_VENDOR_AMD: + if (msr >= MSR_F15H_PERF_CTL) + return (msr - MSR_F15H_PERF_CTL) >> 1; return msr - MSR_K7_EVNTSEL0; case X86_VENDOR_INTEL: if (cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON)) From 73d6e52206a20354738418625cedc244cbfd5023 Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Wed, 2 Feb 2011 17:40:59 +0100 Subject: [PATCH 264/706] perf, x86: Store perfctr msr addresses in config_base/event_base Instead of storing the base addresses we can store the counter's msr addresses directly in config_base/event_base of struct hw_perf_event. This avoids recalculating the address with each msr access. The addresses are configured one time. We also need this change to later modify the address calculation. Signed-off-by: Robert Richter Signed-off-by: Peter Zijlstra LKML-Reference: <1296664860-10886-5-git-send-email-robert.richter@amd.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event.c | 21 ++++++++------------- arch/x86/kernel/cpu/perf_event_p4.c | 8 ++++---- arch/x86/kernel/cpu/perf_event_p6.c | 4 ++-- 3 files changed, 14 insertions(+), 19 deletions(-) diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index ee40c1ad0ebc..316194330da0 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -298,7 +298,7 @@ x86_perf_event_update(struct perf_event *event) */ again: prev_raw_count = local64_read(&hwc->prev_count); - rdmsrl(hwc->event_base + idx, new_raw_count); + rdmsrl(hwc->event_base, new_raw_count); if (local64_cmpxchg(&hwc->prev_count, prev_raw_count, new_raw_count) != prev_raw_count) @@ -655,7 +655,7 @@ static void x86_pmu_disable(struct pmu *pmu) static inline void __x86_pmu_enable_event(struct hw_perf_event *hwc, u64 enable_mask) { - wrmsrl(hwc->config_base + hwc->idx, hwc->config | enable_mask); + wrmsrl(hwc->config_base, hwc->config | enable_mask); } static void x86_pmu_enable_all(int added) @@ -834,15 +834,10 @@ static inline void x86_assign_hw_event(struct perf_event *event, hwc->event_base = 0; } else if (hwc->idx >= X86_PMC_IDX_FIXED) { hwc->config_base = MSR_ARCH_PERFMON_FIXED_CTR_CTRL; - /* - * We set it so that event_base + idx in wrmsr/rdmsr maps to - * MSR_ARCH_PERFMON_FIXED_CTR0 ... 
CTR2: - */ - hwc->event_base = - MSR_ARCH_PERFMON_FIXED_CTR0 - X86_PMC_IDX_FIXED; + hwc->event_base = MSR_ARCH_PERFMON_FIXED_CTR0; } else { - hwc->config_base = x86_pmu.eventsel; - hwc->event_base = x86_pmu.perfctr; + hwc->config_base = x86_pmu_config_addr(hwc->idx); + hwc->event_base = x86_pmu_event_addr(hwc->idx); } } @@ -932,7 +927,7 @@ static inline void x86_pmu_disable_event(struct perf_event *event) { struct hw_perf_event *hwc = &event->hw; - wrmsrl(hwc->config_base + hwc->idx, hwc->config); + wrmsrl(hwc->config_base, hwc->config); } static DEFINE_PER_CPU(u64 [X86_PMC_IDX_MAX], pmc_prev_left); @@ -985,7 +980,7 @@ x86_perf_event_set_period(struct perf_event *event) */ local64_set(&hwc->prev_count, (u64)-left); - wrmsrl(hwc->event_base + idx, (u64)(-left) & x86_pmu.cntval_mask); + wrmsrl(hwc->event_base, (u64)(-left) & x86_pmu.cntval_mask); /* * Due to erratum on certan cpu we need @@ -993,7 +988,7 @@ x86_perf_event_set_period(struct perf_event *event) * is updated properly */ if (x86_pmu.perfctr_second_write) { - wrmsrl(hwc->event_base + idx, + wrmsrl(hwc->event_base, (u64)(-left) & x86_pmu.cntval_mask); } diff --git a/arch/x86/kernel/cpu/perf_event_p4.c b/arch/x86/kernel/cpu/perf_event_p4.c index ff751a9f182b..3769ac822f96 100644 --- a/arch/x86/kernel/cpu/perf_event_p4.c +++ b/arch/x86/kernel/cpu/perf_event_p4.c @@ -764,9 +764,9 @@ static inline int p4_pmu_clear_cccr_ovf(struct hw_perf_event *hwc) u64 v; /* an official way for overflow indication */ - rdmsrl(hwc->config_base + hwc->idx, v); + rdmsrl(hwc->config_base, v); if (v & P4_CCCR_OVF) { - wrmsrl(hwc->config_base + hwc->idx, v & ~P4_CCCR_OVF); + wrmsrl(hwc->config_base, v & ~P4_CCCR_OVF); return 1; } @@ -815,7 +815,7 @@ static inline void p4_pmu_disable_event(struct perf_event *event) * state we need to clear P4_CCCR_OVF, otherwise interrupt get * asserted again and again */ - (void)checking_wrmsrl(hwc->config_base + hwc->idx, + (void)checking_wrmsrl(hwc->config_base, (u64)(p4_config_unpack_cccr(hwc->config)) & ~P4_CCCR_ENABLE & ~P4_CCCR_OVF & ~P4_CCCR_RESERVED); } @@ -885,7 +885,7 @@ static void p4_pmu_enable_event(struct perf_event *event) p4_pmu_enable_pebs(hwc->config); (void)checking_wrmsrl(escr_addr, escr_conf); - (void)checking_wrmsrl(hwc->config_base + hwc->idx, + (void)checking_wrmsrl(hwc->config_base, (cccr & ~P4_CCCR_RESERVED) | P4_CCCR_ENABLE); } diff --git a/arch/x86/kernel/cpu/perf_event_p6.c b/arch/x86/kernel/cpu/perf_event_p6.c index 34ba07be2cda..20c097e33860 100644 --- a/arch/x86/kernel/cpu/perf_event_p6.c +++ b/arch/x86/kernel/cpu/perf_event_p6.c @@ -68,7 +68,7 @@ p6_pmu_disable_event(struct perf_event *event) if (cpuc->enabled) val |= ARCH_PERFMON_EVENTSEL_ENABLE; - (void)checking_wrmsrl(hwc->config_base + hwc->idx, val); + (void)checking_wrmsrl(hwc->config_base, val); } static void p6_pmu_enable_event(struct perf_event *event) @@ -81,7 +81,7 @@ static void p6_pmu_enable_event(struct perf_event *event) if (cpuc->enabled) val |= ARCH_PERFMON_EVENTSEL_ENABLE; - (void)checking_wrmsrl(hwc->config_base + hwc->idx, val); + (void)checking_wrmsrl(hwc->config_base, val); } static __initconst const struct x86_pmu p6_pmu = { From 4979d2729af22f6ce8faa325fc60a85a2c2daa02 Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Wed, 2 Feb 2011 17:36:12 +0100 Subject: [PATCH 265/706] perf, x86: Add support for AMD family 15h core counters This patch adds support for AMD family 15h core counters. There are major changes compared to family 10h. First, there is a new perfctr msr range for up to 6 counters. 
Northbridge counters are separate now. This patch only adds support for core counters. Second, certain events may only be scheduled on certain counters. For this we need to extend the event scheduling and constraints. We use cpu feature flags to calculate family 15h msr address offsets. This way we later can implement a faster ALTERNATIVE() version for this. Signed-off-by: Robert Richter Signed-off-by: Peter Zijlstra LKML-Reference: <20110215135210.GB5874@erda.amd.com> Signed-off-by: Ingo Molnar --- arch/x86/include/asm/cpufeature.h | 2 + arch/x86/kernel/cpu/perf_event.c | 12 +- arch/x86/kernel/cpu/perf_event_amd.c | 175 ++++++++++++++++++++++++++- 3 files changed, 186 insertions(+), 3 deletions(-) diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h index 220e2ea08e80..91f3e087cf21 100644 --- a/arch/x86/include/asm/cpufeature.h +++ b/arch/x86/include/asm/cpufeature.h @@ -160,6 +160,7 @@ #define X86_FEATURE_NODEID_MSR (6*32+19) /* NodeId MSR */ #define X86_FEATURE_TBM (6*32+21) /* trailing bit manipulations */ #define X86_FEATURE_TOPOEXT (6*32+22) /* topology extensions CPUID leafs */ +#define X86_FEATURE_PERFCTR_CORE (6*32+23) /* core performance counter extensions */ /* * Auxiliary flags: Linux defined - For features scattered in various @@ -279,6 +280,7 @@ extern const char * const x86_power_flags[32]; #define cpu_has_xsave boot_cpu_has(X86_FEATURE_XSAVE) #define cpu_has_hypervisor boot_cpu_has(X86_FEATURE_HYPERVISOR) #define cpu_has_pclmulqdq boot_cpu_has(X86_FEATURE_PCLMULQDQ) +#define cpu_has_perfctr_core boot_cpu_has(X86_FEATURE_PERFCTR_CORE) #if defined(CONFIG_X86_INVLPG) || defined(CONFIG_X86_64) # define cpu_has_invlpg 1 diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index 316194330da0..10bfe2472d16 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -321,14 +321,22 @@ again: return new_raw_count; } +/* using X86_FEATURE_PERFCTR_CORE to later implement ALTERNATIVE() here */ +static inline int x86_pmu_addr_offset(int index) +{ + if (boot_cpu_has(X86_FEATURE_PERFCTR_CORE)) + return index << 1; + return index; +} + static inline unsigned int x86_pmu_config_addr(int index) { - return x86_pmu.eventsel + index; + return x86_pmu.eventsel + x86_pmu_addr_offset(index); } static inline unsigned int x86_pmu_event_addr(int index) { - return x86_pmu.perfctr + index; + return x86_pmu.perfctr + x86_pmu_addr_offset(index); } static atomic_t active_events; diff --git a/arch/x86/kernel/cpu/perf_event_amd.c b/arch/x86/kernel/cpu/perf_event_amd.c index 67e2202a6039..461f62bbd774 100644 --- a/arch/x86/kernel/cpu/perf_event_amd.c +++ b/arch/x86/kernel/cpu/perf_event_amd.c @@ -127,6 +127,11 @@ static int amd_pmu_hw_config(struct perf_event *event) /* * AMD64 events are detected based on their event codes. */ +static inline unsigned int amd_get_event_code(struct hw_perf_event *hwc) +{ + return ((hwc->config >> 24) & 0x0f00) | (hwc->config & 0x00ff); +} + static inline int amd_is_nb_event(struct hw_perf_event *hwc) { return (hwc->config & 0xe0) == 0xe0; @@ -385,13 +390,181 @@ static __initconst const struct x86_pmu amd_pmu = { .cpu_dead = amd_pmu_cpu_dead, }; +/* AMD Family 15h */ + +#define AMD_EVENT_TYPE_MASK 0x000000F0ULL + +#define AMD_EVENT_FP 0x00000000ULL ... 0x00000010ULL +#define AMD_EVENT_LS 0x00000020ULL ... 0x00000030ULL +#define AMD_EVENT_DC 0x00000040ULL ... 0x00000050ULL +#define AMD_EVENT_CU 0x00000060ULL ... 0x00000070ULL +#define AMD_EVENT_IC_DE 0x00000080ULL ... 
0x00000090ULL +#define AMD_EVENT_EX_LS 0x000000C0ULL +#define AMD_EVENT_DE 0x000000D0ULL +#define AMD_EVENT_NB 0x000000E0ULL ... 0x000000F0ULL + +/* + * AMD family 15h event code/PMC mappings: + * + * type = event_code & 0x0F0: + * + * 0x000 FP PERF_CTL[5:3] + * 0x010 FP PERF_CTL[5:3] + * 0x020 LS PERF_CTL[5:0] + * 0x030 LS PERF_CTL[5:0] + * 0x040 DC PERF_CTL[5:0] + * 0x050 DC PERF_CTL[5:0] + * 0x060 CU PERF_CTL[2:0] + * 0x070 CU PERF_CTL[2:0] + * 0x080 IC/DE PERF_CTL[2:0] + * 0x090 IC/DE PERF_CTL[2:0] + * 0x0A0 --- + * 0x0B0 --- + * 0x0C0 EX/LS PERF_CTL[5:0] + * 0x0D0 DE PERF_CTL[2:0] + * 0x0E0 NB NB_PERF_CTL[3:0] + * 0x0F0 NB NB_PERF_CTL[3:0] + * + * Exceptions: + * + * 0x003 FP PERF_CTL[3] + * 0x00B FP PERF_CTL[3] + * 0x00D FP PERF_CTL[3] + * 0x023 DE PERF_CTL[2:0] + * 0x02D LS PERF_CTL[3] + * 0x02E LS PERF_CTL[3,0] + * 0x043 CU PERF_CTL[2:0] + * 0x045 CU PERF_CTL[2:0] + * 0x046 CU PERF_CTL[2:0] + * 0x054 CU PERF_CTL[2:0] + * 0x055 CU PERF_CTL[2:0] + * 0x08F IC PERF_CTL[0] + * 0x187 DE PERF_CTL[0] + * 0x188 DE PERF_CTL[0] + * 0x0DB EX PERF_CTL[5:0] + * 0x0DC LS PERF_CTL[5:0] + * 0x0DD LS PERF_CTL[5:0] + * 0x0DE LS PERF_CTL[5:0] + * 0x0DF LS PERF_CTL[5:0] + * 0x1D6 EX PERF_CTL[5:0] + * 0x1D8 EX PERF_CTL[5:0] + */ + +static struct event_constraint amd_f15_PMC0 = EVENT_CONSTRAINT(0, 0x01, 0); +static struct event_constraint amd_f15_PMC20 = EVENT_CONSTRAINT(0, 0x07, 0); +static struct event_constraint amd_f15_PMC3 = EVENT_CONSTRAINT(0, 0x08, 0); +static struct event_constraint amd_f15_PMC30 = EVENT_CONSTRAINT(0, 0x09, 0); +static struct event_constraint amd_f15_PMC50 = EVENT_CONSTRAINT(0, 0x3F, 0); +static struct event_constraint amd_f15_PMC53 = EVENT_CONSTRAINT(0, 0x38, 0); + +static struct event_constraint * +amd_get_event_constraints_f15h(struct cpu_hw_events *cpuc, struct perf_event *event) +{ + unsigned int event_code = amd_get_event_code(&event->hw); + + switch (event_code & AMD_EVENT_TYPE_MASK) { + case AMD_EVENT_FP: + switch (event_code) { + case 0x003: + case 0x00B: + case 0x00D: + return &amd_f15_PMC3; + default: + return &amd_f15_PMC53; + } + case AMD_EVENT_LS: + case AMD_EVENT_DC: + case AMD_EVENT_EX_LS: + switch (event_code) { + case 0x023: + case 0x043: + case 0x045: + case 0x046: + case 0x054: + case 0x055: + return &amd_f15_PMC20; + case 0x02D: + return &amd_f15_PMC3; + case 0x02E: + return &amd_f15_PMC30; + default: + return &amd_f15_PMC50; + } + case AMD_EVENT_CU: + case AMD_EVENT_IC_DE: + case AMD_EVENT_DE: + switch (event_code) { + case 0x08F: + case 0x187: + case 0x188: + return &amd_f15_PMC0; + case 0x0DB ... 
0x0DF: + case 0x1D6: + case 0x1D8: + return &amd_f15_PMC50; + default: + return &amd_f15_PMC20; + } + case AMD_EVENT_NB: + /* not yet implemented */ + return &emptyconstraint; + default: + return &emptyconstraint; + } +} + +static __initconst const struct x86_pmu amd_pmu_f15h = { + .name = "AMD Family 15h", + .handle_irq = x86_pmu_handle_irq, + .disable_all = x86_pmu_disable_all, + .enable_all = x86_pmu_enable_all, + .enable = x86_pmu_enable_event, + .disable = x86_pmu_disable_event, + .hw_config = amd_pmu_hw_config, + .schedule_events = x86_schedule_events, + .eventsel = MSR_F15H_PERF_CTL, + .perfctr = MSR_F15H_PERF_CTR, + .event_map = amd_pmu_event_map, + .max_events = ARRAY_SIZE(amd_perfmon_event_map), + .num_counters = 6, + .cntval_bits = 48, + .cntval_mask = (1ULL << 48) - 1, + .apic = 1, + /* use highest bit to detect overflow */ + .max_period = (1ULL << 47) - 1, + .get_event_constraints = amd_get_event_constraints_f15h, + /* northbridge counters not yet implemented: */ +#if 0 + .put_event_constraints = amd_put_event_constraints, + + .cpu_prepare = amd_pmu_cpu_prepare, + .cpu_starting = amd_pmu_cpu_starting, + .cpu_dead = amd_pmu_cpu_dead, +#endif +}; + static __init int amd_pmu_init(void) { /* Performance-monitoring supported from K7 and later: */ if (boot_cpu_data.x86 < 6) return -ENODEV; - x86_pmu = amd_pmu; + /* + * If core performance counter extensions exist, it must be + * family 15h, otherwise fail. See x86_pmu_addr_offset(). + */ + switch (boot_cpu_data.x86) { + case 0x15: + if (!cpu_has_perfctr_core) + return -ENODEV; + x86_pmu = amd_pmu_f15h; + break; + default: + if (cpu_has_perfctr_core) + return -ENODEV; + x86_pmu = amd_pmu; + break; + } /* Events are common for all AMDs */ memcpy(hw_cache_event_ids, amd_hw_cache_event_ids, From 163ec4354a5135c6c38c3f4a9b46a31900ebdf48 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 16 Feb 2011 11:22:34 +0100 Subject: [PATCH 266/706] perf: Optimize throttling code By pre-computing the maximum number of samples per tick we can avoid a multiplication and a conditional since MAX_INTERRUPTS > max_samples_per_tick.
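To make the arithmetic concrete: with the default sample rate of 100000 and an assumed tick rate of HZ = 1000, the pre-computed limit is DIV_ROUND_UP(100000, 1000) = 100 samples per tick, so the overflow path compares hwc->interrupts against a cached constant instead of evaluating HZ * interrupts > rate on every overflow. A stand-alone sketch of the computation:

#include <stdio.h>

#define HZ 1000	/* assumed tick rate for this example */
#define DIV_ROUND_UP(n, d) (((n) + (d) - 1) / (d))

int main(void)
{
	int sample_rate = 100000;	/* DEFAULT_MAX_SAMPLE_RATE */
	int max_samples_per_tick = DIV_ROUND_UP(sample_rate, HZ);

	/* old per-overflow check: HZ * interrupts > sample_rate  */
	/* new per-overflow check: interrupts >= max_samples_per_tick */
	printf("max_samples_per_tick = %d\n", max_samples_per_tick);
	return 0;
}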
Signed-off-by: Peter Zijlstra LKML-Reference: Signed-off-by: Ingo Molnar --- include/linux/perf_event.h | 4 ++++ kernel/perf_event.c | 43 +++++++++++++++++++++----------------- kernel/sysctl.c | 2 +- 3 files changed, 29 insertions(+), 20 deletions(-) diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 38c8b2554842..8ceb5a6fd9c9 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -1110,6 +1110,10 @@ extern int sysctl_perf_event_paranoid; extern int sysctl_perf_event_mlock; extern int sysctl_perf_event_sample_rate; +extern int perf_proc_update_handler(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, + loff_t *ppos); + static inline bool perf_paranoid_tracepoint_raw(void) { return sysctl_perf_event_paranoid > -1; diff --git a/kernel/perf_event.c b/kernel/perf_event.c index 65dcdc76d709..e03be08d0ddf 100644 --- a/kernel/perf_event.c +++ b/kernel/perf_event.c @@ -150,7 +150,24 @@ int sysctl_perf_event_mlock __read_mostly = 512; /* 'free' kb per user */ /* * max perf event sample rate */ -int sysctl_perf_event_sample_rate __read_mostly = 100000; +#define DEFAULT_MAX_SAMPLE_RATE 100000 +int sysctl_perf_event_sample_rate __read_mostly = DEFAULT_MAX_SAMPLE_RATE; +static int max_samples_per_tick __read_mostly = + DIV_ROUND_UP(DEFAULT_MAX_SAMPLE_RATE, HZ); + +int perf_proc_update_handler(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, + loff_t *ppos) +{ + int ret = proc_dointvec(table, write, buffer, lenp, ppos); + + if (ret || !write) + return ret; + + max_samples_per_tick = DIV_ROUND_UP(sysctl_perf_event_sample_rate, HZ); + + return 0; +} static atomic64_t perf_event_id; @@ -4941,26 +4958,14 @@ static int __perf_event_overflow(struct perf_event *event, int nmi, if (unlikely(!is_sampling_event(event))) return 0; - if (!throttle) { - hwc->interrupts++; - } else { - if (hwc->interrupts != MAX_INTERRUPTS) { - hwc->interrupts++; - if (HZ * hwc->interrupts > - (u64)sysctl_perf_event_sample_rate) { - hwc->interrupts = MAX_INTERRUPTS; - perf_log_throttle(event, 0); - ret = 1; - } - } else { - /* - * Keep re-disabling events even though on the previous - * pass we disabled it - just in case we raced with a - * sched-in and the event got enabled again: - */ + if (unlikely(hwc->interrupts >= max_samples_per_tick)) { + if (throttle) { + hwc->interrupts = MAX_INTERRUPTS; + perf_log_throttle(event, 0); ret = 1; } - } + } else + hwc->interrupts++; if (event->attr.freq) { u64 now = perf_clock(); diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 0f1bd83db985..daef911cbadb 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -948,7 +948,7 @@ static struct ctl_table kern_table[] = { .data = &sysctl_perf_event_sample_rate, .maxlen = sizeof(sysctl_perf_event_sample_rate), .mode = 0644, - .proc_handler = proc_dointvec, + .proc_handler = perf_proc_update_handler, }, #endif #ifdef CONFIG_KMEMCHECK From ba3dd36c6775264ee6e7354ba1aabcd6e86d7298 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 15 Feb 2011 12:41:46 +0100 Subject: [PATCH 267/706] perf: Optimize hrtimer events There is no need to re-initialize the hrtimer every time we start it, so don't do that (shaves a few cycles). Also, since we know hrtimers run at a fixed rate (nanoseconds) we can pre-compute the desired frequency at which they tick. This avoids us having to go through the whole adaptive frequency feedback logic (shaves another few cycles). 
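The static mapping is simply period = NSEC_PER_SEC / freq; a requested sample_freq of 4000 Hz, for example, becomes a fixed period of 250000 ns. A minimal user-space sketch with an assumed frequency value:

#include <stdio.h>

#define NSEC_PER_SEC 1000000000L

int main(void)
{
	long freq = 4000;			/* assumed attr.sample_freq, in Hz */
	long period = NSEC_PER_SEC / freq;	/* fixed period, no feedback loop */

	printf("%ld Hz -> %ld ns per sample\n", freq, period);
	return 0;
}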
Signed-off-by: Peter Zijlstra
LKML-Reference: <1297448589.5226.47.camel@laptop>
Signed-off-by: Ingo Molnar
---
 kernel/perf_event.c | 35 ++++++++++++++++++++++++++++++++---
 1 file changed, 32 insertions(+), 3 deletions(-)

diff --git a/kernel/perf_event.c b/kernel/perf_event.c
index e03be08d0ddf..a0a6987fabc4 100644
--- a/kernel/perf_event.c
+++ b/kernel/perf_event.c
@@ -5602,6 +5602,10 @@ static enum hrtimer_restart perf_swevent_hrtimer(struct hrtimer *hrtimer)
 	u64 period;
 
 	event = container_of(hrtimer, struct perf_event, hw.hrtimer);
+
+	if (event->state != PERF_EVENT_STATE_ACTIVE)
+		return HRTIMER_NORESTART;
+
 	event->pmu->read(event);
 
 	perf_sample_data_init(&data, 0);
@@ -5628,9 +5632,6 @@ static void perf_swevent_start_hrtimer(struct perf_event *event)
 	if (!is_sampling_event(event))
 		return;
 
-	hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
-	hwc->hrtimer.function = perf_swevent_hrtimer;
-
 	period = local64_read(&hwc->period_left);
 	if (period) {
 		if (period < 0)
@@ -5657,6 +5658,30 @@ static void perf_swevent_cancel_hrtimer(struct perf_event *event)
 	}
 }
 
+static void perf_swevent_init_hrtimer(struct perf_event *event)
+{
+	struct hw_perf_event *hwc = &event->hw;
+
+	if (!is_sampling_event(event))
+		return;
+
+	hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
+	hwc->hrtimer.function = perf_swevent_hrtimer;
+
+	/*
+	 * Since hrtimers have a fixed rate, we can do a static freq->period
+	 * mapping and avoid the whole period adjust feedback stuff.
+	 */
+	if (event->attr.freq) {
+		long freq = event->attr.sample_freq;
+
+		event->attr.sample_period = NSEC_PER_SEC / freq;
+		hwc->sample_period = event->attr.sample_period;
+		local64_set(&hwc->period_left, hwc->sample_period);
+		event->attr.freq = 0;
+	}
+}
+
 /*
  * Software event: cpu wall time clock
  */
@@ -5709,6 +5734,8 @@ static int cpu_clock_event_init(struct perf_event *event)
 	if (event->attr.config != PERF_COUNT_SW_CPU_CLOCK)
 		return -ENOENT;
 
+	perf_swevent_init_hrtimer(event);
+
 	return 0;
 }
 
@@ -5787,6 +5814,8 @@ static int task_clock_event_init(struct perf_event *event)
 	if (event->attr.config != PERF_COUNT_SW_TASK_CLOCK)
 		return -ENOENT;
 
+	perf_swevent_init_hrtimer(event);
+
 	return 0;
 }

From 46e49b3836c7cd2ae5b5fe76fa981d0d292a52fe Mon Sep 17 00:00:00 2001
From: Venkatesh Pallipadi
Date: Mon, 14 Feb 2011 14:38:50 -0800
Subject: [PATCH 268/706] sched: Wholesale removal of sd_idle logic

sd_idle logic was introduced way back in 2005 (commit 5969fe06), as an
HT optimization.

As per the discussion in the thread here:

  lkml - sched: Resolve sd_idle and first_idle_cpu Catch-22 - v1
  https://patchwork.kernel.org/patch/532501/

The capacity based logic in the load balancer right now handles this in
a much cleaner way, handling more than 2 SMT siblings etc, and sd_idle
does not seem to bring any additional benefits. The sd_idle logic also
has some bugs that have a performance impact. This patch removes the
sd_idle logic altogether; it also removes the dependency of the
sched_mc_power_savings == 2 logic on sd_idle.
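To see why a single per-domain flag is coarser than capacity
accounting, consider this standalone sketch (illustrative types and
names only, nothing from the scheduler code): with four SMT siblings of
which one is busy, the sd_idle-style answer collapses to "not idle",
while per-CPU accounting still sees three siblings that could pull load.

#include <stdio.h>

struct cpu { int nr_running; };

/* sd_idle-style: one bit that flips as soon as any sibling is busy */
static int domain_is_idle(const struct cpu *cpus, int n)
{
	int i;

	for (i = 0; i < n; i++)
		if (cpus[i].nr_running)
			return 0;
	return 1;
}

/* capacity-style: count how many siblings are actually busy */
static int domain_busy_count(const struct cpu *cpus, int n)
{
	int i, busy = 0;

	for (i = 0; i < n; i++)
		busy += cpus[i].nr_running > 0;
	return busy;
}

int main(void)
{
	struct cpu smt4[4] = { {1}, {0}, {0}, {0} };

	printf("idle=%d\n", domain_is_idle(smt4, 4));
	printf("busy=%d of 4\n", domain_busy_count(smt4, 4));
	return 0;
}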
Signed-off-by: Venkatesh Pallipadi Acked-by: Vaidyanathan Srinivasan Signed-off-by: Peter Zijlstra LKML-Reference: <1297723130-693-1-git-send-email-venki@google.com> Signed-off-by: Ingo Molnar --- kernel/sched_fair.c | 53 ++++++++++----------------------------------- 1 file changed, 11 insertions(+), 42 deletions(-) diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 027024694043..d384e739ea95 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -2672,7 +2672,6 @@ fix_small_capacity(struct sched_domain *sd, struct sched_group *group) * @this_cpu: Cpu for which load balance is currently performed. * @idle: Idle status of this_cpu * @load_idx: Load index of sched_domain of this_cpu for load calc. - * @sd_idle: Idle status of the sched_domain containing group. * @local_group: Does group contain this_cpu. * @cpus: Set of cpus considered for load balancing. * @balance: Should we balance. @@ -2680,7 +2679,7 @@ fix_small_capacity(struct sched_domain *sd, struct sched_group *group) */ static inline void update_sg_lb_stats(struct sched_domain *sd, struct sched_group *group, int this_cpu, - enum cpu_idle_type idle, int load_idx, int *sd_idle, + enum cpu_idle_type idle, int load_idx, int local_group, const struct cpumask *cpus, int *balance, struct sg_lb_stats *sgs) { @@ -2700,9 +2699,6 @@ static inline void update_sg_lb_stats(struct sched_domain *sd, for_each_cpu_and(i, sched_group_cpus(group), cpus) { struct rq *rq = cpu_rq(i); - if (*sd_idle && rq->nr_running) - *sd_idle = 0; - /* Bias balancing toward cpus of our domain */ if (local_group) { if (idle_cpu(i) && !first_idle_cpu) { @@ -2817,15 +2813,13 @@ static bool update_sd_pick_busiest(struct sched_domain *sd, * @sd: sched_domain whose statistics are to be updated. * @this_cpu: Cpu for which load balance is currently performed. * @idle: Idle status of this_cpu - * @sd_idle: Idle status of the sched_domain containing sg. * @cpus: Set of cpus considered for load balancing. * @balance: Should we balance. * @sds: variable to hold the statistics for this sched_domain. */ static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu, - enum cpu_idle_type idle, int *sd_idle, - const struct cpumask *cpus, int *balance, - struct sd_lb_stats *sds) + enum cpu_idle_type idle, const struct cpumask *cpus, + int *balance, struct sd_lb_stats *sds) { struct sched_domain *child = sd->child; struct sched_group *sg = sd->groups; @@ -2843,7 +2837,7 @@ static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu, local_group = cpumask_test_cpu(this_cpu, sched_group_cpus(sg)); memset(&sgs, 0, sizeof(sgs)); - update_sg_lb_stats(sd, sg, this_cpu, idle, load_idx, sd_idle, + update_sg_lb_stats(sd, sg, this_cpu, idle, load_idx, local_group, cpus, balance, &sgs); if (local_group && !(*balance)) @@ -3095,7 +3089,6 @@ static inline void calculate_imbalance(struct sd_lb_stats *sds, int this_cpu, * @imbalance: Variable which stores amount of weighted load which should * be moved to restore balance/put a group to idle. * @idle: The idle status of this_cpu. - * @sd_idle: The idleness of sd * @cpus: The set of CPUs under consideration for load-balancing. * @balance: Pointer to a variable indicating if this_cpu * is the appropriate cpu to perform load balancing at this_level. 
@@ -3108,7 +3101,7 @@ static inline void calculate_imbalance(struct sd_lb_stats *sds, int this_cpu, static struct sched_group * find_busiest_group(struct sched_domain *sd, int this_cpu, unsigned long *imbalance, enum cpu_idle_type idle, - int *sd_idle, const struct cpumask *cpus, int *balance) + const struct cpumask *cpus, int *balance) { struct sd_lb_stats sds; @@ -3118,8 +3111,7 @@ find_busiest_group(struct sched_domain *sd, int this_cpu, * Compute the various statistics relavent for load balancing at * this level. */ - update_sd_lb_stats(sd, this_cpu, idle, sd_idle, cpus, - balance, &sds); + update_sd_lb_stats(sd, this_cpu, idle, cpus, balance, &sds); /* Cases where imbalance does not exist from POV of this_cpu */ /* 1) this_cpu is not the appropriate cpu to perform load balancing @@ -3255,7 +3247,7 @@ find_busiest_queue(struct sched_domain *sd, struct sched_group *group, /* Working cpumask for load_balance and load_balance_newidle. */ static DEFINE_PER_CPU(cpumask_var_t, load_balance_tmpmask); -static int need_active_balance(struct sched_domain *sd, int sd_idle, int idle, +static int need_active_balance(struct sched_domain *sd, int idle, int busiest_cpu, int this_cpu) { if (idle == CPU_NEWLY_IDLE) { @@ -3287,10 +3279,6 @@ static int need_active_balance(struct sched_domain *sd, int sd_idle, int idle, * move_tasks() will succeed. ld_moved will be true and this * active balance code will not be triggered. */ - if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER && - !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE)) - return 0; - if (sched_mc_power_savings < POWERSAVINGS_BALANCE_WAKEUP) return 0; } @@ -3308,7 +3296,7 @@ static int load_balance(int this_cpu, struct rq *this_rq, struct sched_domain *sd, enum cpu_idle_type idle, int *balance) { - int ld_moved, all_pinned = 0, active_balance = 0, sd_idle = 0; + int ld_moved, all_pinned = 0, active_balance = 0; struct sched_group *group; unsigned long imbalance; struct rq *busiest; @@ -3317,20 +3305,10 @@ static int load_balance(int this_cpu, struct rq *this_rq, cpumask_copy(cpus, cpu_active_mask); - /* - * When power savings policy is enabled for the parent domain, idle - * sibling can pick up load irrespective of busy siblings. In this case, - * let the state of idle sibling percolate up as CPU_IDLE, instead of - * portraying it as CPU_NOT_IDLE. 
-	 */
-	if (idle != CPU_NOT_IDLE && sd->flags & SD_SHARE_CPUPOWER &&
-	    !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
-		sd_idle = 1;
-
 	schedstat_inc(sd, lb_count[idle]);
 
 redo:
-	group = find_busiest_group(sd, this_cpu, &imbalance, idle, &sd_idle,
+	group = find_busiest_group(sd, this_cpu, &imbalance, idle,
 				   cpus, balance);
 
 	if (*balance == 0)
@@ -3392,8 +3370,7 @@
 	if (idle != CPU_NEWLY_IDLE)
 		sd->nr_balance_failed++;
 
-	if (need_active_balance(sd, sd_idle, idle, cpu_of(busiest),
-				this_cpu)) {
+	if (need_active_balance(sd, idle, cpu_of(busiest), this_cpu)) {
 		raw_spin_lock_irqsave(&busiest->lock, flags);
 
 		/* don't kick the active_load_balance_cpu_stop,
@@ -3448,10 +3425,6 @@ redo:
 		sd->balance_interval *= 2;
 	}
 
-	if (!ld_moved && !sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
-	    !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
-		ld_moved = -1;
-
 	goto out;
 
 out_balanced:
@@ -3465,11 +3438,7 @@ out_one_pinned:
 	    (sd->balance_interval < sd->max_interval))
 		sd->balance_interval *= 2;
 
-	if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
-	    !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
-		ld_moved = -1;
-	else
-		ld_moved = 0;
+	ld_moved = 0;
 out:
 	return ld_moved;
 }

From 48228f7b470a74b6469a250d2977a13128d8fe96 Mon Sep 17 00:00:00 2001
From: Steven Rostedt
Date: Tue, 8 Feb 2011 12:39:54 -0500
Subject: [PATCH 269/706] lockdep/timers: Explain in detail the locking
 problems del_timer_sync() may cause

Twice I had to explain the output about why lockdep gives an error with
locks in IRQ context and with del_timer_sync(). Might as well write it
up and place it in the comments above the code in del_timer_sync().
Perhaps the next time this lockdep dump triggers people will understand
the issues.

It is a tricky issue and very subtle; explaining it in detail in the
code may help others understand the issue when they stumble upon the
bug again.

Signed-off-by: Steven Rostedt
Signed-off-by: Peter Zijlstra
LKML-Reference: <1297186794.23343.19.camel@gandalf.stny.rr.com>
Signed-off-by: Ingo Molnar
---
 kernel/timer.c | 23 +++++++++++++++++++++++
 1 file changed, 23 insertions(+)

diff --git a/kernel/timer.c b/kernel/timer.c
index d6459923d245..5f40c2e0a94e 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -964,6 +964,25 @@ EXPORT_SYMBOL(try_to_del_timer_sync);
  * add_timer_on(). Upon exit the timer is not queued and the handler is
  * not running on any CPU.
  *
+ * Note: You must not hold locks that are held in interrupt context
+ * while calling this function. Even if the lock has nothing to do
+ * with the timer in question. Here's why:
+ *
+ *    CPU0                             CPU1
+ *    ----                             ----
+ *                                   <SOFTIRQ>
+ *                                     call_timer_fn();
+ *                                       base->running_timer = mytimer;
+ *  spin_lock_irq(somelock);
+ *                                   <IRQ>
+ *                                     spin_lock(somelock);
+ *  del_timer_sync(mytimer);
+ *   while (base->running_timer == mytimer);
+ *
+ * Now del_timer_sync() will never return and never release somelock.
+ * The interrupt on the other CPU is waiting to grab somelock but
+ * it has interrupted the softirq that CPU0 is waiting to finish.
+ *
  * The function returns whether it has deactivated a pending timer or not.
  */
 int del_timer_sync(struct timer_list *timer)
@@ -971,6 +990,10 @@ int del_timer_sync(struct timer_list *timer)
 #ifdef CONFIG_LOCKDEP
 	unsigned long flags;
 
+	/*
+	 * If lockdep gives a backtrace here, please reference
+	 * the synchronization rules above.
+ */ local_irq_save(flags); lock_map_acquire(&timer->lockdep_map); lock_map_release(&timer->lockdep_map); From ef396ec96c1a8ffd2b0bc67f1f79c7274de02b95 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 16 Feb 2011 17:11:07 +0100 Subject: [PATCH 270/706] x86-64, NUMA: Factor out memblk handling into numa_{add|register}_memblk() Factor out memblk handling from srat_64.c into two functions in numa_64.c. This patch doesn't introduce any behavior change. The next patch will make all init methods use these functions. - v2: Fixed build failure on 32bit due to misplaced NR_NODE_MEMBLKS. Reported by Ingo. Signed-off-by: Tejun Heo Cc: Yinghai Lu Cc: Brian Gerst Cc: Cyrill Gorcunov Cc: Shaohui Zheng Cc: David Rientjes Cc: Ingo Molnar Cc: H. Peter Anvin --- arch/x86/include/asm/acpi.h | 1 - arch/x86/include/asm/numa.h | 3 + arch/x86/include/asm/numa_64.h | 2 + arch/x86/mm/numa_64.c | 109 +++++++++++++++++++++++++++++++++ arch/x86/mm/srat_64.c | 96 +---------------------------- 5 files changed, 117 insertions(+), 94 deletions(-) diff --git a/arch/x86/include/asm/acpi.h b/arch/x86/include/asm/acpi.h index 446a5b9a1d5f..12bd1fdd206b 100644 --- a/arch/x86/include/asm/acpi.h +++ b/arch/x86/include/asm/acpi.h @@ -187,7 +187,6 @@ struct bootnode; extern int acpi_numa; extern int x86_acpi_numa_init(void); extern int acpi_scan_nodes(void); -#define NR_NODE_MEMBLKS (MAX_NUMNODES*2) #ifdef CONFIG_NUMA_EMU extern void acpi_fake_nodes(const struct bootnode *fake_nodes, diff --git a/arch/x86/include/asm/numa.h b/arch/x86/include/asm/numa.h index 26fc6e2dd0fb..3d4dab43c994 100644 --- a/arch/x86/include/asm/numa.h +++ b/arch/x86/include/asm/numa.h @@ -5,6 +5,9 @@ #include #ifdef CONFIG_NUMA + +#define NR_NODE_MEMBLKS (MAX_NUMNODES*2) + /* * __apicid_to_node[] stores the raw mapping between physical apicid and * node and is used to initialize cpu_to_node mapping. 
diff --git a/arch/x86/include/asm/numa_64.h b/arch/x86/include/asm/numa_64.h index d3a45147d09b..3306a2b99ece 100644 --- a/arch/x86/include/asm/numa_64.h +++ b/arch/x86/include/asm/numa_64.h @@ -32,6 +32,8 @@ extern nodemask_t mem_nodes_parsed __initdata; extern struct bootnode numa_nodes[MAX_NUMNODES] __initdata; extern int __cpuinit numa_cpu_node(int cpu); +extern int __init numa_add_memblk(int nodeid, u64 start, u64 end); +extern int __init numa_register_memblks(void); #ifdef CONFIG_NUMA_EMU #define FAKE_NODE_MIN_SIZE ((u64)32 << 20) diff --git a/arch/x86/mm/numa_64.c b/arch/x86/mm/numa_64.c index 82ee3083b094..a1d702d2584c 100644 --- a/arch/x86/mm/numa_64.c +++ b/arch/x86/mm/numa_64.c @@ -33,6 +33,10 @@ struct memnode memnode; static unsigned long __initdata nodemap_addr; static unsigned long __initdata nodemap_size; +static int num_node_memblks __initdata; +static struct bootnode node_memblk_range[NR_NODE_MEMBLKS] __initdata; +static int memblk_nodeid[NR_NODE_MEMBLKS] __initdata; + struct bootnode numa_nodes[MAX_NUMNODES] __initdata; /* @@ -184,6 +188,43 @@ static void * __init early_node_mem(int nodeid, unsigned long start, return NULL; } +static __init int conflicting_memblks(unsigned long start, unsigned long end) +{ + int i; + for (i = 0; i < num_node_memblks; i++) { + struct bootnode *nd = &node_memblk_range[i]; + if (nd->start == nd->end) + continue; + if (nd->end > start && nd->start < end) + return memblk_nodeid[i]; + if (nd->end == end && nd->start == start) + return memblk_nodeid[i]; + } + return -1; +} + +int __init numa_add_memblk(int nid, u64 start, u64 end) +{ + int i; + + i = conflicting_memblks(start, end); + if (i == nid) { + printk(KERN_WARNING "NUMA: Warning: node %d (%Lx-%Lx) overlaps with itself (%Lx-%Lx)\n", + nid, start, end, numa_nodes[i].start, numa_nodes[i].end); + } else if (i >= 0) { + printk(KERN_ERR "NUMA: node %d (%Lx-%Lx) overlaps with node %d (%Lx-%Lx)\n", + nid, start, end, i, + numa_nodes[i].start, numa_nodes[i].end); + return -EINVAL; + } + + node_memblk_range[num_node_memblks].start = start; + node_memblk_range[num_node_memblks].end = end; + memblk_nodeid[num_node_memblks] = nid; + num_node_memblks++; + return 0; +} + static __init void cutoff_node(int i, unsigned long start, unsigned long end) { struct bootnode *nd = &numa_nodes[i]; @@ -246,6 +287,71 @@ setup_node_bootmem(int nodeid, unsigned long start, unsigned long end) node_set_online(nodeid); } +int __init numa_register_memblks(void) +{ + int i; + + /* + * Join together blocks on the same node, holes between + * which don't overlap with memory on other nodes. 
+ */ + for (i = 0; i < num_node_memblks; ++i) { + int j, k; + + for (j = i + 1; j < num_node_memblks; ++j) { + unsigned long start, end; + + if (memblk_nodeid[i] != memblk_nodeid[j]) + continue; + start = min(node_memblk_range[i].end, + node_memblk_range[j].end); + end = max(node_memblk_range[i].start, + node_memblk_range[j].start); + for (k = 0; k < num_node_memblks; ++k) { + if (memblk_nodeid[i] == memblk_nodeid[k]) + continue; + if (start < node_memblk_range[k].end && + end > node_memblk_range[k].start) + break; + } + if (k < num_node_memblks) + continue; + start = min(node_memblk_range[i].start, + node_memblk_range[j].start); + end = max(node_memblk_range[i].end, + node_memblk_range[j].end); + printk(KERN_INFO "NUMA: Node %d [%Lx,%Lx) + [%Lx,%Lx) -> [%lx,%lx)\n", + memblk_nodeid[i], + node_memblk_range[i].start, + node_memblk_range[i].end, + node_memblk_range[j].start, + node_memblk_range[j].end, + start, end); + node_memblk_range[i].start = start; + node_memblk_range[i].end = end; + k = --num_node_memblks - j; + memmove(memblk_nodeid + j, memblk_nodeid + j+1, + k * sizeof(*memblk_nodeid)); + memmove(node_memblk_range + j, node_memblk_range + j+1, + k * sizeof(*node_memblk_range)); + --j; + } + } + + memnode_shift = compute_hash_shift(node_memblk_range, num_node_memblks, + memblk_nodeid); + if (memnode_shift < 0) { + printk(KERN_ERR "NUMA: No NUMA node hash function found. Contact maintainer\n"); + return -EINVAL; + } + + for (i = 0; i < num_node_memblks; i++) + memblock_x86_register_active_regions(memblk_nodeid[i], + node_memblk_range[i].start >> PAGE_SHIFT, + node_memblk_range[i].end >> PAGE_SHIFT); + return 0; +} + #ifdef CONFIG_NUMA_EMU /* Numa emulation */ static struct bootnode nodes[MAX_NUMNODES] __initdata; @@ -653,6 +759,9 @@ void __init initmem_init(void) nodes_clear(mem_nodes_parsed); nodes_clear(node_possible_map); nodes_clear(node_online_map); + num_node_memblks = 0; + memset(node_memblk_range, 0, sizeof(node_memblk_range)); + memset(memblk_nodeid, 0, sizeof(memblk_nodeid)); memset(numa_nodes, 0, sizeof(numa_nodes)); if (numa_init[i]() < 0) diff --git a/arch/x86/mm/srat_64.c b/arch/x86/mm/srat_64.c index 82b1087963a2..341b37193c76 100644 --- a/arch/x86/mm/srat_64.c +++ b/arch/x86/mm/srat_64.c @@ -30,30 +30,11 @@ static struct acpi_table_slit *acpi_slit; static struct bootnode nodes_add[MAX_NUMNODES]; -static int num_node_memblks __initdata; -static struct bootnode node_memblk_range[NR_NODE_MEMBLKS] __initdata; -static int memblk_nodeid[NR_NODE_MEMBLKS] __initdata; - static __init int setup_node(int pxm) { return acpi_map_pxm_to_node(pxm); } -static __init int conflicting_memblks(unsigned long start, unsigned long end) -{ - int i; - for (i = 0; i < num_node_memblks; i++) { - struct bootnode *nd = &node_memblk_range[i]; - if (nd->start == nd->end) - continue; - if (nd->end > start && nd->start < end) - return memblk_nodeid[i]; - if (nd->end == end && nd->start == start) - return memblk_nodeid[i]; - } - return -1; -} - static __init void bad_srat(void) { int i; @@ -233,7 +214,6 @@ acpi_numa_memory_affinity_init(struct acpi_srat_mem_affinity *ma) struct bootnode *nd; unsigned long start, end; int node, pxm; - int i; if (srat_disabled()) return; @@ -255,16 +235,8 @@ acpi_numa_memory_affinity_init(struct acpi_srat_mem_affinity *ma) bad_srat(); return; } - i = conflicting_memblks(start, end); - if (i == node) { - printk(KERN_WARNING - "SRAT: Warning: PXM %d (%lx-%lx) overlaps with itself (%Lx-%Lx)\n", - pxm, start, end, numa_nodes[i].start, numa_nodes[i].end); - } else if (i >= 
0) { - printk(KERN_ERR - "SRAT: PXM %d (%lx-%lx) overlaps with PXM %d (%Lx-%Lx)\n", - pxm, start, end, node_to_pxm(i), - numa_nodes[i].start, numa_nodes[i].end); + + if (numa_add_memblk(node, start, end) < 0) { bad_srat(); return; } @@ -285,11 +257,6 @@ acpi_numa_memory_affinity_init(struct acpi_srat_mem_affinity *ma) } } else update_nodes_add(node, start, end); - - node_memblk_range[num_node_memblks].start = start; - node_memblk_range[num_node_memblks].end = end; - memblk_nodeid[num_node_memblks] = node; - num_node_memblks++; } /* Sanity check to catch more bad SRATs (they are amazingly common). @@ -341,68 +308,11 @@ int __init acpi_scan_nodes(void) if (acpi_numa <= 0) return -1; - /* - * Join together blocks on the same node, holes between - * which don't overlap with memory on other nodes. - */ - for (i = 0; i < num_node_memblks; ++i) { - int j, k; - - for (j = i + 1; j < num_node_memblks; ++j) { - unsigned long start, end; - - if (memblk_nodeid[i] != memblk_nodeid[j]) - continue; - start = min(node_memblk_range[i].end, - node_memblk_range[j].end); - end = max(node_memblk_range[i].start, - node_memblk_range[j].start); - for (k = 0; k < num_node_memblks; ++k) { - if (memblk_nodeid[i] == memblk_nodeid[k]) - continue; - if (start < node_memblk_range[k].end && - end > node_memblk_range[k].start) - break; - } - if (k < num_node_memblks) - continue; - start = min(node_memblk_range[i].start, - node_memblk_range[j].start); - end = max(node_memblk_range[i].end, - node_memblk_range[j].end); - printk(KERN_INFO "SRAT: Node %d " - "[%Lx,%Lx) + [%Lx,%Lx) -> [%lx,%lx)\n", - memblk_nodeid[i], - node_memblk_range[i].start, - node_memblk_range[i].end, - node_memblk_range[j].start, - node_memblk_range[j].end, - start, end); - node_memblk_range[i].start = start; - node_memblk_range[i].end = end; - k = --num_node_memblks - j; - memmove(memblk_nodeid + j, memblk_nodeid + j+1, - k * sizeof(*memblk_nodeid)); - memmove(node_memblk_range + j, node_memblk_range + j+1, - k * sizeof(*node_memblk_range)); - --j; - } - } - - memnode_shift = compute_hash_shift(node_memblk_range, num_node_memblks, - memblk_nodeid); - if (memnode_shift < 0) { - printk(KERN_ERR - "SRAT: No NUMA node hash function found. Contact maintainer\n"); + if (numa_register_memblks() < 0) { bad_srat(); return -1; } - for (i = 0; i < num_node_memblks; i++) - memblock_x86_register_active_regions(memblk_nodeid[i], - node_memblk_range[i].start >> PAGE_SHIFT, - node_memblk_range[i].end >> PAGE_SHIFT); - /* for out of order entries in SRAT */ sort_node_map(); if (!nodes_cover_memory(numa_nodes)) { From 43a662f04f731c331706456c9852ef7146ba5d85 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 16 Feb 2011 17:11:08 +0100 Subject: [PATCH 271/706] x86-64, NUMA: Unify use of memblk in all init methods Make both amd and dummy use numa_add_memblk() to describe the detected memory blocks. This allows initmem_init() to call numa_register_memblk() regardless of init method in use. Drop custom memory registration codes from amd and dummy. After this change, memblk merge/cleanup in numa_register_memblks() is applied to all init methods. As this makes compute_hash_shift() and numa_register_memblks() used only inside numa_64.c, make them static. Signed-off-by: Tejun Heo Cc: Yinghai Lu Cc: Brian Gerst Cc: Cyrill Gorcunov Cc: Shaohui Zheng Cc: David Rientjes Cc: Ingo Molnar Cc: H. 
Peter Anvin --- arch/x86/include/asm/numa_64.h | 4 ---- arch/x86/mm/amdtopology_64.c | 13 +------------ arch/x86/mm/numa_64.c | 15 +++++++-------- arch/x86/mm/srat_64.c | 5 ----- 4 files changed, 8 insertions(+), 29 deletions(-) diff --git a/arch/x86/include/asm/numa_64.h b/arch/x86/include/asm/numa_64.h index 3306a2b99ece..e925605150a0 100644 --- a/arch/x86/include/asm/numa_64.h +++ b/arch/x86/include/asm/numa_64.h @@ -8,9 +8,6 @@ struct bootnode { u64 end; }; -extern int compute_hash_shift(struct bootnode *nodes, int numblks, - int *nodeids); - #define ZONE_ALIGN (1UL << (MAX_ORDER+PAGE_SHIFT)) extern int numa_off; @@ -33,7 +30,6 @@ extern struct bootnode numa_nodes[MAX_NUMNODES] __initdata; extern int __cpuinit numa_cpu_node(int cpu); extern int __init numa_add_memblk(int nodeid, u64 start, u64 end); -extern int __init numa_register_memblks(void); #ifdef CONFIG_NUMA_EMU #define FAKE_NODE_MIN_SIZE ((u64)32 << 20) diff --git a/arch/x86/mm/amdtopology_64.c b/arch/x86/mm/amdtopology_64.c index cf29527885f8..d6d7aa4b98c6 100644 --- a/arch/x86/mm/amdtopology_64.c +++ b/arch/x86/mm/amdtopology_64.c @@ -167,6 +167,7 @@ int __init amd_numa_init(void) numa_nodes[nodeid].start = base; numa_nodes[nodeid].end = limit; + numa_add_memblk(nodeid, base, limit); prevbase = base; @@ -263,18 +264,6 @@ int __init amd_scan_nodes(void) { int i; - memnode_shift = compute_hash_shift(numa_nodes, 8, NULL); - if (memnode_shift < 0) { - pr_err("No NUMA node hash function found. Contact maintainer\n"); - return -1; - } - pr_info("Using node hash shift of %d\n", memnode_shift); - - /* use the coreid bits from early_identify_cpu */ - for_each_node_mask(i, node_possible_map) - memblock_x86_register_active_regions(i, - numa_nodes[i].start >> PAGE_SHIFT, - numa_nodes[i].end >> PAGE_SHIFT); init_memory_mapping_high(); for_each_node_mask(i, node_possible_map) setup_node_bootmem(i, numa_nodes[i].start, numa_nodes[i].end); diff --git a/arch/x86/mm/numa_64.c b/arch/x86/mm/numa_64.c index a1d702d2584c..552080e8472b 100644 --- a/arch/x86/mm/numa_64.c +++ b/arch/x86/mm/numa_64.c @@ -131,8 +131,8 @@ static int __init extract_lsb_from_nodes(const struct bootnode *nodes, return i; } -int __init compute_hash_shift(struct bootnode *nodes, int numnodes, - int *nodeids) +static int __init compute_hash_shift(struct bootnode *nodes, int numnodes, + int *nodeids) { int shift; @@ -287,7 +287,7 @@ setup_node_bootmem(int nodeid, unsigned long start, unsigned long end) node_set_online(nodeid); } -int __init numa_register_memblks(void) +static int __init numa_register_memblks(void) { int i; @@ -713,17 +713,13 @@ static int dummy_numa_init(void) node_set(0, cpu_nodes_parsed); node_set(0, mem_nodes_parsed); + numa_add_memblk(0, 0, (u64)max_pfn << PAGE_SHIFT); return 0; } static int dummy_scan_nodes(void) { - /* setup dummy node covering all memory */ - memnode_shift = 63; - memnodemap = memnode.embedded_map; - memnodemap[0] = 0; - memblock_x86_register_active_regions(0, 0, max_pfn); init_memory_mapping_high(); setup_node_bootmem(0, 0, max_pfn << PAGE_SHIFT); numa_init_array(); @@ -784,6 +780,9 @@ void __init initmem_init(void) if (WARN_ON(nodes_empty(node_possible_map))) continue; + if (numa_register_memblks() < 0) + continue; + if (!scan_nodes[i]()) return; } diff --git a/arch/x86/mm/srat_64.c b/arch/x86/mm/srat_64.c index 341b37193c76..69f147116da7 100644 --- a/arch/x86/mm/srat_64.c +++ b/arch/x86/mm/srat_64.c @@ -308,11 +308,6 @@ int __init acpi_scan_nodes(void) if (acpi_numa <= 0) return -1; - if (numa_register_memblks() < 0) { - bad_srat(); 
- return -1; - } - /* for out of order entries in SRAT */ sort_node_map(); if (!nodes_cover_memory(numa_nodes)) { From fd0435d8fb1d4e5771f9ae3af71f2a77c1f4bd09 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 16 Feb 2011 17:11:08 +0100 Subject: [PATCH 272/706] x86-64, NUMA: Unify the rest of memblk registration Move the remaining memblk registration logic from acpi_scan_nodes() to numa_register_memblks() and initmem_init(). This applies nodes_cover_memory() sanity check, memory node sorting and node_online() checking, which were only applied to acpi, to all init methods. As all memblk registration is moved to common code, active range clearing is moved to initmem_init() too and removed from bad_srat(). Signed-off-by: Tejun Heo Cc: Yinghai Lu Cc: Brian Gerst Cc: Cyrill Gorcunov Cc: Shaohui Zheng Cc: David Rientjes Cc: Ingo Molnar Cc: H. Peter Anvin --- arch/x86/mm/amdtopology_64.c | 7 ---- arch/x86/mm/numa_64.c | 74 +++++++++++++++++++++++++++++++++--- arch/x86/mm/srat_64.c | 61 ----------------------------- 3 files changed, 68 insertions(+), 74 deletions(-) diff --git a/arch/x86/mm/amdtopology_64.c b/arch/x86/mm/amdtopology_64.c index d6d7aa4b98c6..9c9f46adf414 100644 --- a/arch/x86/mm/amdtopology_64.c +++ b/arch/x86/mm/amdtopology_64.c @@ -262,12 +262,5 @@ void __init amd_fake_nodes(const struct bootnode *nodes, int nr_nodes) int __init amd_scan_nodes(void) { - int i; - - init_memory_mapping_high(); - for_each_node_mask(i, node_possible_map) - setup_node_bootmem(i, numa_nodes[i].start, numa_nodes[i].end); - - numa_init_array(); return 0; } diff --git a/arch/x86/mm/numa_64.c b/arch/x86/mm/numa_64.c index 552080e8472b..748c6b5bff6d 100644 --- a/arch/x86/mm/numa_64.c +++ b/arch/x86/mm/numa_64.c @@ -287,6 +287,37 @@ setup_node_bootmem(int nodeid, unsigned long start, unsigned long end) node_set_online(nodeid); } +/* + * Sanity check to catch more bad NUMA configurations (they are amazingly + * common). Make sure the nodes cover all memory. + */ +static int __init nodes_cover_memory(const struct bootnode *nodes) +{ + unsigned long numaram, e820ram; + int i; + + numaram = 0; + for_each_node_mask(i, mem_nodes_parsed) { + unsigned long s = nodes[i].start >> PAGE_SHIFT; + unsigned long e = nodes[i].end >> PAGE_SHIFT; + numaram += e - s; + numaram -= __absent_pages_in_range(i, s, e); + if ((long)numaram < 0) + numaram = 0; + } + + e820ram = max_pfn - + (memblock_x86_hole_size(0, max_pfn<> PAGE_SHIFT); + /* We seem to lose 3 pages somewhere. Allow 1M of slack. */ + if ((long)(e820ram - numaram) >= (1<<(20 - PAGE_SHIFT))) { + printk(KERN_ERR "NUMA: nodes only cover %luMB of your %luMB e820 RAM. Not used.\n", + (numaram << PAGE_SHIFT) >> 20, + (e820ram << PAGE_SHIFT) >> 20); + return 0; + } + return 1; +} + static int __init numa_register_memblks(void) { int i; @@ -349,6 +380,27 @@ static int __init numa_register_memblks(void) memblock_x86_register_active_regions(memblk_nodeid[i], node_memblk_range[i].start >> PAGE_SHIFT, node_memblk_range[i].end >> PAGE_SHIFT); + + /* for out of order entries */ + sort_node_map(); + if (!nodes_cover_memory(numa_nodes)) + return -EINVAL; + + init_memory_mapping_high(); + + /* Finally register nodes. */ + for_each_node_mask(i, node_possible_map) + setup_node_bootmem(i, numa_nodes[i].start, numa_nodes[i].end); + + /* + * Try again in case setup_node_bootmem missed one due to missing + * bootmem. 
+ */ + for_each_node_mask(i, node_possible_map) + if (!node_online(i)) + setup_node_bootmem(i, numa_nodes[i].start, + numa_nodes[i].end); + return 0; } @@ -714,16 +766,14 @@ static int dummy_numa_init(void) node_set(0, cpu_nodes_parsed); node_set(0, mem_nodes_parsed); numa_add_memblk(0, 0, (u64)max_pfn << PAGE_SHIFT); + numa_nodes[0].start = 0; + numa_nodes[0].end = (u64)max_pfn << PAGE_SHIFT; return 0; } static int dummy_scan_nodes(void) { - init_memory_mapping_high(); - setup_node_bootmem(0, 0, max_pfn << PAGE_SHIFT); - numa_init_array(); - return 0; } @@ -759,6 +809,7 @@ void __init initmem_init(void) memset(node_memblk_range, 0, sizeof(node_memblk_range)); memset(memblk_nodeid, 0, sizeof(memblk_nodeid)); memset(numa_nodes, 0, sizeof(numa_nodes)); + remove_all_active_ranges(); if (numa_init[i]() < 0) continue; @@ -783,8 +834,19 @@ void __init initmem_init(void) if (numa_register_memblks() < 0) continue; - if (!scan_nodes[i]()) - return; + if (scan_nodes[i]() < 0) + continue; + + for (j = 0; j < nr_cpu_ids; j++) { + int nid = early_cpu_to_node(j); + + if (nid == NUMA_NO_NODE) + continue; + if (!node_online(nid)) + numa_clear_node(j); + } + numa_init_array(); + return; } BUG(); } diff --git a/arch/x86/mm/srat_64.c b/arch/x86/mm/srat_64.c index 69f147116da7..4a2c33b0a48c 100644 --- a/arch/x86/mm/srat_64.c +++ b/arch/x86/mm/srat_64.c @@ -44,7 +44,6 @@ static __init void bad_srat(void) numa_nodes[i].start = numa_nodes[i].end = 0; nodes_add[i].start = nodes_add[i].end = 0; } - remove_all_active_ranges(); } static __init inline int srat_disabled(void) @@ -259,35 +258,6 @@ acpi_numa_memory_affinity_init(struct acpi_srat_mem_affinity *ma) update_nodes_add(node, start, end); } -/* Sanity check to catch more bad SRATs (they are amazingly common). - Make sure the PXMs cover all memory. */ -static int __init nodes_cover_memory(const struct bootnode *nodes) -{ - int i; - unsigned long pxmram, e820ram; - - pxmram = 0; - for_each_node_mask(i, mem_nodes_parsed) { - unsigned long s = nodes[i].start >> PAGE_SHIFT; - unsigned long e = nodes[i].end >> PAGE_SHIFT; - pxmram += e - s; - pxmram -= __absent_pages_in_range(i, s, e); - if ((long)pxmram < 0) - pxmram = 0; - } - - e820ram = max_pfn - (memblock_x86_hole_size(0, max_pfn<>PAGE_SHIFT); - /* We seem to lose 3 pages somewhere. Allow 1M of slack. */ - if ((long)(e820ram - pxmram) >= (1<<(20 - PAGE_SHIFT))) { - printk(KERN_ERR - "SRAT: PXMs only cover %luMB of your %luMB e820 RAM. Not used.\n", - (pxmram << PAGE_SHIFT) >> 20, - (e820ram << PAGE_SHIFT) >> 20); - return 0; - } - return 1; -} - void __init acpi_numa_arch_fixup(void) {} int __init x86_acpi_numa_init(void) @@ -303,39 +273,8 @@ int __init x86_acpi_numa_init(void) /* Use the information discovered above to actually set up the nodes. 
*/ int __init acpi_scan_nodes(void) { - int i; - if (acpi_numa <= 0) return -1; - - /* for out of order entries in SRAT */ - sort_node_map(); - if (!nodes_cover_memory(numa_nodes)) { - bad_srat(); - return -1; - } - - init_memory_mapping_high(); - - /* Finally register nodes */ - for_each_node_mask(i, node_possible_map) - setup_node_bootmem(i, numa_nodes[i].start, numa_nodes[i].end); - /* Try again in case setup_node_bootmem missed one due - to missing bootmem */ - for_each_node_mask(i, node_possible_map) - if (!node_online(i)) - setup_node_bootmem(i, numa_nodes[i].start, - numa_nodes[i].end); - - for (i = 0; i < nr_cpu_ids; i++) { - int node = early_cpu_to_node(i); - - if (node == NUMA_NO_NODE) - continue; - if (!node_online(node)) - numa_clear_node(i); - } - numa_init_array(); return 0; } From 5d371b08fea80c4fb7450d31e5a4e35b438ef850 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 16 Feb 2011 17:11:08 +0100 Subject: [PATCH 273/706] x86-64, NUMA: Kill {acpi|amd|dummy}_scan_nodes() They are empty now. Kill them. Signed-off-by: Tejun Heo Cc: Yinghai Lu Cc: Brian Gerst Cc: Cyrill Gorcunov Cc: Shaohui Zheng Cc: David Rientjes Cc: Ingo Molnar Cc: H. Peter Anvin --- arch/x86/include/asm/acpi.h | 1 - arch/x86/include/asm/amd_nb.h | 1 - arch/x86/mm/amdtopology_64.c | 5 ----- arch/x86/mm/numa_64.c | 11 ----------- arch/x86/mm/srat_64.c | 8 -------- 5 files changed, 26 deletions(-) diff --git a/arch/x86/include/asm/acpi.h b/arch/x86/include/asm/acpi.h index 12bd1fdd206b..cfa3d5c3144a 100644 --- a/arch/x86/include/asm/acpi.h +++ b/arch/x86/include/asm/acpi.h @@ -186,7 +186,6 @@ struct bootnode; #ifdef CONFIG_ACPI_NUMA extern int acpi_numa; extern int x86_acpi_numa_init(void); -extern int acpi_scan_nodes(void); #ifdef CONFIG_NUMA_EMU extern void acpi_fake_nodes(const struct bootnode *fake_nodes, diff --git a/arch/x86/include/asm/amd_nb.h b/arch/x86/include/asm/amd_nb.h index 246cdc6ca9ba..384d1188e787 100644 --- a/arch/x86/include/asm/amd_nb.h +++ b/arch/x86/include/asm/amd_nb.h @@ -17,7 +17,6 @@ extern int early_is_amd_nb(u32 value); extern int amd_cache_northbridges(void); extern void amd_flush_garts(void); extern int amd_numa_init(void); -extern int amd_scan_nodes(void); extern int amd_get_subcaches(int); extern int amd_set_subcaches(int, int); diff --git a/arch/x86/mm/amdtopology_64.c b/arch/x86/mm/amdtopology_64.c index 9c9f46adf414..90cf297b3b59 100644 --- a/arch/x86/mm/amdtopology_64.c +++ b/arch/x86/mm/amdtopology_64.c @@ -259,8 +259,3 @@ void __init amd_fake_nodes(const struct bootnode *nodes, int nr_nodes) memcpy(__apicid_to_node, fake_apicid_to_node, sizeof(__apicid_to_node)); } #endif /* CONFIG_NUMA_EMU */ - -int __init amd_scan_nodes(void) -{ - return 0; -} diff --git a/arch/x86/mm/numa_64.c b/arch/x86/mm/numa_64.c index 748c6b5bff6d..e211c005f5e1 100644 --- a/arch/x86/mm/numa_64.c +++ b/arch/x86/mm/numa_64.c @@ -772,25 +772,17 @@ static int dummy_numa_init(void) return 0; } -static int dummy_scan_nodes(void) -{ - return 0; -} - void __init initmem_init(void) { int (*numa_init[])(void) = { [2] = dummy_numa_init }; - int (*scan_nodes[])(void) = { [2] = dummy_scan_nodes }; int i, j; if (!numa_off) { #ifdef CONFIG_ACPI_NUMA numa_init[0] = x86_acpi_numa_init; - scan_nodes[0] = acpi_scan_nodes; #endif #ifdef CONFIG_AMD_NUMA numa_init[1] = amd_numa_init; - scan_nodes[1] = amd_scan_nodes; #endif } @@ -834,9 +826,6 @@ void __init initmem_init(void) if (numa_register_memblks() < 0) continue; - if (scan_nodes[i]() < 0) - continue; - for (j = 0; j < nr_cpu_ids; j++) { int nid = 
early_cpu_to_node(j); diff --git a/arch/x86/mm/srat_64.c b/arch/x86/mm/srat_64.c index 4a2c33b0a48c..d56eff811cd8 100644 --- a/arch/x86/mm/srat_64.c +++ b/arch/x86/mm/srat_64.c @@ -270,14 +270,6 @@ int __init x86_acpi_numa_init(void) return srat_disabled() ? -EINVAL : 0; } -/* Use the information discovered above to actually set up the nodes. */ -int __init acpi_scan_nodes(void) -{ - if (acpi_numa <= 0) - return -1; - return 0; -} - #ifdef CONFIG_NUMA_EMU static int fake_node_to_pxm_map[MAX_NUMNODES] __initdata = { [0 ... MAX_NUMNODES-1] = PXM_INVAL From 8968dab8ad90ea16ef92f2406868354ea3ab6bb9 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 16 Feb 2011 17:11:08 +0100 Subject: [PATCH 274/706] x86-64, NUMA: Remove %NULL @nodeids handling from compute_hash_shift() numa_emulation() called compute_hash_shift() with %NULL @nodeids which meant identity mapping between index and nodeid. Make numa_emulation() build identity array and drop %NULL @nodeids handling from populate_memnodemap() and thus from compute_hash_shift(). This is to prepare for transition to using memblks instead. Signed-off-by: Tejun Heo Cc: Yinghai Lu Cc: Brian Gerst Cc: Cyrill Gorcunov Cc: Shaohui Zheng Cc: David Rientjes Cc: Ingo Molnar Cc: H. Peter Anvin --- arch/x86/mm/numa_64.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/arch/x86/mm/numa_64.c b/arch/x86/mm/numa_64.c index e211c005f5e1..243d18d4cfde 100644 --- a/arch/x86/mm/numa_64.c +++ b/arch/x86/mm/numa_64.c @@ -63,12 +63,7 @@ static int __init populate_memnodemap(const struct bootnode *nodes, do { if (memnodemap[addr >> shift] != NUMA_NO_NODE) return -1; - - if (!nodeids) - memnodemap[addr >> shift] = i; - else - memnodemap[addr >> shift] = nodeids[i]; - + memnodemap[addr >> shift] = nodeids[i]; addr += (1UL << shift); } while (addr < end); res = 1; @@ -706,6 +701,7 @@ static int __init split_nodes_size_interleave(u64 addr, u64 max_addr, u64 size) static int __init numa_emulation(unsigned long start_pfn, unsigned long last_pfn, int acpi, int amd) { + static int nodeid[NR_NODE_MEMBLKS] __initdata; u64 addr = start_pfn << PAGE_SHIFT; u64 max_addr = last_pfn << PAGE_SHIFT; int num_nodes; @@ -730,7 +726,11 @@ static int __init numa_emulation(unsigned long start_pfn, if (num_nodes < 0) return num_nodes; - memnode_shift = compute_hash_shift(nodes, num_nodes, NULL); + + for (i = 0; i < ARRAY_SIZE(nodeid); i++) + nodeid[i] = i; + + memnode_shift = compute_hash_shift(nodes, num_nodes, nodeid); if (memnode_shift < 0) { memnode_shift = 0; printk(KERN_ERR "No NUMA hash function found. NUMA emulation " From 97e7b78d0674882a0aae043fda428c583dbb225d Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 16 Feb 2011 17:11:08 +0100 Subject: [PATCH 275/706] x86-64, NUMA: Introduce struct numa_meminfo Arrays for memblks and nodeids and their length lived in separate variables making things unnecessarily cumbersome. Introduce struct numa_meminfo which contains all memory configuration info. This patch doesn't cause any behavior change. Signed-off-by: Tejun Heo Cc: Yinghai Lu Cc: Brian Gerst Cc: Cyrill Gorcunov Cc: Shaohui Zheng Cc: David Rientjes Cc: Ingo Molnar Cc: H. 
Peter Anvin --- arch/x86/mm/numa_64.c | 145 ++++++++++++++++++++++-------------------- 1 file changed, 75 insertions(+), 70 deletions(-) diff --git a/arch/x86/mm/numa_64.c b/arch/x86/mm/numa_64.c index 243d18d4cfde..c3496e2b5a71 100644 --- a/arch/x86/mm/numa_64.c +++ b/arch/x86/mm/numa_64.c @@ -22,6 +22,17 @@ #include #include +struct numa_memblk { + u64 start; + u64 end; + int nid; +}; + +struct numa_meminfo { + int nr_blks; + struct numa_memblk blk[NR_NODE_MEMBLKS]; +}; + struct pglist_data *node_data[MAX_NUMNODES] __read_mostly; EXPORT_SYMBOL(node_data); @@ -33,9 +44,7 @@ struct memnode memnode; static unsigned long __initdata nodemap_addr; static unsigned long __initdata nodemap_size; -static int num_node_memblks __initdata; -static struct bootnode node_memblk_range[NR_NODE_MEMBLKS] __initdata; -static int memblk_nodeid[NR_NODE_MEMBLKS] __initdata; +static struct numa_meminfo numa_meminfo __initdata; struct bootnode numa_nodes[MAX_NUMNODES] __initdata; @@ -46,16 +55,15 @@ struct bootnode numa_nodes[MAX_NUMNODES] __initdata; * 0 if memnodmap[] too small (of shift too small) * -1 if node overlap or lost ram (shift too big) */ -static int __init populate_memnodemap(const struct bootnode *nodes, - int numnodes, int shift, int *nodeids) +static int __init populate_memnodemap(const struct numa_meminfo *mi, int shift) { unsigned long addr, end; int i, res = -1; memset(memnodemap, 0xff, sizeof(s16)*memnodemapsize); - for (i = 0; i < numnodes; i++) { - addr = nodes[i].start; - end = nodes[i].end; + for (i = 0; i < mi->nr_blks; i++) { + addr = mi->blk[i].start; + end = mi->blk[i].end; if (addr >= end) continue; if ((end >> shift) >= memnodemapsize) @@ -63,7 +71,7 @@ static int __init populate_memnodemap(const struct bootnode *nodes, do { if (memnodemap[addr >> shift] != NUMA_NO_NODE) return -1; - memnodemap[addr >> shift] = nodeids[i]; + memnodemap[addr >> shift] = mi->blk[i].nid; addr += (1UL << shift); } while (addr < end); res = 1; @@ -101,16 +109,15 @@ static int __init allocate_cachealigned_memnodemap(void) * The LSB of all start and end addresses in the node map is the value of the * maximum possible shift. 
*/ -static int __init extract_lsb_from_nodes(const struct bootnode *nodes, - int numnodes) +static int __init extract_lsb_from_nodes(const struct numa_meminfo *mi) { int i, nodes_used = 0; unsigned long start, end; unsigned long bitfield = 0, memtop = 0; - for (i = 0; i < numnodes; i++) { - start = nodes[i].start; - end = nodes[i].end; + for (i = 0; i < mi->nr_blks; i++) { + start = mi->blk[i].start; + end = mi->blk[i].end; if (start >= end) continue; bitfield |= start; @@ -126,18 +133,17 @@ static int __init extract_lsb_from_nodes(const struct bootnode *nodes, return i; } -static int __init compute_hash_shift(struct bootnode *nodes, int numnodes, - int *nodeids) +static int __init compute_hash_shift(const struct numa_meminfo *mi) { int shift; - shift = extract_lsb_from_nodes(nodes, numnodes); + shift = extract_lsb_from_nodes(mi); if (allocate_cachealigned_memnodemap()) return -1; printk(KERN_DEBUG "NUMA: Using %d for the hash shift.\n", shift); - if (populate_memnodemap(nodes, numnodes, shift, nodeids) != 1) { + if (populate_memnodemap(mi, shift) != 1) { printk(KERN_INFO "Your memory is not aligned you need to " "rebuild your kernel with a bigger NODEMAPSIZE " "shift=%d\n", shift); @@ -185,21 +191,25 @@ static void * __init early_node_mem(int nodeid, unsigned long start, static __init int conflicting_memblks(unsigned long start, unsigned long end) { + struct numa_meminfo *mi = &numa_meminfo; int i; - for (i = 0; i < num_node_memblks; i++) { - struct bootnode *nd = &node_memblk_range[i]; - if (nd->start == nd->end) + + for (i = 0; i < mi->nr_blks; i++) { + struct numa_memblk *blk = &mi->blk[i]; + + if (blk->start == blk->end) continue; - if (nd->end > start && nd->start < end) - return memblk_nodeid[i]; - if (nd->end == end && nd->start == start) - return memblk_nodeid[i]; + if (blk->end > start && blk->start < end) + return blk->nid; + if (blk->end == end && blk->start == start) + return blk->nid; } return -1; } int __init numa_add_memblk(int nid, u64 start, u64 end) { + struct numa_meminfo *mi = &numa_meminfo; int i; i = conflicting_memblks(start, end); @@ -213,10 +223,10 @@ int __init numa_add_memblk(int nid, u64 start, u64 end) return -EINVAL; } - node_memblk_range[num_node_memblks].start = start; - node_memblk_range[num_node_memblks].end = end; - memblk_nodeid[num_node_memblks] = nid; - num_node_memblks++; + mi->blk[mi->nr_blks].start = start; + mi->blk[mi->nr_blks].end = end; + mi->blk[mi->nr_blks].nid = nid; + mi->nr_blks++; return 0; } @@ -315,66 +325,59 @@ static int __init nodes_cover_memory(const struct bootnode *nodes) static int __init numa_register_memblks(void) { + struct numa_meminfo *mi = &numa_meminfo; int i; /* * Join together blocks on the same node, holes between * which don't overlap with memory on other nodes. 
*/ - for (i = 0; i < num_node_memblks; ++i) { + for (i = 0; i < mi->nr_blks; ++i) { + struct numa_memblk *bi = &mi->blk[i]; int j, k; - for (j = i + 1; j < num_node_memblks; ++j) { + for (j = i + 1; j < mi->nr_blks; ++j) { + struct numa_memblk *bj = &mi->blk[j]; unsigned long start, end; - if (memblk_nodeid[i] != memblk_nodeid[j]) + if (bi->nid != bj->nid) continue; - start = min(node_memblk_range[i].end, - node_memblk_range[j].end); - end = max(node_memblk_range[i].start, - node_memblk_range[j].start); - for (k = 0; k < num_node_memblks; ++k) { - if (memblk_nodeid[i] == memblk_nodeid[k]) + start = min(bi->end, bj->end); + end = max(bi->start, bj->start); + for (k = 0; k < mi->nr_blks; ++k) { + struct numa_memblk *bk = &mi->blk[k]; + + if (bi->nid == bk->nid) continue; - if (start < node_memblk_range[k].end && - end > node_memblk_range[k].start) + if (start < bk->end && end > bk->start) break; } - if (k < num_node_memblks) + if (k < mi->nr_blks) continue; - start = min(node_memblk_range[i].start, - node_memblk_range[j].start); - end = max(node_memblk_range[i].end, - node_memblk_range[j].end); + start = min(bi->start, bj->start); + end = max(bi->end, bj->end); printk(KERN_INFO "NUMA: Node %d [%Lx,%Lx) + [%Lx,%Lx) -> [%lx,%lx)\n", - memblk_nodeid[i], - node_memblk_range[i].start, - node_memblk_range[i].end, - node_memblk_range[j].start, - node_memblk_range[j].end, + bi->nid, bi->start, bi->end, bj->start, bj->end, start, end); - node_memblk_range[i].start = start; - node_memblk_range[i].end = end; - k = --num_node_memblks - j; - memmove(memblk_nodeid + j, memblk_nodeid + j+1, - k * sizeof(*memblk_nodeid)); - memmove(node_memblk_range + j, node_memblk_range + j+1, - k * sizeof(*node_memblk_range)); + bi->start = start; + bi->end = end; + k = --mi->nr_blks - j; + memmove(mi->blk + j, mi->blk + j + 1, + k * sizeof(mi->blk[0])); --j; } } - memnode_shift = compute_hash_shift(node_memblk_range, num_node_memblks, - memblk_nodeid); + memnode_shift = compute_hash_shift(mi); if (memnode_shift < 0) { printk(KERN_ERR "NUMA: No NUMA node hash function found. Contact maintainer\n"); return -EINVAL; } - for (i = 0; i < num_node_memblks; i++) - memblock_x86_register_active_regions(memblk_nodeid[i], - node_memblk_range[i].start >> PAGE_SHIFT, - node_memblk_range[i].end >> PAGE_SHIFT); + for (i = 0; i < mi->nr_blks; i++) + memblock_x86_register_active_regions(mi->blk[i].nid, + mi->blk[i].start >> PAGE_SHIFT, + mi->blk[i].end >> PAGE_SHIFT); /* for out of order entries */ sort_node_map(); @@ -701,7 +704,7 @@ static int __init split_nodes_size_interleave(u64 addr, u64 max_addr, u64 size) static int __init numa_emulation(unsigned long start_pfn, unsigned long last_pfn, int acpi, int amd) { - static int nodeid[NR_NODE_MEMBLKS] __initdata; + static struct numa_meminfo ei __initdata; u64 addr = start_pfn << PAGE_SHIFT; u64 max_addr = last_pfn << PAGE_SHIFT; int num_nodes; @@ -727,10 +730,14 @@ static int __init numa_emulation(unsigned long start_pfn, if (num_nodes < 0) return num_nodes; - for (i = 0; i < ARRAY_SIZE(nodeid); i++) - nodeid[i] = i; + ei.nr_blks = num_nodes; + for (i = 0; i < ei.nr_blks; i++) { + ei.blk[i].start = nodes[i].start; + ei.blk[i].end = nodes[i].end; + ei.blk[i].nid = i; + } - memnode_shift = compute_hash_shift(nodes, num_nodes, nodeid); + memnode_shift = compute_hash_shift(&ei); if (memnode_shift < 0) { memnode_shift = 0; printk(KERN_ERR "No NUMA hash function found. 
NUMA emulation " @@ -797,9 +804,7 @@ void __init initmem_init(void) nodes_clear(mem_nodes_parsed); nodes_clear(node_possible_map); nodes_clear(node_online_map); - num_node_memblks = 0; - memset(node_memblk_range, 0, sizeof(node_memblk_range)); - memset(memblk_nodeid, 0, sizeof(memblk_nodeid)); + memset(&numa_meminfo, 0, sizeof(numa_meminfo)); memset(numa_nodes, 0, sizeof(numa_nodes)); remove_all_active_ranges(); From f9c60251c3d08777db6758cafd959a55a838abd6 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 16 Feb 2011 17:11:09 +0100 Subject: [PATCH 276/706] x86-64, NUMA: Separate out numa_cleanup_meminfo() Separate out numa_cleanup_meminfo() from numa_register_memblks(). node_possible_map initialization is moved to the top of the split numa_register_memblks(). This patch doesn't cause behavior change. Signed-off-by: Tejun Heo Cc: Yinghai Lu Cc: Brian Gerst Cc: Cyrill Gorcunov Cc: Shaohui Zheng Cc: David Rientjes Cc: Ingo Molnar Cc: H. Peter Anvin --- arch/x86/mm/numa_64.c | 83 ++++++++++++++++++++++++------------------- 1 file changed, 46 insertions(+), 37 deletions(-) diff --git a/arch/x86/mm/numa_64.c b/arch/x86/mm/numa_64.c index c3496e2b5a71..f2721de30a43 100644 --- a/arch/x86/mm/numa_64.c +++ b/arch/x86/mm/numa_64.c @@ -292,40 +292,8 @@ setup_node_bootmem(int nodeid, unsigned long start, unsigned long end) node_set_online(nodeid); } -/* - * Sanity check to catch more bad NUMA configurations (they are amazingly - * common). Make sure the nodes cover all memory. - */ -static int __init nodes_cover_memory(const struct bootnode *nodes) +static int __init numa_cleanup_meminfo(struct numa_meminfo *mi) { - unsigned long numaram, e820ram; - int i; - - numaram = 0; - for_each_node_mask(i, mem_nodes_parsed) { - unsigned long s = nodes[i].start >> PAGE_SHIFT; - unsigned long e = nodes[i].end >> PAGE_SHIFT; - numaram += e - s; - numaram -= __absent_pages_in_range(i, s, e); - if ((long)numaram < 0) - numaram = 0; - } - - e820ram = max_pfn - - (memblock_x86_hole_size(0, max_pfn<> PAGE_SHIFT); - /* We seem to lose 3 pages somewhere. Allow 1M of slack. */ - if ((long)(e820ram - numaram) >= (1<<(20 - PAGE_SHIFT))) { - printk(KERN_ERR "NUMA: nodes only cover %luMB of your %luMB e820 RAM. Not used.\n", - (numaram << PAGE_SHIFT) >> 20, - (e820ram << PAGE_SHIFT) >> 20); - return 0; - } - return 1; -} - -static int __init numa_register_memblks(void) -{ - struct numa_meminfo *mi = &numa_meminfo; int i; /* @@ -368,6 +336,49 @@ static int __init numa_register_memblks(void) } } + return 0; +} + +/* + * Sanity check to catch more bad NUMA configurations (they are amazingly + * common). Make sure the nodes cover all memory. + */ +static int __init nodes_cover_memory(const struct bootnode *nodes) +{ + unsigned long numaram, e820ram; + int i; + + numaram = 0; + for_each_node_mask(i, mem_nodes_parsed) { + unsigned long s = nodes[i].start >> PAGE_SHIFT; + unsigned long e = nodes[i].end >> PAGE_SHIFT; + numaram += e - s; + numaram -= __absent_pages_in_range(i, s, e); + if ((long)numaram < 0) + numaram = 0; + } + + e820ram = max_pfn - (memblock_x86_hole_size(0, + max_pfn << PAGE_SHIFT) >> PAGE_SHIFT); + /* We seem to lose 3 pages somewhere. Allow 1M of slack. */ + if ((long)(e820ram - numaram) >= (1 << (20 - PAGE_SHIFT))) { + printk(KERN_ERR "NUMA: nodes only cover %luMB of your %luMB e820 RAM. 
Not used.\n", + (numaram << PAGE_SHIFT) >> 20, + (e820ram << PAGE_SHIFT) >> 20); + return 0; + } + return 1; +} + +static int __init numa_register_memblks(struct numa_meminfo *mi) +{ + int i; + + /* Account for nodes with cpus and no memory */ + nodes_or(node_possible_map, mem_nodes_parsed, cpu_nodes_parsed); + if (WARN_ON(nodes_empty(node_possible_map))) + return -EINVAL; + memnode_shift = compute_hash_shift(mi); if (memnode_shift < 0) { printk(KERN_ERR "NUMA: No NUMA node hash function found. Contact maintainer\n"); @@ -823,12 +834,10 @@ void __init initmem_init(void) nodes_clear(node_possible_map); nodes_clear(node_online_map); #endif - /* Account for nodes with cpus and no memory */ - nodes_or(node_possible_map, mem_nodes_parsed, cpu_nodes_parsed); - if (WARN_ON(nodes_empty(node_possible_map))) + if (numa_cleanup_meminfo(&numa_meminfo) < 0) continue; - if (numa_register_memblks() < 0) + if (numa_register_memblks(&numa_meminfo) < 0) continue; for (j = 0; j < nr_cpu_ids; j++) { From 2e756be44714d0ec2f9827e4f4797c60876167a1 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 16 Feb 2011 17:11:09 +0100 Subject: [PATCH 277/706] x86-64, NUMA: make numa_cleanup_meminfo() prettier * Factor out numa_remove_memblk_from(). * Hole detection doesn't need separate start/end. Calculate start/end once. * Relocate comment. * Define iterators at the top and remove unnecessary prefix increments. This prepares for further improvements to the function. Signed-off-by: Tejun Heo Cc: Yinghai Lu Cc: Brian Gerst Cc: Cyrill Gorcunov Cc: Shaohui Zheng Cc: David Rientjes Cc: Ingo Molnar Cc: H. Peter Anvin --- arch/x86/mm/numa_64.c | 36 +++++++++++++++++++----------------- 1 file changed, 19 insertions(+), 17 deletions(-) diff --git a/arch/x86/mm/numa_64.c b/arch/x86/mm/numa_64.c index f2721de30a43..4fd3368adc8f 100644 --- a/arch/x86/mm/numa_64.c +++ b/arch/x86/mm/numa_64.c @@ -230,6 +230,13 @@ int __init numa_add_memblk(int nid, u64 start, u64 end) return 0; } +static void __init numa_remove_memblk_from(int idx, struct numa_meminfo *mi) +{ + mi->nr_blks--; + memmove(&mi->blk[idx], &mi->blk[idx + 1], + (mi->nr_blks - idx) * sizeof(mi->blk[0])); +} + static __init void cutoff_node(int i, unsigned long start, unsigned long end) { struct bootnode *nd = &numa_nodes[i]; @@ -294,25 +301,25 @@ setup_node_bootmem(int nodeid, unsigned long start, unsigned long end) static int __init numa_cleanup_meminfo(struct numa_meminfo *mi) { - int i; + int i, j, k; - /* - * Join together blocks on the same node, holes between - * which don't overlap with memory on other nodes. - */ - for (i = 0; i < mi->nr_blks; ++i) { + for (i = 0; i < mi->nr_blks; i++) { struct numa_memblk *bi = &mi->blk[i]; - int j, k; - for (j = i + 1; j < mi->nr_blks; ++j) { + for (j = i + 1; j < mi->nr_blks; j++) { struct numa_memblk *bj = &mi->blk[j]; unsigned long start, end; + /* + * Join together blocks on the same node, holes + * between which don't overlap with memory on other + * nodes. 
+			 */
 			if (bi->nid != bj->nid)
 				continue;
-			start = min(bi->start, bj->start);
-			end = max(bi->end, bj->end);
-			for (k = 0; k < mi->nr_blks; k++) {
 				struct numa_memblk *bk = &mi->blk[k];
 
 				if (bi->nid == bk->nid)
@@ -322,17 +329,12 @@ static int __init numa_cleanup_meminfo(struct numa_meminfo *mi)
 			if (k < mi->nr_blks)
 				continue;
-			start = min(bi->start, bj->start);
-			end = max(bi->end, bj->end);
 			printk(KERN_INFO "NUMA: Node %d [%Lx,%Lx) + [%Lx,%Lx) -> [%lx,%lx)\n",
 			       bi->nid, bi->start, bi->end, bj->start, bj->end,
 			       start, end);
 			bi->start = start;
 			bi->end = end;
-			k = --mi->nr_blks - j;
-			memmove(mi->blk + j, mi->blk + j + 1,
-				k * sizeof(mi->blk[0]));
-			--j;
+			numa_remove_memblk_from(j--, mi);
 		}
 	}

From 56e827fbde9a3cb886a2fe138db0d99e98efbfb1 Mon Sep 17 00:00:00 2001
From: Tejun Heo
Date: Wed, 16 Feb 2011 17:11:09 +0100
Subject: [PATCH 278/706] x86-64, NUMA: consolidate and improve memblk sanity
 checks

memblk sanity check was scattered around and incomplete. Consolidate
and improve.

* Conflict detection and cutoff_node() logic are moved to
  numa_cleanup_meminfo().
* numa_cleanup_meminfo() clears the unused memblks before returning.
* Check and warn about invalid input parameters in numa_add_memblk().
* Check the maximum number of memblks isn't exceeded in
  numa_add_memblk().
* numa_cleanup_meminfo() is now called before numa_emulation() so that
  the emulation code also uses the cleaned up version.

Signed-off-by: Tejun Heo
Cc: Yinghai Lu
Cc: Brian Gerst
Cc: Cyrill Gorcunov
Cc: Shaohui Zheng
Cc: David Rientjes
Cc: Ingo Molnar
Cc: H. Peter Anvin
---
 arch/x86/mm/numa_64.c | 103 +++++++++++++++++++++---------------------
 1 file changed, 51 insertions(+), 52 deletions(-)

diff --git a/arch/x86/mm/numa_64.c b/arch/x86/mm/numa_64.c
index 4fd3368adc8f..20aa1d31e165 100644
--- a/arch/x86/mm/numa_64.c
+++ b/arch/x86/mm/numa_64.c
@@ -189,37 +189,23 @@ static void * __init early_node_mem(int nodeid, unsigned long start,
 	return NULL;
 }
 
-static __init int conflicting_memblks(unsigned long start, unsigned long end)
-{
-	struct numa_meminfo *mi = &numa_meminfo;
-	int i;
-
-	for (i = 0; i < mi->nr_blks; i++) {
-		struct numa_memblk *blk = &mi->blk[i];
-
-		if (blk->start == blk->end)
+	/* ignore zero length blks */
+	if (start == end)
+		return 0;
+
+	/* whine about and ignore invalid blks */
+	if (start > end || nid < 0 || nid >= MAX_NUMNODES) {
+		pr_warning("NUMA: Warning: invalid memblk node %d (%Lx-%Lx)\n",
+			   nid, start, end);
+		return 0;
+	}
+
+	if (mi->nr_blks >= NR_NODE_MEMBLKS) {
+		pr_err("NUMA: too many memblk ranges\n");
 		return -EINVAL;
 	}
@@ -237,22 +223,6 @@ static void __init numa_remove_memblk_from(int idx, struct numa_meminfo *mi)
 		(mi->nr_blks - idx) * sizeof(mi->blk[0]));
 }
 
-static __init void cutoff_node(int i, unsigned long start, unsigned long end)
-{
-	struct 
bootnode *nd = &numa_nodes[i]; - - if (nd->start < start) { - nd->start = start; - if (nd->end < nd->start) - nd->start = nd->end; - } - if (nd->end > end) { - nd->end = end; - if (nd->start > nd->end) - nd->start = nd->end; - } -} - /* Initialize bootmem allocator for a node */ void __init setup_node_bootmem(int nodeid, unsigned long start, unsigned long end) @@ -301,15 +271,44 @@ setup_node_bootmem(int nodeid, unsigned long start, unsigned long end) static int __init numa_cleanup_meminfo(struct numa_meminfo *mi) { + const u64 low = 0; + const u64 high = (u64)max_pfn << PAGE_SHIFT; int i, j, k; for (i = 0; i < mi->nr_blks; i++) { struct numa_memblk *bi = &mi->blk[i]; + /* make sure all blocks are inside the limits */ + bi->start = max(bi->start, low); + bi->end = min(bi->end, high); + + /* and there's no empty block */ + if (bi->start == bi->end) { + numa_remove_memblk_from(i--, mi); + continue; + } + for (j = i + 1; j < mi->nr_blks; j++) { struct numa_memblk *bj = &mi->blk[j]; unsigned long start, end; + /* + * See whether there are overlapping blocks. Whine + * about but allow overlaps of the same nid. They + * will be merged below. + */ + if (bi->end > bj->start && bi->start < bj->end) { + if (bi->nid != bj->nid) { + pr_err("NUMA: node %d (%Lx-%Lx) overlaps with node %d (%Lx-%Lx)\n", + bi->nid, bi->start, bi->end, + bj->nid, bj->start, bj->end); + return -EINVAL; + } + pr_warning("NUMA: Warning: node %d (%Lx-%Lx) overlaps with itself (%Lx-%Lx)\n", + bi->nid, bi->start, bi->end, + bj->start, bj->end); + } + /* * Join together blocks on the same node, holes * between which don't overlap with memory on other @@ -317,8 +316,8 @@ static int __init numa_cleanup_meminfo(struct numa_meminfo *mi) */ if (bi->nid != bj->nid) continue; - start = min(bi->start, bj->start); - end = max(bi->end, bj->end); + start = max(min(bi->start, bj->start), low); + end = min(max(bi->end, bj->end), high); for (k = 0; k < mi->nr_blks; k++) { struct numa_memblk *bk = &mi->blk[k]; @@ -338,6 +337,11 @@ static int __init numa_cleanup_meminfo(struct numa_meminfo *mi) } } + for (i = mi->nr_blks; i < ARRAY_SIZE(mi->blk); i++) { + mi->blk[i].start = mi->blk[i].end = 0; + mi->blk[i].nid = NUMA_NO_NODE; + } + return 0; } @@ -824,10 +828,8 @@ void __init initmem_init(void) if (numa_init[i]() < 0) continue; - /* clean up the node list */ - for (j = 0; j < MAX_NUMNODES; j++) - cutoff_node(j, 0, max_pfn << PAGE_SHIFT); - + if (numa_cleanup_meminfo(&numa_meminfo) < 0) + continue; #ifdef CONFIG_NUMA_EMU setup_physnodes(0, max_pfn << PAGE_SHIFT); if (cmdline && !numa_emulation(0, max_pfn, i == 0, i == 1)) @@ -836,9 +838,6 @@ void __init initmem_init(void) nodes_clear(node_possible_map); nodes_clear(node_online_map); #endif - if (numa_cleanup_meminfo(&numa_meminfo) < 0) - continue; - if (numa_register_memblks(&numa_meminfo) < 0) continue; From a844ef46fa3055165c28feede6114a711b8375ad Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 16 Feb 2011 17:11:09 +0100 Subject: [PATCH 279/706] x86-64, NUMA: Add common find_node_by_addr() srat_64.c and amdtopology_64.c had their own versions of find_node_by_addr() which were basically the same. Add common one in numa_64.c and remove the duplicates. Signed-off-by: Tejun Heo Cc: Yinghai Lu Cc: Brian Gerst Cc: Cyrill Gorcunov Cc: Shaohui Zheng Cc: David Rientjes Cc: Ingo Molnar Cc: H. 
Peter Anvin --- arch/x86/include/asm/numa_64.h | 1 + arch/x86/mm/amdtopology_64.c | 13 ------------- arch/x86/mm/numa_64.c | 19 +++++++++++++++++++ arch/x86/mm/srat_64.c | 18 ------------------ 4 files changed, 20 insertions(+), 31 deletions(-) diff --git a/arch/x86/include/asm/numa_64.h b/arch/x86/include/asm/numa_64.h index e925605150a0..925ade9d67e4 100644 --- a/arch/x86/include/asm/numa_64.h +++ b/arch/x86/include/asm/numa_64.h @@ -35,6 +35,7 @@ extern int __init numa_add_memblk(int nodeid, u64 start, u64 end); #define FAKE_NODE_MIN_SIZE ((u64)32 << 20) #define FAKE_NODE_MIN_HASH_MASK (~(FAKE_NODE_MIN_SIZE - 1UL)) void numa_emu_cmdline(char *); +int __init find_node_by_addr(unsigned long addr); #endif /* CONFIG_NUMA_EMU */ #else static inline int numa_cpu_node(int cpu) { return NUMA_NO_NODE; } diff --git a/arch/x86/mm/amdtopology_64.c b/arch/x86/mm/amdtopology_64.c index 90cf297b3b59..8f7a5eb4bd3c 100644 --- a/arch/x86/mm/amdtopology_64.c +++ b/arch/x86/mm/amdtopology_64.c @@ -205,19 +205,6 @@ static s16 fake_apicid_to_node[MAX_LOCAL_APIC] __initdata = { [0 ... MAX_LOCAL_APIC-1] = NUMA_NO_NODE }; -static int __init find_node_by_addr(unsigned long addr) -{ - int ret = NUMA_NO_NODE; - int i; - - for (i = 0; i < 8; i++) - if (addr >= numa_nodes[i].start && addr < numa_nodes[i].end) { - ret = i; - break; - } - return ret; -} - /* * For NUMA emulation, fake proximity domain (_PXM) to node id mappings must be * setup to represent the physical topology but reflect the emulated diff --git a/arch/x86/mm/numa_64.c b/arch/x86/mm/numa_64.c index 20aa1d31e165..681bc0d59db5 100644 --- a/arch/x86/mm/numa_64.c +++ b/arch/x86/mm/numa_64.c @@ -430,6 +430,25 @@ void __init numa_emu_cmdline(char *str) cmdline = str; } +int __init find_node_by_addr(unsigned long addr) +{ + int ret = NUMA_NO_NODE; + int i; + + for_each_node_mask(i, mem_nodes_parsed) { + /* + * Find the real node that this emulated node appears on. For + * the sake of simplicity, we only use a real node's starting + * address to determine which emulated node it appears on. + */ + if (addr >= numa_nodes[i].start && addr < numa_nodes[i].end) { + ret = i; + break; + } + } + return ret; +} + static int __init setup_physnodes(unsigned long start, unsigned long end) { int ret = 0; diff --git a/arch/x86/mm/srat_64.c b/arch/x86/mm/srat_64.c index d56eff811cd8..51d07338d2e4 100644 --- a/arch/x86/mm/srat_64.c +++ b/arch/x86/mm/srat_64.c @@ -277,24 +277,6 @@ static int fake_node_to_pxm_map[MAX_NUMNODES] __initdata = { static s16 fake_apicid_to_node[MAX_LOCAL_APIC] __initdata = { [0 ... MAX_LOCAL_APIC-1] = NUMA_NO_NODE }; -static int __init find_node_by_addr(unsigned long addr) -{ - int ret = NUMA_NO_NODE; - int i; - - for_each_node_mask(i, mem_nodes_parsed) { - /* - * Find the real node that this emulated node appears on. For - * the sake of simplicity, we only use a real node's starting - * address to determine which emulated node it appears on. - */ - if (addr >= numa_nodes[i].start && addr < numa_nodes[i].end) { - ret = i; - break; - } - } - return ret; -} /* * In NUMA emulation, we need to setup proximity domain (_PXM) to node ID From 91556237ec872e1029e3036174bae3b1a8df65eb Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 16 Feb 2011 17:11:09 +0100 Subject: [PATCH 280/706] x86-64, NUMA: Kill numa_nodes[] numa_nodes[] doesn't carry any information which isn't present in numa_meminfo. Each entry is simply min/max range of all the memblks for the node. 
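(Illustration - a minimal sketch of recovering a node's range from numa_meminfo; the helper name is hypothetical, the patch itself open-codes this loop in numa_register_memblks():

	/* Sketch: recover node @nid's [start, end) span from numa_meminfo. */
	static void __init node_range_from_meminfo(const struct numa_meminfo *mi,
						   int nid, u64 *start, u64 *end)
	{
		int i;

		*start = (u64)max_pfn << PAGE_SHIFT;	/* shrink downwards */
		*end = 0;				/* grow upwards */

		for (i = 0; i < mi->nr_blks; i++) {
			if (mi->blk[i].nid != nid)
				continue;
			*start = min(mi->blk[i].start, *start);
			*end = max(mi->blk[i].end, *end);
		}
		/* if *start >= *end here, the node has no memory */
	}

)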
Keeping numa_nodes[] is not only redundant but also inaccurate when memblks for different nodes interleave - for example, find_node_by_addr() can return the wrong nodeid. Kill numa_nodes[] and always use numa_meminfo instead. * nodes_cover_memory() is renamed to numa_meminfo_cover_memory() and now operates on numa_meminfo and returns bool. * setup_node_bootmem() needs min/max range. Compute the range on the fly. setup_node_bootmem() invocation is restructured to use an outer loop instead of hardcoding the double invocations. * find_node_by_addr() now operates on numa_meminfo. * setup_physnodes() builds physnodes[] from memblks. This will go away when emulation code is updated to use struct numa_meminfo. This patch also makes the following misc changes. * Clearing of nodes_add[] is converted to memset(). * numa_add_memblk() in amd_numa_init() is moved down a bit for consistency. Signed-off-by: Tejun Heo Cc: Yinghai Lu Cc: Brian Gerst Cc: Cyrill Gorcunov Cc: Shaohui Zheng Cc: David Rientjes Cc: Ingo Molnar Cc: H. Peter Anvin --- arch/x86/include/asm/numa_64.h | 1 - arch/x86/mm/amdtopology_64.c | 6 +-- arch/x86/mm/numa_64.c | 82 ++++++++++++++++++++-------------- arch/x86/mm/srat_64.c | 22 ++------- 4 files changed, 53 insertions(+), 58 deletions(-) diff --git a/arch/x86/include/asm/numa_64.h b/arch/x86/include/asm/numa_64.h index 925ade9d67e4..20b69a98f37d 100644 --- a/arch/x86/include/asm/numa_64.h +++ b/arch/x86/include/asm/numa_64.h @@ -26,7 +26,6 @@ extern void setup_node_bootmem(int nodeid, unsigned long start, extern nodemask_t cpu_nodes_parsed __initdata; extern nodemask_t mem_nodes_parsed __initdata; -extern struct bootnode numa_nodes[MAX_NUMNODES] __initdata; extern int __cpuinit numa_cpu_node(int cpu); extern int __init numa_add_memblk(int nodeid, u64 start, u64 end); diff --git a/arch/x86/mm/amdtopology_64.c b/arch/x86/mm/amdtopology_64.c index 8f7a5eb4bd3c..0cb59e582007 100644 --- a/arch/x86/mm/amdtopology_64.c +++ b/arch/x86/mm/amdtopology_64.c @@ -165,12 +165,8 @@ int __init amd_numa_init(void) pr_info("Node %d MemBase %016lx Limit %016lx\n", nodeid, base, limit); - numa_nodes[nodeid].start = base; - numa_nodes[nodeid].end = limit; - numa_add_memblk(nodeid, base, limit); - prevbase = base; - + numa_add_memblk(nodeid, base, limit); node_set(nodeid, mem_nodes_parsed); node_set(nodeid, cpu_nodes_parsed); } diff --git a/arch/x86/mm/numa_64.c b/arch/x86/mm/numa_64.c index 681bc0d59db5..c490448d716a 100644 --- a/arch/x86/mm/numa_64.c +++ b/arch/x86/mm/numa_64.c @@ -46,8 +46,6 @@ static unsigned long __initdata nodemap_size; static struct numa_meminfo numa_meminfo __initdata; -struct bootnode numa_nodes[MAX_NUMNODES] __initdata; - /* * Given a shift value, try to populate memnodemap[] * Returns : @@ -349,17 +347,17 @@ static int __init numa_cleanup_meminfo(struct numa_meminfo *mi) * Sanity check to catch more bad NUMA configurations (they are amazingly * common). Make sure the nodes cover all memory. 
*/ -static int __init nodes_cover_memory(const struct bootnode *nodes) +static bool __init numa_meminfo_cover_memory(const struct numa_meminfo *mi) { unsigned long numaram, e820ram; int i; numaram = 0; - for_each_node_mask(i, mem_nodes_parsed) { - unsigned long s = nodes[i].start >> PAGE_SHIFT; - unsigned long e = nodes[i].end >> PAGE_SHIFT; + for (i = 0; i < mi->nr_blks; i++) { + unsigned long s = mi->blk[i].start >> PAGE_SHIFT; + unsigned long e = mi->blk[i].end >> PAGE_SHIFT; numaram += e - s; - numaram -= __absent_pages_in_range(i, s, e); + numaram -= __absent_pages_in_range(mi->blk[i].nid, s, e); if ((long)numaram < 0) numaram = 0; } @@ -371,14 +369,14 @@ static int __init nodes_cover_memory(const struct bootnode *nodes) printk(KERN_ERR "NUMA: nodes only cover %luMB of your %luMB e820 RAM. Not used.\n", (numaram << PAGE_SHIFT) >> 20, (e820ram << PAGE_SHIFT) >> 20); - return 0; + return false; } - return 1; + return true; } static int __init numa_register_memblks(struct numa_meminfo *mi) { - int i; + int i, j, nid; /* Account for nodes with cpus and no memory */ nodes_or(node_possible_map, mem_nodes_parsed, cpu_nodes_parsed); @@ -398,23 +396,34 @@ static int __init numa_register_memblks(struct numa_meminfo *mi) /* for out of order entries */ sort_node_map(); - if (!nodes_cover_memory(numa_nodes)) + if (!numa_meminfo_cover_memory(mi)) return -EINVAL; init_memory_mapping_high(); - /* Finally register nodes. */ - for_each_node_mask(i, node_possible_map) - setup_node_bootmem(i, numa_nodes[i].start, numa_nodes[i].end); - /* - * Try again in case setup_node_bootmem missed one due to missing - * bootmem. + * Finally register nodes. Do it twice in case setup_node_bootmem + * missed one due to missing bootmem. */ - for_each_node_mask(i, node_possible_map) - if (!node_online(i)) - setup_node_bootmem(i, numa_nodes[i].start, - numa_nodes[i].end); + for (i = 0; i < 2; i++) { + for_each_node_mask(nid, node_possible_map) { + u64 start = (u64)max_pfn << PAGE_SHIFT; + u64 end = 0; + + if (node_online(nid)) + continue; + + for (j = 0; j < mi->nr_blks; j++) { + if (nid != mi->blk[j].nid) + continue; + start = min(mi->blk[j].start, start); + end = max(mi->blk[j].end, end); + } + + if (start < end) + setup_node_bootmem(nid, start, end); + } + } return 0; } @@ -432,33 +441,41 @@ void __init numa_emu_cmdline(char *str) int __init find_node_by_addr(unsigned long addr) { - int ret = NUMA_NO_NODE; + const struct numa_meminfo *mi = &numa_meminfo; int i; - for_each_node_mask(i, mem_nodes_parsed) { + for (i = 0; i < mi->nr_blks; i++) { /* * Find the real node that this emulated node appears on. For * the sake of simplicity, we only use a real node's starting * address to determine which emulated node it appears on. 
*/ - if (addr >= numa_nodes[i].start && addr < numa_nodes[i].end) { - ret = i; - break; - } + if (addr >= mi->blk[i].start && addr < mi->blk[i].end) + return mi->blk[i].nid; } - return ret; + return NUMA_NO_NODE; } static int __init setup_physnodes(unsigned long start, unsigned long end) { + const struct numa_meminfo *mi = &numa_meminfo; int ret = 0; int i; memset(physnodes, 0, sizeof(physnodes)); - for_each_node_mask(i, mem_nodes_parsed) { - physnodes[i].start = numa_nodes[i].start; - physnodes[i].end = numa_nodes[i].end; + for (i = 0; i < mi->nr_blks; i++) { + int nid = mi->blk[i].nid; + + if (physnodes[nid].start == physnodes[nid].end) { + physnodes[nid].start = mi->blk[i].start; + physnodes[nid].end = mi->blk[i].end; + } else { + physnodes[nid].start = min(physnodes[nid].start, + mi->blk[i].start); + physnodes[nid].end = max(physnodes[nid].end, + mi->blk[i].end); + } } /* @@ -809,8 +826,6 @@ static int dummy_numa_init(void) node_set(0, cpu_nodes_parsed); node_set(0, mem_nodes_parsed); numa_add_memblk(0, 0, (u64)max_pfn << PAGE_SHIFT); - numa_nodes[0].start = 0; - numa_nodes[0].end = (u64)max_pfn << PAGE_SHIFT; return 0; } @@ -841,7 +856,6 @@ void __init initmem_init(void) nodes_clear(node_possible_map); nodes_clear(node_online_map); memset(&numa_meminfo, 0, sizeof(numa_meminfo)); - memset(numa_nodes, 0, sizeof(numa_nodes)); remove_all_active_ranges(); if (numa_init[i]() < 0) diff --git a/arch/x86/mm/srat_64.c b/arch/x86/mm/srat_64.c index 51d07338d2e4..e8b3b3cb2c2b 100644 --- a/arch/x86/mm/srat_64.c +++ b/arch/x86/mm/srat_64.c @@ -37,13 +37,9 @@ static __init int setup_node(int pxm) static __init void bad_srat(void) { - int i; printk(KERN_ERR "SRAT: SRAT not used.\n"); acpi_numa = -1; - for (i = 0; i < MAX_NUMNODES; i++) { - numa_nodes[i].start = numa_nodes[i].end = 0; - nodes_add[i].start = nodes_add[i].end = 0; - } + memset(nodes_add, 0, sizeof(nodes_add)); } static __init inline int srat_disabled(void) @@ -210,7 +206,6 @@ update_nodes_add(int node, unsigned long start, unsigned long end) void __init acpi_numa_memory_affinity_init(struct acpi_srat_mem_affinity *ma) { - struct bootnode *nd; unsigned long start, end; int node, pxm; @@ -243,18 +238,9 @@ acpi_numa_memory_affinity_init(struct acpi_srat_mem_affinity *ma) printk(KERN_INFO "SRAT: Node %u PXM %u %lx-%lx\n", node, pxm, start, end); - if (!(ma->flags & ACPI_SRAT_MEM_HOT_PLUGGABLE)) { - nd = &numa_nodes[node]; - if (!node_test_and_set(node, mem_nodes_parsed)) { - nd->start = start; - nd->end = end; - } else { - if (start < nd->start) - nd->start = start; - if (nd->end < end) - nd->end = end; - } - } else + if (!(ma->flags & ACPI_SRAT_MEM_HOT_PLUGGABLE)) + node_set(node, mem_nodes_parsed); + else update_nodes_add(node, start, end); } From 92d4a4371eeb89e1e12b9ebbed0956f499b6c2c0 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 16 Feb 2011 17:11:09 +0100 Subject: [PATCH 281/706] x86-64, NUMA: Rename cpu_nodes_parsed to numa_nodes_parsed It's no longer necessary to keep both cpu_nodes_parsed and mem_nodes_parsed. In preparation for merge, rename cpu_nodes_parsed to numa_nodes_parsed. Signed-off-by: Tejun Heo Cc: Yinghai Lu Cc: Brian Gerst Cc: Cyrill Gorcunov Cc: Shaohui Zheng Cc: David Rientjes Cc: Ingo Molnar Cc: H. 
Peter Anvin --- arch/x86/include/asm/numa_64.h | 2 +- arch/x86/mm/amdtopology_64.c | 4 ++-- arch/x86/mm/numa_64.c | 8 ++++---- arch/x86/mm/srat_64.c | 6 +++--- 4 files changed, 10 insertions(+), 10 deletions(-) diff --git a/arch/x86/include/asm/numa_64.h b/arch/x86/include/asm/numa_64.h index 20b69a98f37d..6e944696144a 100644 --- a/arch/x86/include/asm/numa_64.h +++ b/arch/x86/include/asm/numa_64.h @@ -24,7 +24,7 @@ extern void setup_node_bootmem(int nodeid, unsigned long start, */ #define NODE_MIN_SIZE (4*1024*1024) -extern nodemask_t cpu_nodes_parsed __initdata; +extern nodemask_t numa_nodes_parsed __initdata; extern nodemask_t mem_nodes_parsed __initdata; extern int __cpuinit numa_cpu_node(int cpu); diff --git a/arch/x86/mm/amdtopology_64.c b/arch/x86/mm/amdtopology_64.c index 0cb59e582007..e76bffabc09d 100644 --- a/arch/x86/mm/amdtopology_64.c +++ b/arch/x86/mm/amdtopology_64.c @@ -168,7 +168,7 @@ int __init amd_numa_init(void) prevbase = base; numa_add_memblk(nodeid, base, limit); node_set(nodeid, mem_nodes_parsed); - node_set(nodeid, cpu_nodes_parsed); + node_set(nodeid, numa_nodes_parsed); } if (!nodes_weight(mem_nodes_parsed)) @@ -189,7 +189,7 @@ int __init amd_numa_init(void) apicid_base = boot_cpu_physical_apicid; } - for_each_node_mask(i, cpu_nodes_parsed) + for_each_node_mask(i, numa_nodes_parsed) for (j = apicid_base; j < cores + apicid_base; j++) set_apicid_to_node((i << bits) + j, i); diff --git a/arch/x86/mm/numa_64.c b/arch/x86/mm/numa_64.c index c490448d716a..6e4fbd777564 100644 --- a/arch/x86/mm/numa_64.c +++ b/arch/x86/mm/numa_64.c @@ -36,7 +36,7 @@ struct numa_meminfo { struct pglist_data *node_data[MAX_NUMNODES] __read_mostly; EXPORT_SYMBOL(node_data); -nodemask_t cpu_nodes_parsed __initdata; +nodemask_t numa_nodes_parsed __initdata; nodemask_t mem_nodes_parsed __initdata; struct memnode memnode; @@ -379,7 +379,7 @@ static int __init numa_register_memblks(struct numa_meminfo *mi) int i, j, nid; /* Account for nodes with cpus and no memory */ - nodes_or(node_possible_map, mem_nodes_parsed, cpu_nodes_parsed); + nodes_or(node_possible_map, mem_nodes_parsed, numa_nodes_parsed); if (WARN_ON(nodes_empty(node_possible_map))) return -EINVAL; @@ -823,7 +823,7 @@ static int dummy_numa_init(void) printk(KERN_INFO "Faking a node at %016lx-%016lx\n", 0LU, max_pfn << PAGE_SHIFT); - node_set(0, cpu_nodes_parsed); + node_set(0, numa_nodes_parsed); node_set(0, mem_nodes_parsed); numa_add_memblk(0, 0, (u64)max_pfn << PAGE_SHIFT); @@ -851,7 +851,7 @@ void __init initmem_init(void) for (j = 0; j < MAX_LOCAL_APIC; j++) set_apicid_to_node(j, NUMA_NO_NODE); - nodes_clear(cpu_nodes_parsed); + nodes_clear(numa_nodes_parsed); nodes_clear(mem_nodes_parsed); nodes_clear(node_possible_map); nodes_clear(node_online_map); diff --git a/arch/x86/mm/srat_64.c b/arch/x86/mm/srat_64.c index e8b3b3cb2c2b..8185189d34a2 100644 --- a/arch/x86/mm/srat_64.c +++ b/arch/x86/mm/srat_64.c @@ -94,7 +94,7 @@ acpi_numa_x2apic_affinity_init(struct acpi_srat_x2apic_cpu_affinity *pa) return; } set_apicid_to_node(apic_id, node); - node_set(node, cpu_nodes_parsed); + node_set(node, numa_nodes_parsed); acpi_numa = 1; printk(KERN_INFO "SRAT: PXM %u -> APIC 0x%04x -> Node %u\n", pxm, apic_id, node); @@ -134,7 +134,7 @@ acpi_numa_processor_affinity_init(struct acpi_srat_cpu_affinity *pa) } set_apicid_to_node(apic_id, node); - node_set(node, cpu_nodes_parsed); + node_set(node, numa_nodes_parsed); acpi_numa = 1; printk(KERN_INFO "SRAT: PXM %u -> APIC 0x%02x -> Node %u\n", pxm, apic_id, node); @@ -196,7 +196,7 @@ 
update_nodes_add(int node, unsigned long start, unsigned long end) } if (changed) { - node_set(node, cpu_nodes_parsed); + node_set(node, numa_nodes_parsed); printk(KERN_INFO "SRAT: hot plug zone found %Lx - %Lx\n", nd->start, nd->end); } From 4697bdcc945c094d2c8a4876a24faeaf31a283e0 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 16 Feb 2011 17:11:09 +0100 Subject: [PATCH 282/706] x86-64, NUMA: Kill mem_nodes_parsed With all memory configuration information now carried in numa_meminfo, there's no need to keep mem_nodes_parsed separate. Drop it and use numa_nodes_parsed for CPU / memory-less nodes. A new helper, numa_nodemask_from_meminfo(), is added to calculate the memnode mask on the fly; it is currently used to set node_possible_map. This simplifies NUMA init methods a bit and removes a source of possible inconsistencies. Signed-off-by: Tejun Heo Cc: Yinghai Lu Cc: Brian Gerst Cc: Cyrill Gorcunov Cc: Shaohui Zheng Cc: David Rientjes Cc: Ingo Molnar Cc: H. Peter Anvin --- arch/x86/include/asm/numa_64.h | 1 - arch/x86/mm/amdtopology_64.c | 5 ++--- arch/x86/mm/numa_64.c | 20 ++++++++++++++++---- arch/x86/mm/srat_64.c | 7 ++----- 4 files changed, 20 insertions(+), 13 deletions(-) diff --git a/arch/x86/include/asm/numa_64.h b/arch/x86/include/asm/numa_64.h index 6e944696144a..f42710bd3e73 100644 --- a/arch/x86/include/asm/numa_64.h +++ b/arch/x86/include/asm/numa_64.h @@ -25,7 +25,6 @@ extern void setup_node_bootmem(int nodeid, unsigned long start, #define NODE_MIN_SIZE (4*1024*1024) extern nodemask_t numa_nodes_parsed __initdata; -extern nodemask_t mem_nodes_parsed __initdata; extern int __cpuinit numa_cpu_node(int cpu); extern int __init numa_add_memblk(int nodeid, u64 start, u64 end); diff --git a/arch/x86/mm/amdtopology_64.c b/arch/x86/mm/amdtopology_64.c index e76bffabc09d..fd7b609025ba 100644 --- a/arch/x86/mm/amdtopology_64.c +++ b/arch/x86/mm/amdtopology_64.c @@ -122,7 +122,7 @@ int __init amd_numa_init(void) nodeid, (base >> 8) & 3, (limit >> 8) & 3); return -EINVAL; } - if (node_isset(nodeid, mem_nodes_parsed)) { + if (node_isset(nodeid, numa_nodes_parsed)) { pr_info("Node %d already present, skipping\n", nodeid); continue; @@ -167,11 +167,10 @@ int __init amd_numa_init(void) prevbase = base; numa_add_memblk(nodeid, base, limit); - node_set(nodeid, mem_nodes_parsed); node_set(nodeid, numa_nodes_parsed); } - if (!nodes_weight(mem_nodes_parsed)) + if (!nodes_weight(numa_nodes_parsed)) return -ENOENT; /* diff --git a/arch/x86/mm/numa_64.c b/arch/x86/mm/numa_64.c index 6e4fbd777564..8b1f178a866e 100644 --- a/arch/x86/mm/numa_64.c +++ b/arch/x86/mm/numa_64.c @@ -37,7 +37,6 @@ struct pglist_data *node_data[MAX_NUMNODES] __read_mostly; EXPORT_SYMBOL(node_data); nodemask_t numa_nodes_parsed __initdata; -nodemask_t mem_nodes_parsed __initdata; struct memnode memnode; @@ -343,6 +342,20 @@ static int __init numa_cleanup_meminfo(struct numa_meminfo *mi) return 0; } +/* + * Set nodes, which have memory in @mi, in *@nodemask. + */ +static void __init numa_nodemask_from_meminfo(nodemask_t *nodemask, + const struct numa_meminfo *mi) +{ + int i; + + for (i = 0; i < ARRAY_SIZE(mi->blk); i++) + if (mi->blk[i].start != mi->blk[i].end && + mi->blk[i].nid != NUMA_NO_NODE) + node_set(mi->blk[i].nid, *nodemask); +} + /* * Sanity check to catch more bad NUMA configurations (they are amazingly * common). Make sure the nodes cover all memory. 
@@ -379,7 +392,8 @@ static int __init numa_register_memblks(struct numa_meminfo *mi) int i, j, nid; /* Account for nodes with cpus and no memory */ - nodes_or(node_possible_map, mem_nodes_parsed, numa_nodes_parsed); + node_possible_map = numa_nodes_parsed; + numa_nodemask_from_meminfo(&node_possible_map, mi); if (WARN_ON(nodes_empty(node_possible_map))) return -EINVAL; @@ -824,7 +838,6 @@ static int dummy_numa_init(void) 0LU, max_pfn << PAGE_SHIFT); node_set(0, numa_nodes_parsed); - node_set(0, mem_nodes_parsed); numa_add_memblk(0, 0, (u64)max_pfn << PAGE_SHIFT); return 0; @@ -852,7 +865,6 @@ void __init initmem_init(void) set_apicid_to_node(j, NUMA_NO_NODE); nodes_clear(numa_nodes_parsed); - nodes_clear(mem_nodes_parsed); nodes_clear(node_possible_map); nodes_clear(node_online_map); memset(&numa_meminfo, 0, sizeof(numa_meminfo)); diff --git a/arch/x86/mm/srat_64.c b/arch/x86/mm/srat_64.c index 8185189d34a2..4f8e6cde9bf6 100644 --- a/arch/x86/mm/srat_64.c +++ b/arch/x86/mm/srat_64.c @@ -238,9 +238,7 @@ acpi_numa_memory_affinity_init(struct acpi_srat_mem_affinity *ma) printk(KERN_INFO "SRAT: Node %u PXM %u %lx-%lx\n", node, pxm, start, end); - if (!(ma->flags & ACPI_SRAT_MEM_HOT_PLUGGABLE)) - node_set(node, mem_nodes_parsed); - else + if (ma->flags & ACPI_SRAT_MEM_HOT_PLUGGABLE) update_nodes_add(node, start, end); } @@ -310,10 +308,9 @@ void __init acpi_fake_nodes(const struct bootnode *fake_nodes, int num_nodes) __acpi_map_pxm_to_node(fake_node_to_pxm_map[i], i); memcpy(__apicid_to_node, fake_apicid_to_node, sizeof(__apicid_to_node)); - nodes_clear(mem_nodes_parsed); for (i = 0; i < num_nodes; i++) if (fake_nodes[i].start != fake_nodes[i].end) - node_set(i, mem_nodes_parsed); + node_set(i, numa_nodes_parsed); } static int null_slit_node_compare(int a, int b) From ac7136b611ee8f8bd6231ce2e1dbdd31ae3d39bc Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 16 Feb 2011 17:11:09 +0100 Subject: [PATCH 283/706] x86-64, NUMA: Implement generic node distance handling Node distance was previously determined by direct node comparison, ACPI PXM comparison, or ACPI SLIT table lookup. This patch implements generic node distance handling. NUMA init methods can call numa_set_distance() to set the distance between nodes and the common __node_distance() implementation will report the set distance. Due to the way NUMA emulation is implemented, the generic node distance handling is used only when emulation is not used. Later patches will update NUMA emulation to use the generic distance mechanism. Signed-off-by: Tejun Heo Cc: Yinghai Lu Cc: Brian Gerst Cc: Cyrill Gorcunov Cc: Shaohui Zheng Cc: David Rientjes Cc: Ingo Molnar Cc: H. 
Peter Anvin --- arch/x86/include/asm/acpi.h | 1 + arch/x86/include/asm/numa_64.h | 1 + arch/x86/include/asm/topology.h | 2 +- arch/x86/mm/numa_64.c | 95 +++++++++++++++++++++++++++++++++ arch/x86/mm/srat_64.c | 27 ++++------ 5 files changed, 109 insertions(+), 17 deletions(-) diff --git a/arch/x86/include/asm/acpi.h b/arch/x86/include/asm/acpi.h index cfa3d5c3144a..9c9fe1b0bc4e 100644 --- a/arch/x86/include/asm/acpi.h +++ b/arch/x86/include/asm/acpi.h @@ -190,6 +190,7 @@ extern int x86_acpi_numa_init(void); #ifdef CONFIG_NUMA_EMU extern void acpi_fake_nodes(const struct bootnode *fake_nodes, int num_nodes); +extern int acpi_emu_node_distance(int a, int b); #endif #endif /* CONFIG_ACPI_NUMA */ diff --git a/arch/x86/include/asm/numa_64.h b/arch/x86/include/asm/numa_64.h index f42710bd3e73..5361c5947986 100644 --- a/arch/x86/include/asm/numa_64.h +++ b/arch/x86/include/asm/numa_64.h @@ -28,6 +28,7 @@ extern nodemask_t numa_nodes_parsed __initdata; extern int __cpuinit numa_cpu_node(int cpu); extern int __init numa_add_memblk(int nodeid, u64 start, u64 end); +extern void __init numa_set_distance(int from, int to, int distance); #ifdef CONFIG_NUMA_EMU #define FAKE_NODE_MIN_SIZE ((u64)32 << 20) diff --git a/arch/x86/include/asm/topology.h b/arch/x86/include/asm/topology.h index b101c17861f5..910a7084f7f2 100644 --- a/arch/x86/include/asm/topology.h +++ b/arch/x86/include/asm/topology.h @@ -138,7 +138,7 @@ extern unsigned long node_remap_size[]; .balance_interval = 1, \ } -#ifdef CONFIG_X86_64_ACPI_NUMA +#ifdef CONFIG_X86_64 extern int __node_distance(int, int); #define node_distance(a, b) __node_distance(a, b) #endif diff --git a/arch/x86/mm/numa_64.c b/arch/x86/mm/numa_64.c index 8b1f178a866e..a3621f2953d6 100644 --- a/arch/x86/mm/numa_64.c +++ b/arch/x86/mm/numa_64.c @@ -45,6 +45,13 @@ static unsigned long __initdata nodemap_size; static struct numa_meminfo numa_meminfo __initdata; +static int numa_distance_cnt; +static u8 *numa_distance; + +#ifdef CONFIG_NUMA_EMU +static bool numa_emu_dist; +#endif + /* * Given a shift value, try to populate memnodemap[] * Returns : @@ -356,6 +363,92 @@ static void __init numa_nodemask_from_meminfo(nodemask_t *nodemask, node_set(mi->blk[i].nid, *nodemask); } +/* + * Reset distance table. The current table is freed. The next + * numa_set_distance() call will create a new one. + */ +static void __init numa_reset_distance(void) +{ + size_t size; + + size = numa_distance_cnt * sizeof(numa_distance[0]); + memblock_x86_free_range(__pa(numa_distance), + __pa(numa_distance) + size); + numa_distance = NULL; + numa_distance_cnt = 0; +} + +/* + * Set the distance between node @from to @to to @distance. If distance + * table doesn't exist, one which is large enough to accomodate all the + * currently known nodes will be created. 
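+ * + * (The table is a flat cnt * cnt byte array indexed as + * numa_distance[from * cnt + to]; entries default to LOCAL_DISTANCE + * on the diagonal and REMOTE_DISTANCE elsewhere.)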
+ */ +void __init numa_set_distance(int from, int to, int distance) +{ + if (!numa_distance) { + nodemask_t nodes_parsed; + size_t size; + int i, j, cnt = 0; + u64 phys; + + /* size the new table and allocate it */ + nodes_parsed = numa_nodes_parsed; + numa_nodemask_from_meminfo(&nodes_parsed, &numa_meminfo); + + for_each_node_mask(i, nodes_parsed) + cnt = i; + size = ++cnt * sizeof(numa_distance[0]); + + phys = memblock_find_in_range(0, + (u64)max_pfn_mapped << PAGE_SHIFT, + size, PAGE_SIZE); + if (phys == MEMBLOCK_ERROR) { + pr_warning("NUMA: Warning: can't allocate distance table!\n"); + /* don't retry until explicitly reset */ + numa_distance = (void *)1LU; + return; + } + memblock_x86_reserve_range(phys, phys + size, "NUMA DIST"); + + numa_distance = __va(phys); + numa_distance_cnt = cnt; + + /* fill with the default distances */ + for (i = 0; i < cnt; i++) + for (j = 0; j < cnt; j++) + numa_distance[i * cnt + j] = i == j ? + LOCAL_DISTANCE : REMOTE_DISTANCE; + printk(KERN_DEBUG "NUMA: Initialized distance table, cnt=%d\n", cnt); + } + + if (from >= numa_distance_cnt || to >= numa_distance_cnt) { + printk_once(KERN_DEBUG "NUMA: Debug: distance out of bound, from=%d to=%d distance=%d\n", + from, to, distance); + return; + } + + if ((u8)distance != distance || + (from == to && distance != LOCAL_DISTANCE)) { + pr_warn_once("NUMA: Warning: invalid distance parameter, from=%d to=%d distance=%d\n", + from, to, distance); + return; + } + + numa_distance[from * numa_distance_cnt + to] = distance; +} + +int __node_distance(int from, int to) +{ +#if defined(CONFIG_ACPI_NUMA) && defined(CONFIG_NUMA_EMU) + if (numa_emu_dist) + return acpi_emu_node_distance(from, to); +#endif + if (from >= numa_distance_cnt || to >= numa_distance_cnt) + return from == to ? LOCAL_DISTANCE : REMOTE_DISTANCE; + return numa_distance[from * numa_distance_cnt + to]; +} +EXPORT_SYMBOL(__node_distance); + /* * Sanity check to catch more bad NUMA configurations (they are amazingly * common). Make sure the nodes cover all memory. 
@@ -826,6 +919,7 @@ static int __init numa_emulation(unsigned long start_pfn, setup_physnodes(addr, max_addr); fake_physnodes(acpi, amd, num_nodes); numa_init_array(); + numa_emu_dist = true; return 0; } #endif /* CONFIG_NUMA_EMU */ @@ -869,6 +963,7 @@ void __init initmem_init(void) nodes_clear(node_online_map); memset(&numa_meminfo, 0, sizeof(numa_meminfo)); remove_all_active_ranges(); + numa_reset_distance(); if (numa_init[i]() < 0) continue; diff --git a/arch/x86/mm/srat_64.c b/arch/x86/mm/srat_64.c index 4f8e6cde9bf6..d2f53f35d86a 100644 --- a/arch/x86/mm/srat_64.c +++ b/arch/x86/mm/srat_64.c @@ -50,9 +50,16 @@ static __init inline int srat_disabled(void) /* Callback for SLIT parsing */ void __init acpi_numa_slit_init(struct acpi_table_slit *slit) { + int i, j; unsigned length; unsigned long phys; + for (i = 0; i < slit->locality_count; i++) + for (j = 0; j < slit->locality_count; j++) + numa_set_distance(pxm_to_node(i), pxm_to_node(j), + slit->entry[slit->locality_count * i + j]); + + /* acpi_slit is used only by emulation */ length = slit->header.length; phys = memblock_find_in_range(0, max_pfn_mapped<locality_count * node_to_pxm(a); return acpi_slit->entry[index + node_to_pxm(b)]; } - -EXPORT_SYMBOL(__node_distance); +#endif /* CONFIG_NUMA_EMU */ #if defined(CONFIG_MEMORY_HOTPLUG_SPARSE) || defined(CONFIG_ACPI_HOTPLUG_MEMORY) int memory_add_physaddr_to_nid(u64 start) From d9c515eacb3bde73f7a5ecb7e35ea6e660ad421d Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 16 Feb 2011 17:11:10 +0100 Subject: [PATCH 284/706] x86-64, NUMA: Trivial changes to prepare for emulation updates * Separate out numa_add_memblk_to() from numa_add_memblk() so that different numa_meminfo can be used. * Rename cmdline to emu_cmdline. * Drop @start/last_pfn from numa_emulation() and use max_pfn directly. This patch doesn't introduce any behavior change. Signed-off-by: Tejun Heo Cc: Yinghai Lu Cc: Brian Gerst Cc: Cyrill Gorcunov Cc: Shaohui Zheng Cc: David Rientjes Cc: Ingo Molnar Cc: H. 
Peter Anvin --- arch/x86/mm/numa_64.c | 34 ++++++++++++++++++---------------- 1 file changed, 18 insertions(+), 16 deletions(-) diff --git a/arch/x86/mm/numa_64.c b/arch/x86/mm/numa_64.c index a3621f2953d6..20e2cfe5ab82 100644 --- a/arch/x86/mm/numa_64.c +++ b/arch/x86/mm/numa_64.c @@ -193,10 +193,9 @@ static void * __init early_node_mem(int nodeid, unsigned long start, return NULL; } -int __init numa_add_memblk(int nid, u64 start, u64 end) +static int __init numa_add_memblk_to(int nid, u64 start, u64 end, + struct numa_meminfo *mi) { - struct numa_meminfo *mi = &numa_meminfo; - /* ignore zero length blks */ if (start == end) return 0; @@ -227,6 +226,11 @@ static void __init numa_remove_memblk_from(int idx, struct numa_meminfo *mi) (mi->nr_blks - idx) * sizeof(mi->blk[0])); } +int __init numa_add_memblk(int nid, u64 start, u64 end) +{ + return numa_add_memblk_to(nid, start, end, &numa_meminfo); +} + /* Initialize bootmem allocator for a node */ void __init setup_node_bootmem(int nodeid, unsigned long start, unsigned long end) @@ -539,11 +543,11 @@ static int __init numa_register_memblks(struct numa_meminfo *mi) /* Numa emulation */ static struct bootnode nodes[MAX_NUMNODES] __initdata; static struct bootnode physnodes[MAX_NUMNODES] __cpuinitdata; -static char *cmdline __initdata; +static char *emu_cmdline __initdata; void __init numa_emu_cmdline(char *str) { - cmdline = str; + emu_cmdline = str; } int __init find_node_by_addr(unsigned long addr) @@ -861,12 +865,10 @@ static int __init split_nodes_size_interleave(u64 addr, u64 max_addr, u64 size) * Sets up the system RAM area from start_pfn to last_pfn according to the * numa=fake command-line option. */ -static int __init numa_emulation(unsigned long start_pfn, - unsigned long last_pfn, int acpi, int amd) +static int __init numa_emulation(int acpi, int amd) { static struct numa_meminfo ei __initdata; - u64 addr = start_pfn << PAGE_SHIFT; - u64 max_addr = last_pfn << PAGE_SHIFT; + const u64 max_addr = max_pfn << PAGE_SHIFT; int num_nodes; int i; @@ -875,16 +877,16 @@ static int __init numa_emulation(unsigned long start_pfn, * the fixed node size. Otherwise, if it is just a single number N, * split the system RAM into N fake nodes. 
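 * (For example, numa=fake=8 splits the RAM into eight interleaved * nodes, while numa=fake=512M carves it into 512M-sized nodes.)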
*/ - if (strchr(cmdline, 'M') || strchr(cmdline, 'G')) { + if (strchr(emu_cmdline, 'M') || strchr(emu_cmdline, 'G')) { u64 size; - size = memparse(cmdline, &cmdline); - num_nodes = split_nodes_size_interleave(addr, max_addr, size); + size = memparse(emu_cmdline, &emu_cmdline); + num_nodes = split_nodes_size_interleave(0, max_addr, size); } else { unsigned long n; - n = simple_strtoul(cmdline, NULL, 0); - num_nodes = split_nodes_interleave(addr, max_addr, n); + n = simple_strtoul(emu_cmdline, NULL, 0); + num_nodes = split_nodes_interleave(0, max_addr, n); } if (num_nodes < 0) @@ -916,7 +918,7 @@ static int __init numa_emulation(unsigned long start_pfn, init_memory_mapping_high(); for_each_node_mask(i, node_possible_map) setup_node_bootmem(i, nodes[i].start, nodes[i].end); - setup_physnodes(addr, max_addr); + setup_physnodes(0, max_addr); fake_physnodes(acpi, amd, num_nodes); numa_init_array(); numa_emu_dist = true; @@ -972,7 +974,7 @@ void __init initmem_init(void) continue; #ifdef CONFIG_NUMA_EMU setup_physnodes(0, max_pfn << PAGE_SHIFT); - if (cmdline && !numa_emulation(0, max_pfn, i == 0, i == 1)) + if (emu_cmdline && !numa_emulation(i == 0, i == 1)) return; setup_physnodes(0, max_pfn << PAGE_SHIFT); nodes_clear(node_possible_map); From 9d073caeb372940af02a768d2b7e845ac732bda0 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 16 Feb 2011 17:11:10 +0100 Subject: [PATCH 285/706] x86-64, NUMA: Build and use direct emulated nid -> phys nid mapping NUMA emulation copied the physical NUMA configuration into physnodes[] and used it to reverse-map emulated nodes to physical nodes, which is unnecessarily convoluted. Build an emu_nid_to_phys[] array to map emulated nids directly to the matching physical nids and use it in numa_add_cpu(). physnodes[] will be removed with further patches. - v2: Fixed a build failure with CONFIG_DEBUG_PER_CPU_MAPS due to a missing local variable definition. Reported by Ingo. Signed-off-by: Tejun Heo Cc: Yinghai Lu Cc: Brian Gerst Cc: Cyrill Gorcunov Cc: Shaohui Zheng Cc: David Rientjes Cc: Ingo Molnar Cc: H. Peter Anvin --- arch/x86/mm/numa_64.c | 64 +++++++++++++++++++++++-------------------- 1 file changed, 35 insertions(+), 29 deletions(-) diff --git a/arch/x86/mm/numa_64.c b/arch/x86/mm/numa_64.c index 20e2cfe5ab82..e9919c4d1573 100644 --- a/arch/x86/mm/numa_64.c +++ b/arch/x86/mm/numa_64.c @@ -542,7 +542,9 @@ static int __init numa_register_memblks(struct numa_meminfo *mi) #ifdef CONFIG_NUMA_EMU /* Numa emulation */ static struct bootnode nodes[MAX_NUMNODES] __initdata; -static struct bootnode physnodes[MAX_NUMNODES] __cpuinitdata; +static struct bootnode physnodes[MAX_NUMNODES] __initdata; + +static int emu_nid_to_phys[MAX_NUMNODES] __cpuinitdata; static char *emu_cmdline __initdata; void __init numa_emu_cmdline(char *str) @@ -649,7 +651,8 @@ static void __init fake_physnodes(int acpi, int amd, int nr_nodes) * allocation past addr and -1 otherwise. addr is adjusted to be at * the end of the node. 
*/ -static int __init setup_node_range(int nid, u64 *addr, u64 size, u64 max_addr) +static int __init setup_node_range(int nid, int physnid, + u64 *addr, u64 size, u64 max_addr) { int ret = 0; nodes[nid].start = *addr; @@ -660,6 +663,10 @@ static int __init setup_node_range(int nid, u64 *addr, u64 size, u64 max_addr) } nodes[nid].end = *addr; node_set(nid, node_possible_map); + + if (emu_nid_to_phys[nid] == NUMA_NO_NODE) + emu_nid_to_phys[nid] = physnid; + printk(KERN_INFO "Faking node %d at %016Lx-%016Lx (%LuMB)\n", nid, nodes[nid].start, nodes[nid].end, (nodes[nid].end - nodes[nid].start) >> 20); @@ -756,7 +763,7 @@ static int __init split_nodes_interleave(u64 addr, u64 max_addr, int nr_nodes) if (nodes_weight(physnode_mask) + ret >= nr_nodes) end = physnodes[i].end; - if (setup_node_range(ret++, &physnodes[i].start, + if (setup_node_range(ret++, i, &physnodes[i].start, end - physnodes[i].start, physnodes[i].end) < 0) node_clear(i, physnode_mask); @@ -852,7 +859,7 @@ static int __init split_nodes_size_interleave(u64 addr, u64 max_addr, u64 size) * later. If setup_node_range() returns non-zero, there * is no more memory available on this physical node. */ - if (setup_node_range(ret++, &physnodes[i].start, + if (setup_node_range(ret++, i, &physnodes[i].start, end - physnodes[i].start, physnodes[i].end) < 0) node_clear(i, physnode_mask); @@ -872,6 +879,9 @@ static int __init numa_emulation(int acpi, int amd) int num_nodes; int i; + for (i = 0; i < MAX_NUMNODES; i++) + emu_nid_to_phys[i] = NUMA_NO_NODE; + /* * If the numa=fake command-line contains a 'M' or 'G', it represents * the fixed node size. Otherwise, if it is just a single number N, @@ -892,6 +902,11 @@ static int __init numa_emulation(int acpi, int amd) if (num_nodes < 0) return num_nodes; + /* make sure all emulated nodes are mapped to a physical node */ + for (i = 0; i < ARRAY_SIZE(emu_nid_to_phys); i++) + if (emu_nid_to_phys[i] == NUMA_NO_NODE) + emu_nid_to_phys[i] = 0; + ei.nr_blks = num_nodes; for (i = 0; i < ei.nr_blks; i++) { ei.blk[i].start = nodes[i].start; @@ -918,7 +933,6 @@ static int __init numa_emulation(int acpi, int amd) init_memory_mapping_high(); for_each_node_mask(i, node_possible_map) setup_node_bootmem(i, nodes[i].start, nodes[i].end); - setup_physnodes(0, max_addr); fake_physnodes(acpi, amd, num_nodes); numa_init_array(); numa_emu_dist = true; @@ -976,7 +990,11 @@ void __init initmem_init(void) setup_physnodes(0, max_pfn << PAGE_SHIFT); if (emu_cmdline && !numa_emulation(i == 0, i == 1)) return; - setup_physnodes(0, max_pfn << PAGE_SHIFT); + + /* not emulating, build identity mapping for numa_add_cpu() */ + for (j = 0; j < ARRAY_SIZE(emu_nid_to_phys); j++) + emu_nid_to_phys[j] = j; + nodes_clear(node_possible_map); nodes_clear(node_online_map); #endif @@ -1033,7 +1051,6 @@ int __cpuinit numa_cpu_node(int cpu) # ifndef CONFIG_DEBUG_PER_CPU_MAPS void __cpuinit numa_add_cpu(int cpu) { - unsigned long addr; int physnid, nid; nid = numa_cpu_node(cpu); @@ -1041,26 +1058,15 @@ void __cpuinit numa_add_cpu(int cpu) nid = early_cpu_to_node(cpu); BUG_ON(nid == NUMA_NO_NODE || !node_online(nid)); - /* - * Use the starting address of the emulated node to find which physical - * node it is allocated on. - */ - addr = node_start_pfn(nid) << PAGE_SHIFT; - for (physnid = 0; physnid < MAX_NUMNODES; physnid++) - if (addr >= physnodes[physnid].start && - addr < physnodes[physnid].end) - break; + physnid = emu_nid_to_phys[nid]; /* * Map the cpu to each emulated node that is allocated on the physical * node of the cpu's apic id. 
*/ - for_each_online_node(nid) { - addr = node_start_pfn(nid) << PAGE_SHIFT; - if (addr >= physnodes[physnid].start && - addr < physnodes[physnid].end) + for_each_online_node(nid) + if (emu_nid_to_phys[nid] == physnid) cpumask_set_cpu(cpu, node_to_cpumask_map[nid]); - } } void __cpuinit numa_remove_cpu(int cpu) @@ -1073,21 +1079,21 @@ void __cpuinit numa_remove_cpu(int cpu) # else /* !CONFIG_DEBUG_PER_CPU_MAPS */ static void __cpuinit numa_set_cpumask(int cpu, int enable) { - int node = early_cpu_to_node(cpu); struct cpumask *mask; - int i; + int nid, physnid, i; - if (node == NUMA_NO_NODE) { + nid = early_cpu_to_node(cpu); + if (nid == NUMA_NO_NODE) { /* early_cpu_to_node() already emits a warning and trace */ return; } - for_each_online_node(i) { - unsigned long addr; - addr = node_start_pfn(i) << PAGE_SHIFT; - if (addr < physnodes[node].start || - addr >= physnodes[node].end) + physnid = emu_nid_to_phys[nid]; + + for_each_online_node(i) { + if (emu_nid_to_phys[nid] != physnid) continue; + mask = debug_cpumask_set_cpu(cpu, enable); if (!mask) return; From c88aea7a70b0f014f98c695069ba91abc9e9b9a4 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 16 Feb 2011 17:11:10 +0100 Subject: [PATCH 286/706] x86-64, NUMA: Make emulation code build numa_meminfo and share the registration path NUMA emulation code built nodes[] array and had its own registration path to set up the emulated nodes. Update it such that it generates emulated numa_meminfo and returns control to initmem_init() and shares the same registration path with non-emulated cases. Because {acpi|amd}_fake_nodes() expect nodes[] parameter, fake_physnodes() now generates nodes[] from numa_meminfo. This will go away with further updates. Signed-off-by: Tejun Heo Cc: Yinghai Lu Cc: Brian Gerst Cc: Cyrill Gorcunov Cc: Shaohui Zheng Cc: David Rientjes Cc: Ingo Molnar Cc: H. Peter Anvin --- arch/x86/mm/numa_64.c | 173 +++++++++++++++++++++--------------------- 1 file changed, 86 insertions(+), 87 deletions(-) diff --git a/arch/x86/mm/numa_64.c b/arch/x86/mm/numa_64.c index e9919c4d1573..9736204337b8 100644 --- a/arch/x86/mm/numa_64.c +++ b/arch/x86/mm/numa_64.c @@ -541,7 +541,6 @@ static int __init numa_register_memblks(struct numa_meminfo *mi) #ifdef CONFIG_NUMA_EMU /* Numa emulation */ -static struct bootnode nodes[MAX_NUMNODES] __initdata; static struct bootnode physnodes[MAX_NUMNODES] __initdata; static int emu_nid_to_phys[MAX_NUMNODES] __cpuinitdata; @@ -626,9 +625,24 @@ static int __init setup_physnodes(unsigned long start, unsigned long end) return ret; } -static void __init fake_physnodes(int acpi, int amd, int nr_nodes) +static void __init fake_physnodes(int acpi, int amd, + const struct numa_meminfo *ei) { - int i; + static struct bootnode nodes[MAX_NUMNODES] __initdata; + int i, nr_nodes = 0; + + for (i = 0; i < ei->nr_blks; i++) { + int nid = ei->blk[i].nid; + + if (nodes[nid].start == nodes[nid].end) { + nodes[nid].start = ei->blk[i].start; + nodes[nid].end = ei->blk[i].end; + nr_nodes++; + } else { + nodes[nid].start = min(ei->blk[i].start, nodes[nid].start); + nodes[nid].end = max(ei->blk[i].end, nodes[nid].end); + } + } BUG_ON(acpi && amd); #ifdef CONFIG_ACPI_NUMA @@ -645,45 +659,44 @@ static void __init fake_physnodes(int acpi, int amd, int nr_nodes) } /* - * Setups up nid to range from addr to addr + size. If the end - * boundary is greater than max_addr, then max_addr is used instead. - * The return value is 0 if there is additional memory left for - * allocation past addr and -1 otherwise. 
addr is adjusted to be at - * the end of the node. + * Sets up nid to range from @start to @end. The return value is -errno if + * something went wrong, 0 otherwise. */ -static int __init setup_node_range(int nid, int physnid, - u64 *addr, u64 size, u64 max_addr) +static int __init emu_setup_memblk(struct numa_meminfo *ei, + int nid, int physnid, u64 start, u64 end) { - int ret = 0; - nodes[nid].start = *addr; - *addr += size; - if (*addr >= max_addr) { - *addr = max_addr; - ret = -1; + struct numa_memblk *eb = &ei->blk[ei->nr_blks]; + + if (ei->nr_blks >= NR_NODE_MEMBLKS) { + pr_err("NUMA: Too many emulated memblks, failing emulation\n"); + return -EINVAL; } - nodes[nid].end = *addr; - node_set(nid, node_possible_map); + + ei->nr_blks++; + eb->start = start; + eb->end = end; + eb->nid = nid; if (emu_nid_to_phys[nid] == NUMA_NO_NODE) emu_nid_to_phys[nid] = physnid; printk(KERN_INFO "Faking node %d at %016Lx-%016Lx (%LuMB)\n", nid, - nodes[nid].start, nodes[nid].end, - (nodes[nid].end - nodes[nid].start) >> 20); - return ret; + eb->start, eb->end, (eb->end - eb->start) >> 20); + return 0; } /* * Sets up nr_nodes fake nodes interleaved over physical nodes ranging from addr * to max_addr. The return value is the number of nodes allocated. */ -static int __init split_nodes_interleave(u64 addr, u64 max_addr, int nr_nodes) +static int __init split_nodes_interleave(struct numa_meminfo *ei, + u64 addr, u64 max_addr, int nr_nodes) { nodemask_t physnode_mask = NODE_MASK_NONE; u64 size; int big; - int ret = 0; - int i; + int nid = 0; + int i, ret; if (nr_nodes <= 0) return -1; @@ -721,7 +734,7 @@ static int __init split_nodes_interleave(u64 addr, u64 max_addr, int nr_nodes) u64 end = physnodes[i].start + size; u64 dma32_end = PFN_PHYS(MAX_DMA32_PFN); - if (ret < big) + if (nid < big) end += FAKE_NODE_MIN_SIZE; /* @@ -760,16 +773,21 @@ static int __init split_nodes_interleave(u64 addr, u64 max_addr, int nr_nodes) * happen as a result of rounding down each node's size * to FAKE_NODE_MIN_SIZE. */ - if (nodes_weight(physnode_mask) + ret >= nr_nodes) + if (nodes_weight(physnode_mask) + nid >= nr_nodes) end = physnodes[i].end; - if (setup_node_range(ret++, i, &physnodes[i].start, - end - physnodes[i].start, - physnodes[i].end) < 0) + ret = emu_setup_memblk(ei, nid++, i, + physnodes[i].start, + min(end, physnodes[i].end)); + if (ret < 0) + return ret; + + physnodes[i].start = min(end, physnodes[i].end); + if (physnodes[i].start == physnodes[i].end) node_clear(i, physnode_mask); } } - return ret; + return 0; } /* @@ -794,12 +812,13 @@ static u64 __init find_end_of_node(u64 start, u64 max_addr, u64 size) * Sets up fake nodes of `size' interleaved over physical nodes ranging from * `addr' to `max_addr'. The return value is the number of nodes allocated. */ -static int __init split_nodes_size_interleave(u64 addr, u64 max_addr, u64 size) +static int __init split_nodes_size_interleave(struct numa_meminfo *ei, + u64 addr, u64 max_addr, u64 size) { nodemask_t physnode_mask = NODE_MASK_NONE; u64 min_size; - int ret = 0; - int i; + int nid = 0; + int i, ret; if (!size) return -1; @@ -854,30 +873,31 @@ static int __init split_nodes_size_interleave(u64 addr, u64 max_addr, u64 size) memblock_x86_hole_size(end, physnodes[i].end) < size) end = physnodes[i].end; - /* - * Setup the fake node that will be allocated as bootmem - * later. If setup_node_range() returns non-zero, there - * is no more memory available on this physical node. 
- */ - if (setup_node_range(ret++, i, &physnodes[i].start, - end - physnodes[i].start, - physnodes[i].end) < 0) + ret = emu_setup_memblk(ei, nid++, i, + physnodes[i].start, + min(end, physnodes[i].end)); + if (ret < 0) + return ret; + + physnodes[i].start = min(end, physnodes[i].end); + if (physnodes[i].start == physnodes[i].end) node_clear(i, physnode_mask); } } - return ret; + return 0; } /* * Sets up the system RAM area from start_pfn to last_pfn according to the * numa=fake command-line option. */ -static int __init numa_emulation(int acpi, int amd) +static bool __init numa_emulation(int acpi, int amd) { static struct numa_meminfo ei __initdata; const u64 max_addr = max_pfn << PAGE_SHIFT; - int num_nodes; - int i; + int i, ret; + + memset(&ei, 0, sizeof(ei)); for (i = 0; i < MAX_NUMNODES; i++) emu_nid_to_phys[i] = NUMA_NO_NODE; @@ -891,52 +911,33 @@ static int __init numa_emulation(int acpi, int amd) u64 size; size = memparse(emu_cmdline, &emu_cmdline); - num_nodes = split_nodes_size_interleave(0, max_addr, size); + ret = split_nodes_size_interleave(&ei, 0, max_addr, size); } else { unsigned long n; n = simple_strtoul(emu_cmdline, NULL, 0); - num_nodes = split_nodes_interleave(0, max_addr, n); + ret = split_nodes_interleave(&ei, 0, max_addr, n); } - if (num_nodes < 0) - return num_nodes; + if (ret < 0) + return false; + + if (numa_cleanup_meminfo(&ei) < 0) { + pr_warning("NUMA: Warning: constructed meminfo invalid, disabling emulation\n"); + return false; + } + + /* commit */ + numa_meminfo = ei; /* make sure all emulated nodes are mapped to a physical node */ for (i = 0; i < ARRAY_SIZE(emu_nid_to_phys); i++) if (emu_nid_to_phys[i] == NUMA_NO_NODE) emu_nid_to_phys[i] = 0; - ei.nr_blks = num_nodes; - for (i = 0; i < ei.nr_blks; i++) { - ei.blk[i].start = nodes[i].start; - ei.blk[i].end = nodes[i].end; - ei.blk[i].nid = i; - } - - memnode_shift = compute_hash_shift(&ei); - if (memnode_shift < 0) { - memnode_shift = 0; - printk(KERN_ERR "No NUMA hash function found. NUMA emulation " - "disabled.\n"); - return -1; - } - - /* - * We need to vacate all active ranges that may have been registered for - * the e820 memory map. - */ - remove_all_active_ranges(); - for_each_node_mask(i, node_possible_map) - memblock_x86_register_active_regions(i, nodes[i].start >> PAGE_SHIFT, - nodes[i].end >> PAGE_SHIFT); - init_memory_mapping_high(); - for_each_node_mask(i, node_possible_map) - setup_node_bootmem(i, nodes[i].start, nodes[i].end); - fake_physnodes(acpi, amd, num_nodes); - numa_init_array(); + fake_physnodes(acpi, amd, &ei); numa_emu_dist = true; - return 0; + return true; } #endif /* CONFIG_NUMA_EMU */ @@ -988,15 +989,13 @@ void __init initmem_init(void) continue; #ifdef CONFIG_NUMA_EMU setup_physnodes(0, max_pfn << PAGE_SHIFT); - if (emu_cmdline && !numa_emulation(i == 0, i == 1)) - return; - - /* not emulating, build identity mapping for numa_add_cpu() */ - for (j = 0; j < ARRAY_SIZE(emu_nid_to_phys); j++) - emu_nid_to_phys[j] = j; - - nodes_clear(node_possible_map); - nodes_clear(node_online_map); + /* + * If requested, try emulation. 
If emulation is not used, + * build identity emu_nid_to_phys[] for numa_add_cpu() + */ + if (!emu_cmdline || !numa_emulation(i == 0, i == 1)) + for (j = 0; j < ARRAY_SIZE(emu_nid_to_phys); j++) + emu_nid_to_phys[j] = j; #endif if (numa_register_memblks(&numa_meminfo) < 0) continue; From 775ee85d7bff8ce7c7eccde90eda400658b650a3 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 16 Feb 2011 17:11:10 +0100 Subject: [PATCH 287/706] x86-64, NUMA: Wrap node ID during emulation Both emulation layout functions - split_nodes[_size]_interleave() - didn't wrap the emulated nid while laying out the fake nodes and tried to avoid allocating more than the specified number of nodes, which was fragile. Now that the emulation code generates numa_meminfo, the node memblks don't need to be consecutive and emulated node IDs can simply wrap. This makes the code more robust and is necessary for updates to better handle the cases where the physical nodes are interleaved. Signed-off-by: Tejun Heo Cc: Yinghai Lu Cc: Brian Gerst Cc: Cyrill Gorcunov Cc: Shaohui Zheng Cc: David Rientjes Cc: Ingo Molnar Cc: H. Peter Anvin --- arch/x86/mm/numa_64.c | 12 ++---------- 1 file changed, 2 insertions(+), 10 deletions(-) diff --git a/arch/x86/mm/numa_64.c b/arch/x86/mm/numa_64.c index 9736204337b8..dc9516587cf5 100644 --- a/arch/x86/mm/numa_64.c +++ b/arch/x86/mm/numa_64.c @@ -768,15 +768,7 @@ static int __init split_nodes_interleave(struct numa_meminfo *ei, memblock_x86_hole_size(end, physnodes[i].end) < size) end = physnodes[i].end; - /* - * Avoid allocating more nodes than requested, which can - * happen as a result of rounding down each node's size - * to FAKE_NODE_MIN_SIZE. - */ - if (nodes_weight(physnode_mask) + nid >= nr_nodes) - end = physnodes[i].end; - - ret = emu_setup_memblk(ei, nid++, i, + ret = emu_setup_memblk(ei, nid++ % nr_nodes, i, physnodes[i].start, min(end, physnodes[i].end)); if (ret < 0) @@ -873,7 +865,7 @@ static int __init split_nodes_size_interleave(struct numa_meminfo *ei, memblock_x86_hole_size(end, physnodes[i].end) < size) end = physnodes[i].end; - ret = emu_setup_memblk(ei, nid++, i, + ret = emu_setup_memblk(ei, nid++ % MAX_NUMNODES, i, physnodes[i].start, min(end, physnodes[i].end)); if (ret < 0) From 1cca53407336fb6a86092e36dbc5c1e4d45d912b Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 16 Feb 2011 17:11:10 +0100 Subject: [PATCH 288/706] x86-64, NUMA: Emulate directly from numa_meminfo NUMA emulation built the physnodes[] array, which could only represent configurations present in the physical meminfo, and laid out emulated nodes from that information. There's no reason to keep this extra level of indirection. Update emulation functions so that they operate directly on numa_meminfo. This simplifies the code and makes emulation layout behave better with interleaved physical nodes. Signed-off-by: Tejun Heo Cc: Yinghai Lu Cc: Brian Gerst Cc: Cyrill Gorcunov Cc: Shaohui Zheng Cc: David Rientjes Cc: Ingo Molnar Cc: H. 
Peter Anvin --- arch/x86/mm/numa_64.c | 171 ++++++++++++++++++------------------------ 1 file changed, 71 insertions(+), 100 deletions(-) diff --git a/arch/x86/mm/numa_64.c b/arch/x86/mm/numa_64.c index dc9516587cf5..bd086ebc0ffc 100644 --- a/arch/x86/mm/numa_64.c +++ b/arch/x86/mm/numa_64.c @@ -541,8 +541,6 @@ static int __init numa_register_memblks(struct numa_meminfo *mi) #ifdef CONFIG_NUMA_EMU /* Numa emulation */ -static struct bootnode physnodes[MAX_NUMNODES] __initdata; - static int emu_nid_to_phys[MAX_NUMNODES] __cpuinitdata; static char *emu_cmdline __initdata; @@ -551,6 +549,16 @@ void __init numa_emu_cmdline(char *str) emu_cmdline = str; } +static int __init emu_find_memblk_by_nid(int nid, const struct numa_meminfo *mi) +{ + int i; + + for (i = 0; i < mi->nr_blks; i++) + if (mi->blk[i].nid == nid) + return i; + return -ENOENT; +} + int __init find_node_by_addr(unsigned long addr) { const struct numa_meminfo *mi = &numa_meminfo; @@ -568,63 +576,6 @@ int __init find_node_by_addr(unsigned long addr) return NUMA_NO_NODE; } -static int __init setup_physnodes(unsigned long start, unsigned long end) -{ - const struct numa_meminfo *mi = &numa_meminfo; - int ret = 0; - int i; - - memset(physnodes, 0, sizeof(physnodes)); - - for (i = 0; i < mi->nr_blks; i++) { - int nid = mi->blk[i].nid; - - if (physnodes[nid].start == physnodes[nid].end) { - physnodes[nid].start = mi->blk[i].start; - physnodes[nid].end = mi->blk[i].end; - } else { - physnodes[nid].start = min(physnodes[nid].start, - mi->blk[i].start); - physnodes[nid].end = max(physnodes[nid].end, - mi->blk[i].end); - } - } - - /* - * Basic sanity checking on the physical node map: there may be errors - * if the SRAT or AMD code incorrectly reported the topology or the mem= - * kernel parameter is used. - */ - for (i = 0; i < MAX_NUMNODES; i++) { - if (physnodes[i].start == physnodes[i].end) - continue; - if (physnodes[i].start > end) { - physnodes[i].end = physnodes[i].start; - continue; - } - if (physnodes[i].end < start) { - physnodes[i].start = physnodes[i].end; - continue; - } - if (physnodes[i].start < start) - physnodes[i].start = start; - if (physnodes[i].end > end) - physnodes[i].end = end; - ret++; - } - - /* - * If no physical topology was detected, a single node is faked to cover - * the entire address space. - */ - if (!ret) { - physnodes[ret].start = start; - physnodes[ret].end = end; - ret = 1; - } - return ret; -} - static void __init fake_physnodes(int acpi, int amd, const struct numa_meminfo *ei) { @@ -663,9 +614,11 @@ static void __init fake_physnodes(int acpi, int amd, * something went wrong, 0 otherwise. 
*/ static int __init emu_setup_memblk(struct numa_meminfo *ei, - int nid, int physnid, u64 start, u64 end) + struct numa_meminfo *pi, + int nid, int phys_blk, u64 size) { struct numa_memblk *eb = &ei->blk[ei->nr_blks]; + struct numa_memblk *pb = &pi->blk[phys_blk]; if (ei->nr_blks >= NR_NODE_MEMBLKS) { pr_err("NUMA: Too many emulated memblks, failing emulation\n"); @@ -673,12 +626,18 @@ static int __init emu_setup_memblk(struct numa_meminfo *ei, } ei->nr_blks++; - eb->start = start; - eb->end = end; + eb->start = pb->start; + eb->end = pb->start + size; eb->nid = nid; if (emu_nid_to_phys[nid] == NUMA_NO_NODE) - emu_nid_to_phys[nid] = physnid; + emu_nid_to_phys[nid] = pb->nid; + + pb->start += size; + if (pb->start >= pb->end) { + WARN_ON_ONCE(pb->start > pb->end); + numa_remove_memblk_from(phys_blk, pi); + } printk(KERN_INFO "Faking node %d at %016Lx-%016Lx (%LuMB)\n", nid, eb->start, eb->end, (eb->end - eb->start) >> 20); @@ -690,6 +649,7 @@ static int __init emu_setup_memblk(struct numa_meminfo *ei, * to max_addr. The return value is the number of nodes allocated. */ static int __init split_nodes_interleave(struct numa_meminfo *ei, + struct numa_meminfo *pi, u64 addr, u64 max_addr, int nr_nodes) { nodemask_t physnode_mask = NODE_MASK_NONE; @@ -721,9 +681,8 @@ static int __init split_nodes_interleave(struct numa_meminfo *ei, return -1; } - for (i = 0; i < MAX_NUMNODES; i++) - if (physnodes[i].start != physnodes[i].end) - node_set(i, physnode_mask); + for (i = 0; i < pi->nr_blks; i++) + node_set(pi->blk[i].nid, physnode_mask); /* * Continue to fill physical nodes with fake nodes until there is no @@ -731,8 +690,18 @@ static int __init split_nodes_interleave(struct numa_meminfo *ei, */ while (nodes_weight(physnode_mask)) { for_each_node_mask(i, physnode_mask) { - u64 end = physnodes[i].start + size; u64 dma32_end = PFN_PHYS(MAX_DMA32_PFN); + u64 start, limit, end; + int phys_blk; + + phys_blk = emu_find_memblk_by_nid(i, pi); + if (phys_blk < 0) { + node_clear(i, physnode_mask); + continue; + } + start = pi->blk[phys_blk].start; + limit = pi->blk[phys_blk].end; + end = start + size; if (nid < big) end += FAKE_NODE_MIN_SIZE; @@ -741,11 +710,11 @@ static int __init split_nodes_interleave(struct numa_meminfo *ei, * Continue to add memory to this fake node if its * non-reserved memory is less than the per-node size. */ - while (end - physnodes[i].start - - memblock_x86_hole_size(physnodes[i].start, end) < size) { + while (end - start - + memblock_x86_hole_size(start, end) < size) { end += FAKE_NODE_MIN_SIZE; - if (end > physnodes[i].end) { - end = physnodes[i].end; + if (end > limit) { + end = limit; break; } } @@ -764,19 +733,15 @@ static int __init split_nodes_interleave(struct numa_meminfo *ei, * next node, this one must extend to the end of the * physical node. */ - if (physnodes[i].end - end - - memblock_x86_hole_size(end, physnodes[i].end) < size) - end = physnodes[i].end; + if (limit - end - + memblock_x86_hole_size(end, limit) < size) + end = limit; - ret = emu_setup_memblk(ei, nid++ % nr_nodes, i, - physnodes[i].start, - min(end, physnodes[i].end)); + ret = emu_setup_memblk(ei, pi, nid++ % nr_nodes, + phys_blk, + min(end, limit) - start); if (ret < 0) return ret; - - physnodes[i].start = min(end, physnodes[i].end); - if (physnodes[i].start == physnodes[i].end) - node_clear(i, physnode_mask); } } return 0; @@ -805,6 +770,7 @@ static u64 __init find_end_of_node(u64 start, u64 max_addr, u64 size) * `addr' to `max_addr'. The return value is the number of nodes allocated. 
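 * (The requested size is first rounded down to a multiple of * FAKE_NODE_MIN_SIZE via FAKE_NODE_MIN_HASH_MASK.)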
*/ static int __init split_nodes_size_interleave(struct numa_meminfo *ei, + struct numa_meminfo *pi, u64 addr, u64 max_addr, u64 size) { nodemask_t physnode_mask = NODE_MASK_NONE; @@ -833,9 +799,9 @@ static int __init split_nodes_size_interleave(struct numa_meminfo *ei, } size &= FAKE_NODE_MIN_HASH_MASK; - for (i = 0; i < MAX_NUMNODES; i++) - if (physnodes[i].start != physnodes[i].end) - node_set(i, physnode_mask); + for (i = 0; i < pi->nr_blks; i++) + node_set(pi->blk[i].nid, physnode_mask); + /* * Fill physical nodes with fake nodes of size until there is no memory * left on any of them. @@ -843,10 +809,18 @@ static int __init split_nodes_size_interleave(struct numa_meminfo *ei, while (nodes_weight(physnode_mask)) { for_each_node_mask(i, physnode_mask) { u64 dma32_end = MAX_DMA32_PFN << PAGE_SHIFT; - u64 end; + u64 start, limit, end; + int phys_blk; - end = find_end_of_node(physnodes[i].start, - physnodes[i].end, size); + phys_blk = emu_find_memblk_by_nid(i, pi); + if (phys_blk < 0) { + node_clear(i, physnode_mask); + continue; + } + start = pi->blk[phys_blk].start; + limit = pi->blk[phys_blk].end; + + end = find_end_of_node(start, limit, size); /* * If there won't be at least FAKE_NODE_MIN_SIZE of * non-reserved memory in ZONE_DMA32 for the next node, @@ -861,19 +835,15 @@ static int __init split_nodes_size_interleave(struct numa_meminfo *ei, * next node, this one must extend to the end of the * physical node. */ - if (physnodes[i].end - end - - memblock_x86_hole_size(end, physnodes[i].end) < size) - end = physnodes[i].end; + if (limit - end - + memblock_x86_hole_size(end, limit) < size) + end = limit; - ret = emu_setup_memblk(ei, nid++ % MAX_NUMNODES, i, - physnodes[i].start, - min(end, physnodes[i].end)); + ret = emu_setup_memblk(ei, pi, nid++ % MAX_NUMNODES, + phys_blk, + min(end, limit) - start); if (ret < 0) return ret; - - physnodes[i].start = min(end, physnodes[i].end); - if (physnodes[i].start == physnodes[i].end) - node_clear(i, physnode_mask); } } return 0; @@ -886,10 +856,12 @@ static int __init split_nodes_size_interleave(struct numa_meminfo *ei, static bool __init numa_emulation(int acpi, int amd) { static struct numa_meminfo ei __initdata; + static struct numa_meminfo pi __initdata; const u64 max_addr = max_pfn << PAGE_SHIFT; int i, ret; memset(&ei, 0, sizeof(ei)); + pi = numa_meminfo; for (i = 0; i < MAX_NUMNODES; i++) emu_nid_to_phys[i] = NUMA_NO_NODE; @@ -903,12 +875,12 @@ static bool __init numa_emulation(int acpi, int amd) u64 size; size = memparse(emu_cmdline, &emu_cmdline); - ret = split_nodes_size_interleave(&ei, 0, max_addr, size); + ret = split_nodes_size_interleave(&ei, &pi, 0, max_addr, size); } else { unsigned long n; n = simple_strtoul(emu_cmdline, NULL, 0); - ret = split_nodes_interleave(&ei, 0, max_addr, n); + ret = split_nodes_interleave(&ei, &pi, 0, max_addr, n); } if (ret < 0) @@ -980,7 +952,6 @@ void __init initmem_init(void) if (numa_cleanup_meminfo(&numa_meminfo) < 0) continue; #ifdef CONFIG_NUMA_EMU - setup_physnodes(0, max_pfn << PAGE_SHIFT); /* * If requested, try emulation. If emulation is not used, * build identity emu_nid_to_phys[] for numa_add_cpu() From 6b78cb549b4105cbf7c6f7461f27a21f00c44997 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 16 Feb 2011 17:11:10 +0100 Subject: [PATCH 289/706] x86-64, NUMA: Unify emulated apicid -> node mapping transformation NUMA emulation changes node mappings and thus apicid -> node mapping needs to be updated accordingly. 
srat_64 and amdtopology_64 did this separately; however, all the necessary information is the mapping from emulated nodes to physical nodes which is available in emu_nid_to_phys[]. Implement common __apicid_to_node[] transformation in numa_emulation() and drop duplicate implementations. Signed-off-by: Tejun Heo Cc: Yinghai Lu Cc: Brian Gerst Cc: Cyrill Gorcunov Cc: Shaohui Zheng Cc: David Rientjes Cc: Ingo Molnar Cc: H. Peter Anvin --- arch/x86/mm/amdtopology_64.c | 9 --------- arch/x86/mm/numa_64.c | 16 +++++++++++++++- arch/x86/mm/srat_64.c | 24 +----------------------- 3 files changed, 16 insertions(+), 33 deletions(-) diff --git a/arch/x86/mm/amdtopology_64.c b/arch/x86/mm/amdtopology_64.c index fd7b609025ba..f37ea2fe85e6 100644 --- a/arch/x86/mm/amdtopology_64.c +++ b/arch/x86/mm/amdtopology_64.c @@ -196,10 +196,6 @@ int __init amd_numa_init(void) } #ifdef CONFIG_NUMA_EMU -static s16 fake_apicid_to_node[MAX_LOCAL_APIC] __initdata = { - [0 ... MAX_LOCAL_APIC-1] = NUMA_NO_NODE -}; - /* * For NUMA emulation, fake proximity domain (_PXM) to node id mappings must be * setup to represent the physical topology but reflect the emulated @@ -224,20 +220,15 @@ void __init amd_fake_nodes(const struct bootnode *nodes, int nr_nodes) for (i = 0; i < nr_nodes; i++) { int index; int nid; - int j; nid = find_node_by_addr(nodes[i].start); if (nid == NUMA_NO_NODE) continue; index = nodeids[nid] << bits; - if (fake_apicid_to_node[index + apicid_base] == NUMA_NO_NODE) - for (j = apicid_base; j < cores + apicid_base; j++) - fake_apicid_to_node[index + j] = i; #ifdef CONFIG_ACPI_NUMA __acpi_map_pxm_to_node(nid, i); #endif } - memcpy(__apicid_to_node, fake_apicid_to_node, sizeof(__apicid_to_node)); } #endif /* CONFIG_NUMA_EMU */ diff --git a/arch/x86/mm/numa_64.c b/arch/x86/mm/numa_64.c index bd086ebc0ffc..722039e0948f 100644 --- a/arch/x86/mm/numa_64.c +++ b/arch/x86/mm/numa_64.c @@ -858,7 +858,7 @@ static bool __init numa_emulation(int acpi, int amd) static struct numa_meminfo ei __initdata; static struct numa_meminfo pi __initdata; const u64 max_addr = max_pfn << PAGE_SHIFT; - int i, ret; + int i, j, ret; memset(&ei, 0, sizeof(ei)); pi = numa_meminfo; @@ -894,6 +894,20 @@ static bool __init numa_emulation(int acpi, int amd) /* commit */ numa_meminfo = ei; + /* + * Transform __apicid_to_node table to use emulated nids by + * reverse-mapping phys_nid. The maps should always exist but fall + * back to zero just in case. + */ + for (i = 0; i < ARRAY_SIZE(__apicid_to_node); i++) { + if (__apicid_to_node[i] == NUMA_NO_NODE) + continue; + for (j = 0; j < ARRAY_SIZE(emu_nid_to_phys); j++) + if (__apicid_to_node[i] == emu_nid_to_phys[j]) + break; + __apicid_to_node[i] = j < ARRAY_SIZE(emu_nid_to_phys) ? j : 0; + } + /* make sure all emulated nodes are mapped to a physical node */ for (i = 0; i < ARRAY_SIZE(emu_nid_to_phys); i++) if (emu_nid_to_phys[i] == NUMA_NO_NODE) diff --git a/arch/x86/mm/srat_64.c b/arch/x86/mm/srat_64.c index d2f53f35d86a..d4fbfea53543 100644 --- a/arch/x86/mm/srat_64.c +++ b/arch/x86/mm/srat_64.c @@ -265,9 +265,6 @@ int __init x86_acpi_numa_init(void) static int fake_node_to_pxm_map[MAX_NUMNODES] __initdata = { [0 ... MAX_NUMNODES-1] = PXM_INVAL }; -static s16 fake_apicid_to_node[MAX_LOCAL_APIC] __initdata = { - [0 ... 
MAX_LOCAL_APIC-1] = NUMA_NO_NODE -}; /* * In NUMA emulation, we need to setup proximity domain (_PXM) to node ID @@ -279,7 +276,7 @@ static s16 fake_apicid_to_node[MAX_LOCAL_APIC] __initdata = { */ void __init acpi_fake_nodes(const struct bootnode *fake_nodes, int num_nodes) { - int i, j; + int i; for (i = 0; i < num_nodes; i++) { int nid, pxm; @@ -291,29 +288,10 @@ void __init acpi_fake_nodes(const struct bootnode *fake_nodes, int num_nodes) if (pxm == PXM_INVAL) continue; fake_node_to_pxm_map[i] = pxm; - /* - * For each apicid_to_node mapping that exists for this real - * node, it must now point to the fake node ID. - */ - for (j = 0; j < MAX_LOCAL_APIC; j++) - if (__apicid_to_node[j] == nid && - fake_apicid_to_node[j] == NUMA_NO_NODE) - fake_apicid_to_node[j] = i; } - /* - * If there are apicid-to-node mappings for physical nodes that do not - * have a corresponding emulated node, it should default to a guaranteed - * value. - */ - for (i = 0; i < MAX_LOCAL_APIC; i++) - if (__apicid_to_node[i] != NUMA_NO_NODE && - fake_apicid_to_node[i] == NUMA_NO_NODE) - fake_apicid_to_node[i] = 0; - for (i = 0; i < num_nodes; i++) __acpi_map_pxm_to_node(fake_node_to_pxm_map[i], i); - memcpy(__apicid_to_node, fake_apicid_to_node, sizeof(__apicid_to_node)); for (i = 0; i < num_nodes; i++) if (fake_nodes[i].start != fake_nodes[i].end) From e23bba604433a202cd301a976454a90ea6b783ef Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 16 Feb 2011 17:11:10 +0100 Subject: [PATCH 290/706] x86-64, NUMA: Unify emulated distance mapping NUMA emulation needs to update node distance information. It did it by remapping apicid to PXM mapping, even when amdtopology is being used. There is no reason to go through such convolution. The generic code has all the information necessary to transform the distance table to the emulated nid space. Implement generic distance table transformation in numa_emulation() and drop private implementations in srat_64 and amdtopology_64. This makes find_node_by_addr() and fake_physnodes() and related functions unnecessary, drop them. Signed-off-by: Tejun Heo Cc: Yinghai Lu Cc: Brian Gerst Cc: Cyrill Gorcunov Cc: Shaohui Zheng Cc: David Rientjes Cc: Ingo Molnar Cc: H. 
Peter Anvin --- arch/x86/include/asm/acpi.h | 6 -- arch/x86/include/asm/amd_nb.h | 4 -- arch/x86/include/asm/numa_64.h | 1 - arch/x86/mm/amdtopology_64.c | 38 ------------ arch/x86/mm/numa_64.c | 102 +++++++++++++-------------------- arch/x86/mm/srat_64.c | 65 --------------------- 6 files changed, 40 insertions(+), 176 deletions(-) diff --git a/arch/x86/include/asm/acpi.h b/arch/x86/include/asm/acpi.h index 9c9fe1b0bc4e..a37da6df07f9 100644 --- a/arch/x86/include/asm/acpi.h +++ b/arch/x86/include/asm/acpi.h @@ -186,12 +186,6 @@ struct bootnode; #ifdef CONFIG_ACPI_NUMA extern int acpi_numa; extern int x86_acpi_numa_init(void); - -#ifdef CONFIG_NUMA_EMU -extern void acpi_fake_nodes(const struct bootnode *fake_nodes, - int num_nodes); -extern int acpi_emu_node_distance(int a, int b); -#endif #endif /* CONFIG_ACPI_NUMA */ #define acpi_unlazy_tlb(x) leave_mm(x) diff --git a/arch/x86/include/asm/amd_nb.h b/arch/x86/include/asm/amd_nb.h index 384d1188e787..e264ae5a1443 100644 --- a/arch/x86/include/asm/amd_nb.h +++ b/arch/x86/include/asm/amd_nb.h @@ -20,10 +20,6 @@ extern int amd_numa_init(void); extern int amd_get_subcaches(int); extern int amd_set_subcaches(int, int); -#ifdef CONFIG_NUMA_EMU -extern void amd_fake_nodes(const struct bootnode *nodes, int nr_nodes); -#endif - struct amd_northbridge { struct pci_dev *misc; struct pci_dev *link; diff --git a/arch/x86/include/asm/numa_64.h b/arch/x86/include/asm/numa_64.h index 5361c5947986..344eb1790b46 100644 --- a/arch/x86/include/asm/numa_64.h +++ b/arch/x86/include/asm/numa_64.h @@ -34,7 +34,6 @@ extern void __init numa_set_distance(int from, int to, int distance); #define FAKE_NODE_MIN_SIZE ((u64)32 << 20) #define FAKE_NODE_MIN_HASH_MASK (~(FAKE_NODE_MIN_SIZE - 1UL)) void numa_emu_cmdline(char *); -int __init find_node_by_addr(unsigned long addr); #endif /* CONFIG_NUMA_EMU */ #else static inline int numa_cpu_node(int cpu) { return NUMA_NO_NODE; } diff --git a/arch/x86/mm/amdtopology_64.c b/arch/x86/mm/amdtopology_64.c index f37ea2fe85e6..0919c26820d4 100644 --- a/arch/x86/mm/amdtopology_64.c +++ b/arch/x86/mm/amdtopology_64.c @@ -194,41 +194,3 @@ int __init amd_numa_init(void) return 0; } - -#ifdef CONFIG_NUMA_EMU -/* - * For NUMA emulation, fake proximity domain (_PXM) to node id mappings must be - * setup to represent the physical topology but reflect the emulated - * environment. For each emulated node, the real node which it appears on is - * found and a fake pxm to nid mapping is created which mirrors the actual - * locality. node_distance() then represents the correct distances between - * emulated nodes by using the fake acpi mappings to pxms. 
- */ -void __init amd_fake_nodes(const struct bootnode *nodes, int nr_nodes) -{ - unsigned int bits; - unsigned int cores; - unsigned int apicid_base = 0; - int i; - - bits = boot_cpu_data.x86_coreid_bits; - cores = 1 << bits; - early_get_boot_cpu_id(); - if (boot_cpu_physical_apicid > 0) - apicid_base = boot_cpu_physical_apicid; - - for (i = 0; i < nr_nodes; i++) { - int index; - int nid; - - nid = find_node_by_addr(nodes[i].start); - if (nid == NUMA_NO_NODE) - continue; - - index = nodeids[nid] << bits; -#ifdef CONFIG_ACPI_NUMA - __acpi_map_pxm_to_node(nid, i); -#endif - } -} -#endif /* CONFIG_NUMA_EMU */ diff --git a/arch/x86/mm/numa_64.c b/arch/x86/mm/numa_64.c index 722039e0948f..8ce617735900 100644 --- a/arch/x86/mm/numa_64.c +++ b/arch/x86/mm/numa_64.c @@ -48,10 +48,6 @@ static struct numa_meminfo numa_meminfo __initdata; static int numa_distance_cnt; static u8 *numa_distance; -#ifdef CONFIG_NUMA_EMU -static bool numa_emu_dist; -#endif - /* * Given a shift value, try to populate memnodemap[] * Returns : @@ -443,10 +439,6 @@ void __init numa_set_distance(int from, int to, int distance) int __node_distance(int from, int to) { -#if defined(CONFIG_ACPI_NUMA) && defined(CONFIG_NUMA_EMU) - if (numa_emu_dist) - return acpi_emu_node_distance(from, to); -#endif if (from >= numa_distance_cnt || to >= numa_distance_cnt) return from == to ? LOCAL_DISTANCE : REMOTE_DISTANCE; return numa_distance[from * numa_distance_cnt + to]; @@ -559,56 +551,6 @@ static int __init emu_find_memblk_by_nid(int nid, const struct numa_meminfo *mi) return -ENOENT; } -int __init find_node_by_addr(unsigned long addr) -{ - const struct numa_meminfo *mi = &numa_meminfo; - int i; - - for (i = 0; i < mi->nr_blks; i++) { - /* - * Find the real node that this emulated node appears on. For - * the sake of simplicity, we only use a real node's starting - * address to determine which emulated node it appears on. - */ - if (addr >= mi->blk[i].start && addr < mi->blk[i].end) - return mi->blk[i].nid; - } - return NUMA_NO_NODE; -} - -static void __init fake_physnodes(int acpi, int amd, - const struct numa_meminfo *ei) -{ - static struct bootnode nodes[MAX_NUMNODES] __initdata; - int i, nr_nodes = 0; - - for (i = 0; i < ei->nr_blks; i++) { - int nid = ei->blk[i].nid; - - if (nodes[nid].start == nodes[nid].end) { - nodes[nid].start = ei->blk[i].start; - nodes[nid].end = ei->blk[i].end; - nr_nodes++; - } else { - nodes[nid].start = min(ei->blk[i].start, nodes[nid].start); - nodes[nid].end = max(ei->blk[i].end, nodes[nid].end); - } - } - - BUG_ON(acpi && amd); -#ifdef CONFIG_ACPI_NUMA - if (acpi) - acpi_fake_nodes(nodes, nr_nodes); -#endif -#ifdef CONFIG_AMD_NUMA - if (amd) - amd_fake_nodes(nodes, nr_nodes); -#endif - if (!acpi && !amd) - for (i = 0; i < nr_cpu_ids; i++) - numa_set_node(i, 0); -} - /* * Sets up nid to range from @start to @end. The return value is -errno if * something went wrong, 0 otherwise. @@ -853,11 +795,13 @@ static int __init split_nodes_size_interleave(struct numa_meminfo *ei, * Sets up the system RAM area from start_pfn to last_pfn according to the * numa=fake command-line option. 
*/ -static bool __init numa_emulation(int acpi, int amd) +static bool __init numa_emulation(void) { static struct numa_meminfo ei __initdata; static struct numa_meminfo pi __initdata; const u64 max_addr = max_pfn << PAGE_SHIFT; + int phys_dist_cnt = numa_distance_cnt; + u8 *phys_dist = NULL; int i, j, ret; memset(&ei, 0, sizeof(ei)); @@ -891,6 +835,25 @@ static bool __init numa_emulation(int acpi, int amd) return false; } + /* + * Copy the original distance table. It's temporary so no need to + * reserve it. + */ + if (phys_dist_cnt) { + size_t size = phys_dist_cnt * sizeof(numa_distance[0]); + u64 phys; + + phys = memblock_find_in_range(0, + (u64)max_pfn_mapped << PAGE_SHIFT, + size, PAGE_SIZE); + if (phys == MEMBLOCK_ERROR) { + pr_warning("NUMA: Warning: can't allocate copy of distance table, disabling emulation\n"); + return false; + } + phys_dist = __va(phys); + memcpy(phys_dist, numa_distance, size); + } + /* commit */ numa_meminfo = ei; @@ -913,8 +876,23 @@ static bool __init numa_emulation(int acpi, int amd) if (emu_nid_to_phys[i] == NUMA_NO_NODE) emu_nid_to_phys[i] = 0; - fake_physnodes(acpi, amd, &ei); - numa_emu_dist = true; + /* transform distance table */ + numa_reset_distance(); + for (i = 0; i < MAX_NUMNODES; i++) { + for (j = 0; j < MAX_NUMNODES; j++) { + int physi = emu_nid_to_phys[i]; + int physj = emu_nid_to_phys[j]; + int dist; + + if (physi >= phys_dist_cnt || physj >= phys_dist_cnt) + dist = physi == physj ? + LOCAL_DISTANCE : REMOTE_DISTANCE; + else + dist = phys_dist[physi * phys_dist_cnt + physj]; + + numa_set_distance(i, j, dist); + } + } return true; } #endif /* CONFIG_NUMA_EMU */ @@ -970,7 +948,7 @@ void __init initmem_init(void) * If requested, try emulation. If emulation is not used, * build identity emu_nid_to_phys[] for numa_add_cpu() */ - if (!emu_cmdline || !numa_emulation(i == 0, i == 1)) + if (!emu_cmdline || !numa_emulation()) for (j = 0; j < ARRAY_SIZE(emu_nid_to_phys); j++) emu_nid_to_phys[j] = j; #endif diff --git a/arch/x86/mm/srat_64.c b/arch/x86/mm/srat_64.c index d4fbfea53543..8e9d3394f6d4 100644 --- a/arch/x86/mm/srat_64.c +++ b/arch/x86/mm/srat_64.c @@ -26,8 +26,6 @@ int acpi_numa __initdata; -static struct acpi_table_slit *acpi_slit; - static struct bootnode nodes_add[MAX_NUMNODES]; static __init int setup_node(int pxm) @@ -51,25 +49,11 @@ static __init inline int srat_disabled(void) void __init acpi_numa_slit_init(struct acpi_table_slit *slit) { int i, j; - unsigned length; - unsigned long phys; for (i = 0; i < slit->locality_count; i++) for (j = 0; j < slit->locality_count; j++) numa_set_distance(pxm_to_node(i), pxm_to_node(j), slit->entry[slit->locality_count * i + j]); - - /* acpi_slit is used only by emulation */ - length = slit->header.length; - phys = memblock_find_in_range(0, max_pfn_mapped< x2APIC mapping */ @@ -261,55 +245,6 @@ int __init x86_acpi_numa_init(void) return srat_disabled() ? -EINVAL : 0; } -#ifdef CONFIG_NUMA_EMU -static int fake_node_to_pxm_map[MAX_NUMNODES] __initdata = { - [0 ... MAX_NUMNODES-1] = PXM_INVAL -}; - -/* - * In NUMA emulation, we need to setup proximity domain (_PXM) to node ID - * mappings that respect the real ACPI topology but reflect our emulated - * environment. For each emulated node, we find which real node it appears on - * and create PXM to NID mappings for those fake nodes which mirror that - * locality. SLIT will now represent the correct distances between emulated - * nodes as a result of the real topology. 
- */ -void __init acpi_fake_nodes(const struct bootnode *fake_nodes, int num_nodes) -{ - int i; - - for (i = 0; i < num_nodes; i++) { - int nid, pxm; - - nid = find_node_by_addr(fake_nodes[i].start); - if (nid == NUMA_NO_NODE) - continue; - pxm = node_to_pxm(nid); - if (pxm == PXM_INVAL) - continue; - fake_node_to_pxm_map[i] = pxm; - } - - for (i = 0; i < num_nodes; i++) - __acpi_map_pxm_to_node(fake_node_to_pxm_map[i], i); - - for (i = 0; i < num_nodes; i++) - if (fake_nodes[i].start != fake_nodes[i].end) - node_set(i, numa_nodes_parsed); -} - -int acpi_emu_node_distance(int a, int b) -{ - int index; - - if (!acpi_slit) - return node_to_pxm(a) == node_to_pxm(b) ? - LOCAL_DISTANCE : REMOTE_DISTANCE; - index = acpi_slit->locality_count * node_to_pxm(a); - return acpi_slit->entry[index + node_to_pxm(b)]; -} -#endif /* CONFIG_NUMA_EMU */ - #if defined(CONFIG_MEMORY_HOTPLUG_SPARSE) || defined(CONFIG_ACPI_HOTPLUG_MEMORY) int memory_add_physaddr_to_nid(u64 start) { From 5c35d69fb60b1dc49595f5b9a2c7158283e9eaf3 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Wed, 9 Feb 2011 11:38:43 -0200 Subject: [PATCH 291/706] perf ui: Serialize screen updates The ui operations so far were used by just one thread, but 'perf top --tui' now has two threads updating the screen, so we need to use a mutex to avoid garbling the screen. Cc: Frederic Weisbecker Cc: Ingo Molnar Cc: Mike Galbraith Cc: Paul Mackerras Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Tom Zanussi LKML-Reference: Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/Makefile | 1 + tools/perf/util/ui/browser.c | 7 +++++++ tools/perf/util/ui/helpline.c | 5 ++++- tools/perf/util/ui/setup.c | 3 +++ tools/perf/util/ui/ui.h | 8 ++++++++ 5 files changed, 23 insertions(+), 1 deletion(-) create mode 100644 tools/perf/util/ui/ui.h diff --git a/tools/perf/Makefile b/tools/perf/Makefile index bc4d9bf8a556..ffd1047cd93b 100644 --- a/tools/perf/Makefile +++ b/tools/perf/Makefile @@ -637,6 +637,7 @@ else LIB_H += util/ui/libslang.h LIB_H += util/ui/progress.h LIB_H += util/ui/util.h + LIB_H += util/ui/ui.h endif endif diff --git a/tools/perf/util/ui/browser.c b/tools/perf/util/ui/browser.c index 8bc010edca25..60d6c815e1db 100644 --- a/tools/perf/util/ui/browser.c +++ b/tools/perf/util/ui/browser.c @@ -1,4 +1,5 @@ #include "libslang.h" +#include "ui.h" #include #include #include @@ -178,6 +179,7 @@ int ui_browser__show(struct ui_browser *self, const char *title, if (self->sb == NULL) return -1; + pthread_mutex_lock(&ui__lock); SLsmg_gotorc(0, 0); ui_browser__set_color(self, NEWT_COLORSET_ROOT); slsmg_write_nstring(title, self->width); @@ -188,25 +190,30 @@ int ui_browser__show(struct ui_browser *self, const char *title, va_start(ap, helpline); ui_helpline__vpush(helpline, ap); va_end(ap); + pthread_mutex_unlock(&ui__lock); return 0; } void ui_browser__hide(struct ui_browser *self) { + pthread_mutex_lock(&ui__lock); newtFormDestroy(self->form); self->form = NULL; ui_helpline__pop(); + pthread_mutex_unlock(&ui__lock); } int ui_browser__refresh(struct ui_browser *self) { int row; + pthread_mutex_lock(&ui__lock); newtScrollbarSet(self->sb, self->index, self->nr_entries - 1); row = self->refresh(self); ui_browser__set_color(self, HE_COLORSET_NORMAL); SLsmg_fill_region(self->y + row, self->x, self->height - row, self->width, ' '); + pthread_mutex_unlock(&ui__lock); return 0; } diff --git a/tools/perf/util/ui/helpline.c b/tools/perf/util/ui/helpline.c index 8d79daa4458a..f36d2ff509ed 100644 --- a/tools/perf/util/ui/helpline.c +++ 
b/tools/perf/util/ui/helpline.c @@ -5,6 +5,7 @@ #include "../debug.h" #include "helpline.h" +#include "ui.h" void ui_helpline__pop(void) { @@ -55,7 +56,8 @@ int ui_helpline__show_help(const char *format, va_list ap) int ret; static int backlog; - ret = vsnprintf(ui_helpline__last_msg + backlog, + pthread_mutex_lock(&ui__lock); + ret = vsnprintf(ui_helpline__last_msg + backlog, sizeof(ui_helpline__last_msg) - backlog, format, ap); backlog += ret; @@ -64,6 +66,7 @@ int ui_helpline__show_help(const char *format, va_list ap) newtRefresh(); backlog = 0; } + pthread_mutex_unlock(&ui__lock); return ret; } diff --git a/tools/perf/util/ui/setup.c b/tools/perf/util/ui/setup.c index fbf1a145492f..ee46d671db59 100644 --- a/tools/perf/util/ui/setup.c +++ b/tools/perf/util/ui/setup.c @@ -6,6 +6,9 @@ #include "../debug.h" #include "browser.h" #include "helpline.h" +#include "ui.h" + +pthread_mutex_t ui__lock = PTHREAD_MUTEX_INITIALIZER; static void newt_suspend(void *d __used) { diff --git a/tools/perf/util/ui/ui.h b/tools/perf/util/ui/ui.h new file mode 100644 index 000000000000..d264e059c829 --- /dev/null +++ b/tools/perf/util/ui/ui.h @@ -0,0 +1,8 @@ +#ifndef _PERF_UI_H_ +#define _PERF_UI_H_ 1 + +#include + +extern pthread_mutex_t ui__lock; + +#endif /* _PERF_UI_H_ */ From 289c082044643e55f65c6a16bb3621cf3f35a454 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Wed, 9 Feb 2011 13:56:28 -0200 Subject: [PATCH 292/706] perf annotate: Check if offset is less than symbol size Just like done on symbol__inc_addr_samples to catch misparsed offsets from objdump. Cc: Frederic Weisbecker Cc: Ingo Molnar Cc: Mike Galbraith Cc: Paul Mackerras Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Tom Zanussi LKML-Reference: Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/annotate.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tools/perf/util/annotate.c b/tools/perf/util/annotate.c index 02976b895f27..70ec422ddb64 100644 --- a/tools/perf/util/annotate.c +++ b/tools/perf/util/annotate.c @@ -541,11 +541,12 @@ void symbol__annotate_decay_histogram(struct symbol *sym, int evidx) struct annotation *notes = symbol__annotation(sym); struct sym_hist *h = annotation__histogram(notes, evidx); struct objdump_line *pos; + int len = sym->end - sym->start; h->sum = 0; list_for_each_entry(pos, ¬es->src->source, node) { - if (pos->offset != -1) { + if (pos->offset != -1 && pos->offset < len) { h->addr[pos->offset] = h->addr[pos->offset] * 7 / 8; h->sum += h->addr[pos->offset]; } From b99976e2d277c963138e090ae17bf835f8a07680 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Wed, 9 Feb 2011 13:59:14 -0200 Subject: [PATCH 293/706] perf annotate browser: Use the percent color for the whole line Not just for the percentage number, to see the hot lines more easily. 
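[ note: the hunk below works by moving the reset to HE_COLORSET_CODE from
right after the percentage is printed to after the source line has been
written, so the whole row inherits the percent color.  A condensed sketch
of the resulting write path, with the zero-percent branch elided: ]

	ui_browser__set_percent_color(self, olrb->percent, current_entry);
	slsmg_printf(" %7.2f ", olrb->percent);
	slsmg_write_nstring(ol->line, width - 18);  /* still in percent color */
	if (!current_entry)
		ui_browser__set_color(self, HE_COLORSET_CODE);  /* reset last */
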
Cc: Frederic Weisbecker Cc: Ingo Molnar Cc: Mike Galbraith Cc: Paul Mackerras Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Tom Zanussi LKML-Reference: Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/ui/browsers/annotate.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/tools/perf/util/ui/browsers/annotate.c b/tools/perf/util/ui/browsers/annotate.c index 1aa39658539c..cfb5a27f15d2 100644 --- a/tools/perf/util/ui/browsers/annotate.c +++ b/tools/perf/util/ui/browsers/annotate.c @@ -44,8 +44,6 @@ static void annotate_browser__write(struct ui_browser *self, void *entry, int ro struct objdump_line_rb_node *olrb = objdump_line__rb(ol); ui_browser__set_percent_color(self, olrb->percent, current_entry); slsmg_printf(" %7.2f ", olrb->percent); - if (!current_entry) - ui_browser__set_color(self, HE_COLORSET_CODE); } else { ui_browser__set_percent_color(self, 0, current_entry); slsmg_write_nstring(" ", 9); @@ -57,6 +55,9 @@ static void annotate_browser__write(struct ui_browser *self, void *entry, int ro slsmg_write_nstring(" ", width - 18); else slsmg_write_nstring(ol->line, width - 18); + + if (!current_entry) + ui_browser__set_color(self, HE_COLORSET_CODE); } static double objdump_line__calc_percent(struct objdump_line *self, From 4187e262bc90369ba581ee28ec74ed416618889e Mon Sep 17 00:00:00 2001 From: Jesse Brandeburg Date: Wed, 9 Feb 2011 17:11:00 -0800 Subject: [PATCH 294/706] perf tools: Update Makefile with some help The perf makefile is nicely complete except for a) an uninstall option b) a 'make help' description This patch implements b) it also comments out other non-working makefile targets Signed-off-by: Jesse Brandeburg LKML-Reference: Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/Documentation/Makefile | 19 ++++++++++--------- tools/perf/Makefile | 30 ++++++++++++++++++++++++++++++ 2 files changed, 40 insertions(+), 9 deletions(-) diff --git a/tools/perf/Documentation/Makefile b/tools/perf/Documentation/Makefile index bd498d496952..4626a398836a 100644 --- a/tools/perf/Documentation/Makefile +++ b/tools/perf/Documentation/Makefile @@ -178,8 +178,8 @@ install-pdf: pdf $(INSTALL) -d -m 755 $(DESTDIR)$(pdfdir) $(INSTALL) -m 644 user-manual.pdf $(DESTDIR)$(pdfdir) -install-html: html - '$(SHELL_PATH_SQ)' ./install-webdoc.sh $(DESTDIR)$(htmldir) +#install-html: html +# '$(SHELL_PATH_SQ)' ./install-webdoc.sh $(DESTDIR)$(htmldir) ../PERF-VERSION-FILE: .FORCE-PERF-VERSION-FILE $(QUIET_SUBDIR0)../ $(QUIET_SUBDIR1) PERF-VERSION-FILE @@ -288,15 +288,16 @@ $(patsubst %.txt,%.html,$(wildcard howto/*.txt)): %.html : %.txt sed -e '1,/^$$/d' $< | $(ASCIIDOC) -b xhtml11 - >$@+ && \ mv $@+ $@ -install-webdoc : html - '$(SHELL_PATH_SQ)' ./install-webdoc.sh $(WEBDOC_DEST) +# UNIMPLEMENTED +#install-webdoc : html +# '$(SHELL_PATH_SQ)' ./install-webdoc.sh $(WEBDOC_DEST) -quick-install: quick-install-man +# quick-install: quick-install-man -quick-install-man: - '$(SHELL_PATH_SQ)' ./install-doc-quick.sh $(DOC_REF) $(DESTDIR)$(mandir) +# quick-install-man: +# '$(SHELL_PATH_SQ)' ./install-doc-quick.sh $(DOC_REF) $(DESTDIR)$(mandir) -quick-install-html: - '$(SHELL_PATH_SQ)' ./install-doc-quick.sh $(HTML_REF) $(DESTDIR)$(htmldir) +#quick-install-html: +# '$(SHELL_PATH_SQ)' ./install-doc-quick.sh $(HTML_REF) $(DESTDIR)$(htmldir) .PHONY: .FORCE-PERF-VERSION-FILE diff --git a/tools/perf/Makefile b/tools/perf/Makefile index ffd1047cd93b..7c75f1d45f59 100644 --- a/tools/perf/Makefile +++ b/tools/perf/Makefile @@ -1102,6 +1102,36 @@ $(sort $(dir $(DIRECTORY_DEPS))): $(LIB_FILE): 
$(LIB_OBJS) $(QUIET_AR)$(RM) $@ && $(AR) rcs $@ $(LIB_OBJS) +help: + @echo 'Perf make targets:' + @echo ' doc - make *all* documentation (see below)' + @echo ' man - make manpage documentation (access with man )' + @echo ' html - make html documentation' + @echo ' info - make GNU info documentation (access with info )' + @echo ' pdf - make pdf documentation' + @echo ' TAGS - use etags to make tag information for source browsing' + @echo ' tags - use ctags to make tag information for source browsing' + @echo ' cscope - use cscope to make interactive browsing database' + @echo '' + @echo 'Perf install targets:' + @echo ' NOTE: documentation build requires asciidoc, xmlto packages to be installed' + @echo ' HINT: use "make prefix= " to install to a particular' + @echo ' path like make prefix=/usr/local install install-doc' + @echo ' install - install compiled binaries' + @echo ' install-doc - install *all* documentation' + @echo ' install-man - install manpage documentation' + @echo ' install-html - install html documentation' + @echo ' install-info - install GNU info documentation' + @echo ' install-pdf - install pdf documentation' + @echo '' + @echo ' quick-install-doc - alias for quick-install-man' + @echo ' quick-install-man - install the documentation quickly' + @echo ' quick-install-html - install the html documentation quickly' + @echo '' + @echo 'Perf maintainer targets:' + @echo ' distclean - alias to clean' + @echo ' clean - clean all binary objects and build output' + doc: $(MAKE) -C Documentation all From e116dfa1c357da49f55e1555767ec991225a8321 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Thu, 10 Feb 2011 18:08:10 +0900 Subject: [PATCH 295/706] perf probe: Support function@filename syntax for --line Since "perf probe --add" supports function@filename syntax, --line option should also support it. Cc: 2nddept-manager@sdl.hitachi.co.jp Cc: Franck Bui-Huu Cc: Ingo Molnar Cc: Paul Mackerras Cc: Peter Zijlstra Cc: linux-kernel@vger.kernel.org LKML-Reference: <20110210090810.1809.26913.stgit@ltc236.sdl.hitachi.co.jp> Signed-off-by: Masami Hiramatsu Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/Documentation/perf-probe.txt | 7 ++++--- tools/perf/util/probe-event.c | 15 ++++++++++++--- 2 files changed, 16 insertions(+), 6 deletions(-) diff --git a/tools/perf/Documentation/perf-probe.txt b/tools/perf/Documentation/perf-probe.txt index 81c3220e04f3..02bafce4b341 100644 --- a/tools/perf/Documentation/perf-probe.txt +++ b/tools/perf/Documentation/perf-probe.txt @@ -16,7 +16,7 @@ or or 'perf probe' --list or -'perf probe' [options] --line='FUNC[:RLN[+NUM|:RLN2]]|SRC:ALN[+NUM|:ALN2]' +'perf probe' [options] --line='LINE' or 'perf probe' [options] --vars='PROBEPOINT' @@ -128,13 +128,14 @@ LINE SYNTAX ----------- Line range is described by following syntax. - "FUNC[:RLN[+NUM|-RLN2]]|SRC[:ALN[+NUM|-ALN2]]" + "FUNC[@SRC][:RLN[+NUM|-RLN2]]|SRC[:ALN[+NUM|-ALN2]]" FUNC specifies the function name of showing lines. 'RLN' is the start line number from function entry line, and 'RLN2' is the end line number. As same as probe syntax, 'SRC' means the source file path, 'ALN' is start line number, and 'ALN2' is end line number in the file. It is also possible to specify how -many lines to show by using 'NUM'. +many lines to show by using 'NUM'. Moreover, 'FUNC@SRC' combination is good +for searching a specific function when several functions share same name. So, "source.c:100-120" shows lines between 100th to l20th in source.c file. 
And "func:10+20" shows 20 lines from 10th line of func function. LAZY MATCHING diff --git a/tools/perf/util/probe-event.c b/tools/perf/util/probe-event.c index 9d237e3cff5d..cbd76507a56f 100644 --- a/tools/perf/util/probe-event.c +++ b/tools/perf/util/probe-event.c @@ -595,11 +595,11 @@ static int parse_line_num(char **ptr, int *val, const char *what) * The line range syntax is described by: * * SRC[:SLN[+NUM|-ELN]] - * FNC[:SLN[+NUM|-ELN]] + * FNC[@SRC][:SLN[+NUM|-ELN]] */ int parse_line_range_desc(const char *arg, struct line_range *lr) { - char *range, *name = strdup(arg); + char *range, *file, *name = strdup(arg); int err; if (!name) @@ -649,7 +649,16 @@ int parse_line_range_desc(const char *arg, struct line_range *lr) } } - if (strchr(name, '.')) + file = strchr(name, '@'); + if (file) { + *file = '\0'; + lr->file = strdup(++file); + if (lr->file == NULL) { + err = -ENOMEM; + goto err; + } + lr->function = name; + } else if (strchr(name, '.')) lr->file = name; else lr->function = name; From 8737ebdea02315eaffaebb3b73d55f2f726a4fe0 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Thu, 10 Feb 2011 18:08:16 +0900 Subject: [PATCH 296/706] perf probe: Show filename which contains target function Show filename which contains a target function with the function name on "--lines" mode, because perf-probe just shows the first function even if there are many same-name functions. Originally adopted by Franck Bui-Huu's patch which shows file name instead of function name. I've just modified it to show both of function name and file name, because of completeness of output. E.g.) $ perf probe -L t_show 0 static int t_show(struct seq_file *m, void *v) 1 { 2 struct ftrace_iterator *iter = m->private; ... $ perf probe -L t_show@trace/trace.c 0 static int t_show(struct seq_file *m, void *v) 1 { struct tracer *t = v; ... 
Original-patch-by: Franck Bui-Huu Cc: 2nddept-manager@sdl.hitachi.co.jp Cc: Franck Bui-Huu Cc: Ingo Molnar Cc: Paul Mackerras Cc: Peter Zijlstra LKML-Reference: <20110210090816.1809.43426.stgit@ltc236.sdl.hitachi.co.jp> Signed-off-by: Masami Hiramatsu Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/probe-event.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/perf/util/probe-event.c b/tools/perf/util/probe-event.c index cbd76507a56f..0e3ea1321103 100644 --- a/tools/perf/util/probe-event.c +++ b/tools/perf/util/probe-event.c @@ -409,7 +409,7 @@ int show_line_range(struct line_range *lr, const char *module) setup_pager(); if (lr->function) - fprintf(stdout, "<%s:%d>\n", lr->function, + fprintf(stdout, "<%s@%s:%d>\n", lr->function, lr->path, lr->start - lr->offset); else fprintf(stdout, "<%s:%d>\n", lr->path, lr->start); From 4498062e72fd55b2a9a4ac1b44fab8cb44ad5367 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Thu, 17 Feb 2011 10:07:42 -0200 Subject: [PATCH 297/706] perf python: Add cgroup.c to setup.py to get it building again The 023695d cset added a new file, util/cgroup.c, that is referenced from util/evsel.c, so it needs to be present in util/setup.py so that the python shared object binding works, fixing this: [root@emilia linux]# export PYTHONPATH=~acme/git/build/perf/python/ [root@emilia linux]# ./tools/perf/python/twatch.py Traceback (most recent call last): File "./tools/perf/python/twatch.py", line 16, in import perf ImportError: /home/acme/git/build/perf/python/perf.so: undefined symbol: close_cgroup Cc: Frederic Weisbecker Cc: Ingo Molnar Cc: Mike Galbraith Cc: Paul Mackerras Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Tom Zanussi LKML-Reference: Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/perf/util/setup.py b/tools/perf/util/setup.py index 1947b0430c94..e24ffadb20b2 100644 --- a/tools/perf/util/setup.py +++ b/tools/perf/util/setup.py @@ -5,7 +5,7 @@ from distutils.core import setup, Extension perf = Extension('perf', sources = ['util/python.c', 'util/ctype.c', 'util/evlist.c', 'util/evsel.c', 'util/cpumap.c', 'util/thread_map.c', - 'util/util.c', 'util/xyarray.c'], + 'util/util.c', 'util/xyarray.c', 'util/cgroup.c'], include_dirs = ['util/include'], extra_compile_args = ['-fno-strict-aliasing', '-Wno-write-strings']) From f0c55bcf4aa41b4b1dbee826513b1acb01bf65e1 Mon Sep 17 00:00:00 2001 From: Stephane Eranian Date: Wed, 16 Feb 2011 15:10:01 +0200 Subject: [PATCH 298/706] perf: make perf stat print user provided full event names This patch changes the way perf stat prints event names at the end of a run. Until now, it was trying to reconstruct the event name from its encoding. The problem is that it would only print generic events without their modifiers (u, k, pp). This patch saves the event name as passed by the user in the evsel struct and uses it to print the final event name. This would also work in case perf is linked with a library (such as libpfm4) which provides full PMU event tables. $ perf stat -e cycles:u,cycles:k date Wed Feb 16 14:58:52 CET 2011 Performance counter stats for 'date': 568600 cycles:u 2779715 cycles:k 0.001908182 seconds time elapsed Cc: Arun Sharma Cc: David S. 
Miller Cc: Frederic Weisbecker Cc: Ingo Molnar Cc: Paul Mackerras Cc: Peter Zijlstra Cc: Robert Richter Cc: Stephane Eranian LPU-Reference: <4d5bdc64.98a1df0a.7aa3.06c2@mx.google.com> Signed-off-by: Stephane Eranian [ committer note: Fixed a merge problem with 023695d "Add cgroup support" ] Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/evsel.c | 1 + tools/perf/util/evsel.h | 7 +++++++ tools/perf/util/parse-events.c | 10 ++++++++++ 3 files changed, 18 insertions(+) diff --git a/tools/perf/util/evsel.c b/tools/perf/util/evsel.c index c974e08d07ab..63cadaf3e208 100644 --- a/tools/perf/util/evsel.c +++ b/tools/perf/util/evsel.c @@ -86,6 +86,7 @@ void perf_evsel__delete(struct perf_evsel *evsel) { perf_evsel__exit(evsel); close_cgroup(evsel->cgrp); + free(evsel->name); free(evsel); } diff --git a/tools/perf/util/evsel.h b/tools/perf/util/evsel.h index 1d3d5a3dbe60..f6fc8f651a25 100644 --- a/tools/perf/util/evsel.h +++ b/tools/perf/util/evsel.h @@ -37,6 +37,12 @@ struct perf_sample_id { struct perf_evsel *evsel; }; +/** struct perf_evsel - event selector + * + * @name - Can be set to retain the original event name passed by the user, + * so that when showing results in tools such as 'perf stat', we + * show the name used, not some alias. + */ struct perf_evsel { struct list_head node; struct perf_event_attr attr; @@ -45,6 +51,7 @@ struct perf_evsel { struct xyarray *id; struct perf_counts *counts; int idx; + char *name; void *priv; struct cgroup_sel *cgrp; }; diff --git a/tools/perf/util/parse-events.c b/tools/perf/util/parse-events.c index cf082daa43e3..80a3dd5ef573 100644 --- a/tools/perf/util/parse-events.c +++ b/tools/perf/util/parse-events.c @@ -268,6 +268,9 @@ const char *event_name(struct perf_evsel *evsel) u64 config = evsel->attr.config; int type = evsel->attr.type; + if (evsel->name) + return evsel->name; + return __event_name(type, config); } @@ -782,8 +785,10 @@ int parse_events(const struct option *opt, const char *str, int unset __used) struct perf_evlist *evlist = *(struct perf_evlist **)opt->value; struct perf_event_attr attr; enum event_result ret; + const char *ostr; for (;;) { + ostr = str; memset(&attr, 0, sizeof(attr)); ret = parse_event_symbols(opt, &str, &attr); if (ret == EVT_FAILED) @@ -798,6 +803,11 @@ int parse_events(const struct option *opt, const char *str, int unset __used) if (evsel == NULL) return -1; perf_evlist__add(evlist, evsel); + + evsel->name = calloc(str - ostr + 1, 1); + if (!evsel->name) + return -1; + strncpy(evsel->name, ostr, str - ostr); } if (*str == 0) From da1016df85ed67b6f7dbb765532c54bc35ba08d7 Mon Sep 17 00:00:00 2001 From: Akinobu Mita Date: Wed, 16 Feb 2011 23:48:35 +0900 Subject: [PATCH 299/706] x86: Use bitmap library functions Use bitmap_set()/bitmap_clear() to fill/zero a region of a bitmap instead of doing set_bit()/clear_bit() each bit. This change has been tested with ioperm() and there's no change in behavior. Signed-off-by: Akinobu Mita LKML-Reference: <1297867715-20394-1-git-send-email-akinobu.mita@gmail.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/ioport.c | 20 +++++--------------- 1 file changed, 5 insertions(+), 15 deletions(-) diff --git a/arch/x86/kernel/ioport.c b/arch/x86/kernel/ioport.c index 8eec0ec59af2..8c968974253d 100644 --- a/arch/x86/kernel/ioport.c +++ b/arch/x86/kernel/ioport.c @@ -14,22 +14,9 @@ #include #include #include +#include #include -/* Set EXTENT bits starting at BASE in BITMAP to value TURN_ON. 
*/ -static void set_bitmap(unsigned long *bitmap, unsigned int base, - unsigned int extent, int new_value) -{ - unsigned int i; - - for (i = base; i < base + extent; i++) { - if (new_value) - __set_bit(i, bitmap); - else - __clear_bit(i, bitmap); - } -} - /* * this changes the io permissions bitmap in the current task. */ @@ -69,7 +56,10 @@ asmlinkage long sys_ioperm(unsigned long from, unsigned long num, int turn_on) */ tss = &per_cpu(init_tss, get_cpu()); - set_bitmap(t->io_bitmap_ptr, from, num, !turn_on); + if (turn_on) + bitmap_clear(t->io_bitmap_ptr, from, num); + else + bitmap_set(t->io_bitmap_ptr, from, num); /* * Search for a (possibly new) maximum. This is simple and stupid, From 2ca230baeb7c61864cab9b53e37a3da28a2ca7e5 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Thu, 17 Feb 2011 14:46:37 +0100 Subject: [PATCH 300/706] x86-64, NUMA: Don't call __pa() with invalid address in numa_reset_distance() Do not call __pa(numa_distance) if it was not allocated before. Calling with invalid address triggers VIRTUAL_BUG_ON() in __phys_addr() if CONFIG_DEBUG_VIRTUAL. Also reported by Ingo. http://thread.gmane.org/gmane.linux.kernel/1101306/focus=1101785 - v2: Change to check existing path as tj requested. - tj: Description update. Signed-off-by: Yinghai Lu Signed-off-by: Tejun Heo Reported-by: Ingo Molnar --- arch/x86/mm/numa_64.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/arch/x86/mm/numa_64.c b/arch/x86/mm/numa_64.c index 8ce617735900..1bd6de4aa714 100644 --- a/arch/x86/mm/numa_64.c +++ b/arch/x86/mm/numa_64.c @@ -371,11 +371,13 @@ static void __init numa_reset_distance(void) { size_t size; - size = numa_distance_cnt * sizeof(numa_distance[0]); - memblock_x86_free_range(__pa(numa_distance), - __pa(numa_distance) + size); + if (numa_distance_cnt) { + size = numa_distance_cnt * sizeof(numa_distance[0]); + memblock_x86_free_range(__pa(numa_distance), + __pa(numa_distance) + size); + numa_distance_cnt = 0; + } numa_distance = NULL; - numa_distance_cnt = 0; } /* From 6d496f9f232790d44144f3784856290e0b27b8f3 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Thu, 17 Feb 2011 14:53:20 +0100 Subject: [PATCH 301/706] x86-64, NUMA: Put dummy_numa_init() in the init section dummy_numa_init() is used only during system boot. Put it in .init like other NUMA init functions. - tj: Description update. Signed-off-by: Yinghai Lu Signed-off-by: Tejun Heo --- arch/x86/mm/numa_64.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/mm/numa_64.c b/arch/x86/mm/numa_64.c index 1bd6de4aa714..f6d85e380471 100644 --- a/arch/x86/mm/numa_64.c +++ b/arch/x86/mm/numa_64.c @@ -899,7 +899,7 @@ static bool __init numa_emulation(void) } #endif /* CONFIG_NUMA_EMU */ -static int dummy_numa_init(void) +static int __init dummy_numa_init(void) { printk(KERN_INFO "%s\n", numa_off ? "NUMA turned off" : "No NUMA configuration found"); From fec9cbd15b9e99bab9bc50f1ed7e20a1087d7c6d Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Thu, 17 Feb 2011 10:37:23 -0200 Subject: [PATCH 302/706] perf hists: Print number of samples, not the period sum So that we match the header where we state the number of events with the "Samples" column when using 'perf report -n/--show-nr-samples': [root@emilia ~]# perf record -a sleep 1 [ perf record: Woken up 1 times to write data ] [ perf record: Captured and wrote 0.111 MB perf.data (~4860 samples) ] [root@emilia ~]# perf report --stdio --show-nr-samples # Events: 11 cycles # # Overhead Samples Command Shared Object Symbol # ........ 
.......... ........... .................. ............................ # 16.65% 1 sleep [kernel.kallsyms] [k] unmap_vmas 16.10% 1 perf libpthread-2.12.so [.] __pthread_cleanup_push_defer 15.79% 2 perf [kernel.kallsyms] [k] format_decode 12.88% 1 kworker/1:2 [kernel.kallsyms] [k] cache_reap 10.69% 1 swapper [kernel.kallsyms] [k] _raw_spin_lock 7.55% 1 sleep [kernel.kallsyms] [k] prepare_exec_creds 6.00% 1 perf [jbd2] [k] start_this_handle 5.29% 1 perf [kernel.kallsyms] [k] seq_read 4.75% 1 perf [kernel.kallsyms] [k] get_pid_task 4.30% 1 perf [kernel.kallsyms] [k] _raw_spin_unlock_irqrestore # # (For a higher level overview, try: perf report --sort comm,dso) # [root@emilia ~]# Reported-by: Stephane Eranian Acked-by: Stephane Eranian Cc: Frederic Weisbecker Cc: Ingo Molnar Cc: Mike Galbraith Cc: Paul Mackerras Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Tom Zanussi LKML-Reference: Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/hist.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/tools/perf/util/hist.c b/tools/perf/util/hist.c index 3f437236f193..da2899e8c6f8 100644 --- a/tools/perf/util/hist.c +++ b/tools/perf/util/hist.c @@ -591,6 +591,7 @@ int hist_entry__snprintf(struct hist_entry *self, char *s, size_t size, { struct sort_entry *se; u64 period, total, period_sys, period_us, period_guest_sys, period_guest_us; + u64 nr_events; const char *sep = symbol_conf.field_sep; int ret; @@ -599,6 +600,7 @@ int hist_entry__snprintf(struct hist_entry *self, char *s, size_t size, if (pair_hists) { period = self->pair ? self->pair->period : 0; + nr_events = self->pair ? self->pair->nr_events : 0; total = pair_hists->stats.total_period; period_sys = self->pair ? self->pair->period_sys : 0; period_us = self->pair ? self->pair->period_us : 0; @@ -606,6 +608,7 @@ int hist_entry__snprintf(struct hist_entry *self, char *s, size_t size, period_guest_us = self->pair ? self->pair->period_guest_us : 0; } else { period = self->period; + nr_events = self->nr_events; total = session_total; period_sys = self->period_sys; period_us = self->period_us; @@ -646,9 +649,9 @@ int hist_entry__snprintf(struct hist_entry *self, char *s, size_t size, if (symbol_conf.show_nr_samples) { if (sep) - ret += snprintf(s + ret, size - ret, "%c%" PRIu64, *sep, period); + ret += snprintf(s + ret, size - ret, "%c%" PRIu64, *sep, nr_events); else - ret += snprintf(s + ret, size - ret, "%11" PRIu64, period); + ret += snprintf(s + ret, size - ret, "%11" PRIu64, nr_events); } if (pair_hists) { From 712a4b6049724278121d56aba683151d86c8c35a Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Thu, 17 Feb 2011 12:18:42 -0200 Subject: [PATCH 303/706] perf record: Delay setting the header writing atexit call While testing the --filter option I noticed that we were writing lots of unneeded stuff to the perf.data header when the filter ioctl fails, so move the atexit(atexit_header) call to after we create the counters successfully. 
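[ note: the reordering matters because atexit() handlers run even when a
later setup step fails, so registering the header writer before creating
the counters meant a failed filter ioctl still flushed header state on
exit.  A minimal sketch of the pattern; create_counters() is a hypothetical
stand-in for the event-creation code in builtin-record.c: ]

	#include <stdlib.h>
	#include <stdio.h>

	static void atexit_header(void)	/* stand-in: flushes the perf.data header */
	{
		puts("writing header");
	}

	static int create_counters(void)	/* hypothetical setup step */
	{
		return 0;			/* -1 on failure */
	}

	static int record_init(void)
	{
		if (create_counters() < 0)
			return -1;	/* fail early: no handler registered yet */

		atexit(atexit_header);	/* register only after setup succeeded */
		return 0;
	}
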
Cc: Frederic Weisbecker Cc: Ingo Molnar Cc: Mike Galbraith Cc: Paul Mackerras Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Tom Zanussi LKML-Reference: Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-record.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/tools/perf/builtin-record.c b/tools/perf/builtin-record.c index a4aaadcb4c8b..db4cd1e7b51a 100644 --- a/tools/perf/builtin-record.c +++ b/tools/perf/builtin-record.c @@ -538,11 +538,6 @@ static int __cmd_record(int argc, const char **argv) if (have_tracepoints(&evsel_list->entries)) perf_header__set_feat(&session->header, HEADER_TRACE_INFO); - /* - * perf_session__delete(session) will be called at atexit_header() - */ - atexit(atexit_header); - if (forks) { child_pid = fork(); if (child_pid < 0) { @@ -601,6 +596,11 @@ static int __cmd_record(int argc, const char **argv) perf_session__set_sample_type(session, sample_type); + /* + * perf_session__delete(session) will be called at atexit_header() + */ + atexit(atexit_header); + if (pipe_output) { err = perf_header__write_pipe(output); if (err < 0) From 74cfc17dc1a69c37ce6c8a76c1ce84bcb796eb0e Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Thu, 17 Feb 2011 14:40:46 -0200 Subject: [PATCH 304/706] perf report: Tell the user when a perf.data file has no samples [root@emilia ~]# perf report --stdio The perf.data file has no samples! [root@emilia ~]# The TUI shows a popup warning message with the same message. Reported-by: Ingo Molnar Cc: Frederic Weisbecker Cc: Ingo Molnar Cc: Mike Galbraith Cc: Paul Mackerras Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Steven Rostedt Cc: Tom Zanussi LKML-Reference: Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-report.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/tools/perf/builtin-report.c b/tools/perf/builtin-report.c index f9a99a1ce609..dddcc7ea2bec 100644 --- a/tools/perf/builtin-report.c +++ b/tools/perf/builtin-report.c @@ -350,6 +350,12 @@ static int __cmd_report(void) perf_session__fprintf_dsos(session, stdout); next = rb_first(&session->hists_tree); + + if (next == NULL) { + ui__warning("The %s file has no samples!\n", input_name); + goto out_delete; + } + while (next) { struct hists *hists; From 668b8788f497b2386402daeca583d6300240d41d Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Thu, 17 Feb 2011 15:38:58 -0200 Subject: [PATCH 305/706] perf list: Allow filtering list of events The man page has the details, here are some examples: [root@emilia ~]# perf list *fault* *:*wait* List of pre-defined events (to be used in -e): page-faults OR faults [Software event] minor-faults [Software event] major-faults [Software event] alignment-faults [Software event] emulation-faults [Software event] radeon:radeon_fence_wait_begin [Tracepoint event] radeon:radeon_fence_wait_end [Tracepoint event] writeback:wbc_writeback_wait [Tracepoint event] writeback:wbc_balance_dirty_wait [Tracepoint event] writeback:writeback_congestion_wait [Tracepoint event] writeback:writeback_wait_iff_congested [Tracepoint event] sched:sched_wait_task [Tracepoint event] sched:sched_process_wait [Tracepoint event] sched:sched_stat_wait [Tracepoint event] sched:sched_stat_iowait [Tracepoint event] syscalls:sys_enter_epoll_wait [Tracepoint event] syscalls:sys_exit_epoll_wait [Tracepoint event] syscalls:sys_enter_epoll_pwait [Tracepoint event] syscalls:sys_exit_epoll_pwait [Tracepoint event] syscalls:sys_enter_rt_sigtimedwait [Tracepoint event] syscalls:sys_exit_rt_sigtimedwait [Tracepoint event] 
syscalls:sys_enter_waitid [Tracepoint event] syscalls:sys_exit_waitid [Tracepoint event] syscalls:sys_enter_wait4 [Tracepoint event] syscalls:sys_exit_wait4 [Tracepoint event] syscalls:sys_enter_waitpid [Tracepoint event] syscalls:sys_exit_waitpid [Tracepoint event] [root@emilia ~]# Suggested-by: Ingo Molnar Cc: Frederic Weisbecker Cc: Ingo Molnar Cc: Mike Galbraith Cc: Paul Mackerras Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Tom Zanussi LKML-Reference: Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/Documentation/perf-list.txt | 23 ++++++- tools/perf/builtin-list.c | 43 +++++++++++- tools/perf/util/parse-events.c | 94 +++++++++++++++++++++----- tools/perf/util/parse-events.h | 5 +- 4 files changed, 142 insertions(+), 23 deletions(-) diff --git a/tools/perf/Documentation/perf-list.txt b/tools/perf/Documentation/perf-list.txt index 399751befeed..7a527f7e9da9 100644 --- a/tools/perf/Documentation/perf-list.txt +++ b/tools/perf/Documentation/perf-list.txt @@ -8,7 +8,7 @@ perf-list - List all symbolic event types SYNOPSIS -------- [verse] -'perf list' +'perf list' [hw|sw|cache|tracepoint|event_glob] DESCRIPTION ----------- @@ -63,7 +63,26 @@ details. Some of them are referenced in the SEE ALSO section below. OPTIONS ------- -None + +Without options all known events will be listed. + +To limit the list use: + +. 'hw' or 'hardware' to list hardware events such as cache-misses, etc. + +. 'sw' or 'software' to list software events such as context switches, etc. + +. 'cache' or 'hwcache' to list hardware cache events such as L1-dcache-loads, etc. + +. 'tracepoint' to list all tracepoint events, alternatively use + 'subsys_glob:event_glob' to filter by tracepoint subsystems such as sched, + block, etc. + +. If none of the above is matched, it will apply the supplied glob to all + events, printing the ones that match. + +One or more types can be used at the same time, listing the events for the +types specified. 
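[ note: putting the new syntax together, a few invocations implied by the
documentation above and the commit message example (output elided): ]

	$ perf list hw			# hardware events only
	$ perf list sw tracepoint	# software events, then all tracepoints
	$ perf list sched:*		# tracepoints in the sched subsystem
	$ perf list *fault*		# any event whose name matches the glob
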
SEE ALSO -------- diff --git a/tools/perf/builtin-list.c b/tools/perf/builtin-list.c index d88c6961274c..6313b6eb3ebb 100644 --- a/tools/perf/builtin-list.c +++ b/tools/perf/builtin-list.c @@ -5,6 +5,7 @@ * * Copyright (C) 2009, Thomas Gleixner * Copyright (C) 2008-2009, Red Hat Inc, Ingo Molnar + * Copyright (C) 2011, Red Hat Inc, Arnaldo Carvalho de Melo */ #include "builtin.h" @@ -13,9 +14,47 @@ #include "util/parse-events.h" #include "util/cache.h" -int cmd_list(int argc __used, const char **argv __used, const char *prefix __used) +int cmd_list(int argc, const char **argv, const char *prefix __used) { setup_pager(); - print_events(); + + if (argc == 1) + print_events(NULL); + else { + int i; + + for (i = 1; i < argc; ++i) { + if (i > 1) + putchar('\n'); + if (strncmp(argv[i], "tracepoint", 10) == 0) + print_tracepoint_events(NULL, NULL); + else if (strcmp(argv[i], "hw") == 0 || + strcmp(argv[i], "hardware") == 0) + print_events_type(PERF_TYPE_HARDWARE); + else if (strcmp(argv[i], "sw") == 0 || + strcmp(argv[i], "software") == 0) + print_events_type(PERF_TYPE_SOFTWARE); + else if (strcmp(argv[i], "cache") == 0 || + strcmp(argv[i], "hwcache") == 0) + print_hwcache_events(NULL); + else { + char *sep = strchr(argv[i], ':'), *s; + int sep_idx; + + if (sep == NULL) { + print_events(argv[i]); + continue; + } + sep_idx = sep - argv[i]; + s = strdup(argv[i]); + if (s == NULL) + return -1; + + s[sep_idx] = '\0'; + print_tracepoint_events(s, s + sep_idx + 1); + free(s); + } + } + } return 0; } diff --git a/tools/perf/util/parse-events.c b/tools/perf/util/parse-events.c index 80a3dd5ef573..54a7e2634d58 100644 --- a/tools/perf/util/parse-events.c +++ b/tools/perf/util/parse-events.c @@ -858,7 +858,7 @@ static const char * const event_type_descriptors[] = { * Print the events from /tracing/events */ -static void print_tracepoint_events(void) +void print_tracepoint_events(const char *subsys_glob, const char *event_glob) { DIR *sys_dir, *evt_dir; struct dirent *sys_next, *evt_next, sys_dirent, evt_dirent; @@ -873,6 +873,9 @@ static void print_tracepoint_events(void) return; for_each_subsystem(sys_dir, sys_dirent, sys_next) { + if (subsys_glob != NULL && + !strglobmatch(sys_dirent.d_name, subsys_glob)) + continue; snprintf(dir_path, MAXPATHLEN, "%s/%s", debugfs_path, sys_dirent.d_name); @@ -881,6 +884,10 @@ static void print_tracepoint_events(void) continue; for_each_event(sys_dirent, evt_dir, evt_dirent, evt_next) { + if (event_glob != NULL && + !strglobmatch(evt_dirent.d_name, event_glob)) + continue; + snprintf(evt_path, MAXPATHLEN, "%s:%s", sys_dirent.d_name, evt_dirent.d_name); printf(" %-42s [%s]\n", evt_path, @@ -932,13 +939,61 @@ int is_valid_tracepoint(const char *event_string) return 0; } +void print_events_type(u8 type) +{ + struct event_symbol *syms = event_symbols; + unsigned int i; + char name[64]; + + for (i = 0; i < ARRAY_SIZE(event_symbols); i++, syms++) { + if (type != syms->type) + continue; + + if (strlen(syms->alias)) + snprintf(name, sizeof(name), "%s OR %s", + syms->symbol, syms->alias); + else + snprintf(name, sizeof(name), "%s", syms->symbol); + + printf(" %-42s [%s]\n", name, + event_type_descriptors[type]); + } +} + +int print_hwcache_events(const char *event_glob) +{ + unsigned int type, op, i, printed = 0; + + for (type = 0; type < PERF_COUNT_HW_CACHE_MAX; type++) { + for (op = 0; op < PERF_COUNT_HW_CACHE_OP_MAX; op++) { + /* skip invalid cache type */ + if (!is_cache_op_valid(type, op)) + continue; + + for (i = 0; i < PERF_COUNT_HW_CACHE_RESULT_MAX; i++) { + char *name = 
event_cache_name(type, op, i); + + if (event_glob != NULL && + !strglobmatch(name, event_glob)) + continue; + + printf(" %-42s [%s]\n", name, + event_type_descriptors[PERF_TYPE_HW_CACHE]); + ++printed; + } + } + } + + return printed; +} + /* * Print the help text for the event symbols: */ -void print_events(void) +void print_events(const char *event_glob) { struct event_symbol *syms = event_symbols; - unsigned int i, type, op, prev_type = -1; + unsigned int i, type, prev_type = -1, printed = 0, ntypes_printed = 0; char name[40]; printf("\n"); @@ -947,8 +1002,16 @@ void print_events(void) for (i = 0; i < ARRAY_SIZE(event_symbols); i++, syms++) { type = syms->type; - if (type != prev_type) + if (type != prev_type && printed) { printf("\n"); + printed = 0; + ntypes_printed++; + } + + if (event_glob != NULL && + !(strglobmatch(syms->symbol, event_glob) || + (syms->alias && strglobmatch(syms->alias, event_glob)))) + continue; if (strlen(syms->alias)) sprintf(name, "%s OR %s", syms->symbol, syms->alias); @@ -958,22 +1021,17 @@ void print_events(void) event_type_descriptors[type]); prev_type = type; + ++printed; } - printf("\n"); - for (type = 0; type < PERF_COUNT_HW_CACHE_MAX; type++) { - for (op = 0; op < PERF_COUNT_HW_CACHE_OP_MAX; op++) { - /* skip invalid cache type */ - if (!is_cache_op_valid(type, op)) - continue; - - for (i = 0; i < PERF_COUNT_HW_CACHE_RESULT_MAX; i++) { - printf(" %-42s [%s]\n", - event_cache_name(type, op, i), - event_type_descriptors[PERF_TYPE_HW_CACHE]); - } - } + if (ntypes_printed) { + printed = 0; + printf("\n"); } + print_hwcache_events(event_glob); + + if (event_glob != NULL) + return; printf("\n"); printf(" %-42s [%s]\n", @@ -986,7 +1044,7 @@ void print_events(void) event_type_descriptors[PERF_TYPE_BREAKPOINT]); printf("\n"); - print_tracepoint_events(); + print_tracepoint_events(NULL, NULL); exit(129); } diff --git a/tools/perf/util/parse-events.h b/tools/perf/util/parse-events.h index cf7e94abb676..212f88e07a9c 100644 --- a/tools/perf/util/parse-events.h +++ b/tools/perf/util/parse-events.h @@ -28,7 +28,10 @@ extern int parse_filter(const struct option *opt, const char *str, int unset); #define EVENTS_HELP_MAX (128*1024) -extern void print_events(void); +void print_events(const char *event_glob); +void print_events_type(u8 type); +void print_tracepoint_events(const char *subsys_glob, const char *event_glob); +int print_hwcache_events(const char *event_glob); extern int is_valid_tracepoint(const char *event_string); extern char debugfs_path[]; From db2e2e6ee9ee9ce93b04c6975fdfef304771d6ad Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 24 Jan 2011 15:43:03 +0100 Subject: [PATCH 306/706] xen-pcifront: don't use flush_scheduled_work() flush_scheduled_work() is scheduled for deprecation. Cancel ->op_work directly instead. 
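[ note: the difference, for readers new to the workqueue API:
flush_scheduled_work() drains every item on the shared system workqueue,
which is needlessly broad and can deadlock if the caller holds a lock that
some unrelated work item also needs, while cancel_work_sync() stops just
the one item and waits for it if it is currently running.  The substitution
from the patch below, in isolation: ]

	/* before: serialize against the entire system workqueue */
	flush_scheduled_work();

	/* after: cancel only the AER work item this device owns */
	cancel_work_sync(&pdev->op_work);
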
Signed-off-by: Tejun Heo Cc: Ryan Wilson Cc: Jan Beulich Cc: Jesse Barnes Signed-off-by: Konrad Rzeszutek Wilk --- drivers/pci/xen-pcifront.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/pci/xen-pcifront.c b/drivers/pci/xen-pcifront.c index 3a5a6fcc0ead..030ce3743439 100644 --- a/drivers/pci/xen-pcifront.c +++ b/drivers/pci/xen-pcifront.c @@ -733,8 +733,7 @@ static void free_pdev(struct pcifront_device *pdev) pcifront_free_roots(pdev); - /*For PCIE_AER error handling job*/ - flush_scheduled_work(); + cancel_work_sync(&pdev->op_work); if (pdev->irq >= 0) unbind_from_irqhandler(pdev->irq, pdev); From e9345aab675382176740bc8a2c6d3caf1510e46d Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 18 Feb 2011 08:09:49 +0100 Subject: [PATCH 307/706] Revert "tracing: Add unstable sched clock note to the warning" This reverts commit 5e38ca8f3ea423442eaafe1b7e206084aa38120a. Breaks the build of several !CONFIG_HAVE_UNSTABLE_SCHED_CLOCK architectures. Cc: Jiri Olsa Cc: Steven Rostedt Message-ID: <20110217171823.GB17058@elte.hu> Signed-off-by: Ingo Molnar --- kernel/trace/ring_buffer.c | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 7739893a1d0a..bd1c35a4fbcc 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -2163,14 +2163,10 @@ rb_reserve_next_event(struct ring_buffer *buffer, delta = diff; if (unlikely(test_time_stamp(delta))) { WARN_ONCE(delta > (1ULL << 59), - KERN_WARNING "Delta way too big! %llu ts=%llu write stamp = %llu\n%s", + KERN_WARNING "Delta way too big! %llu ts=%llu write stamp = %llu\n", (unsigned long long)delta, (unsigned long long)ts, - (unsigned long long)cpu_buffer->write_stamp, - sched_clock_stable ? "" : - "If you just came from a suspend/resume,\n" - "please switch to the trace global clock:\n" - " echo global > /sys/kernel/debug/tracing/trace_clock\n"); + (unsigned long long)cpu_buffer->write_stamp); add_timestamp = 1; } } From bb3e6251a69e67d7620373ee18e35b404964273e Mon Sep 17 00:00:00 2001 From: Jan Beulich Date: Thu, 17 Feb 2011 15:47:37 +0000 Subject: [PATCH 308/706] x86: Don't call dump_stack() from arch_trigger_all_cpu_backtrace_handler() show_regs() already prints two(!) stack traces, no need for a third one. Signed-off-by: Jan Beulich LKML-Reference: <4D5D512902000078000326EE@vpn.id2.novell.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/apic/hw_nmi.c | 1 - 1 file changed, 1 deletion(-) diff --git a/arch/x86/kernel/apic/hw_nmi.c b/arch/x86/kernel/apic/hw_nmi.c index 79fd43ca6f96..c4e557a1ebb6 100644 --- a/arch/x86/kernel/apic/hw_nmi.c +++ b/arch/x86/kernel/apic/hw_nmi.c @@ -83,7 +83,6 @@ arch_trigger_all_cpu_backtrace_handler(struct notifier_block *self, arch_spin_lock(&lock); printk(KERN_WARNING "NMI backtrace for cpu %d\n", cpu); show_regs(regs); - dump_stack(); arch_spin_unlock(&lock); cpumask_clear_cpu(cpu, to_cpumask(backtrace_mask)); return NOTIFY_STOP; From fd8fa4d3ddc4cc04ec8097e632b995d535c52beb Mon Sep 17 00:00:00 2001 From: Jan Beulich Date: Thu, 17 Feb 2011 15:56:58 +0000 Subject: [PATCH 309/706] x86: Combine printk()s in show_regs_common() Printing a single character alone when there's an immediately following printk() is pretty pointless (and wasteful). 
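[Editor's illustration, not part of the patch: a minimal sketch of the consolidation, using a hypothetical function that mirrors the show_regs_common() change below. Each printk() call is a separate log operation, so adjacent KERN_CONT fragments are cheaper folded into one format string.]

#include <linux/printk.h>

static void print_board(const char *vendor, const char *product,
			const char *board)
{
	/* was: printk(KERN_CONT " "); printk(KERN_CONT "%s %s", ...); */
	printk(KERN_CONT " %s %s", vendor, product);
	if (board)
		printk(KERN_CONT "/%s", board);	/* "/" folded into the format */
	printk(KERN_CONT "\n");
}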
Signed-off-by: Jan Beulich LKML-Reference: <4D5D535A0200007800032730@vpn.id2.novell.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/process.c | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index ff4554198981..99fa3adf0141 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -110,12 +110,9 @@ void show_regs_common(void) init_utsname()->release, (int)strcspn(init_utsname()->version, " "), init_utsname()->version); - printk(KERN_CONT " "); - printk(KERN_CONT "%s %s", vendor, product); - if (board) { - printk(KERN_CONT "/"); - printk(KERN_CONT "%s", board); - } + printk(KERN_CONT " %s %s", vendor, product); + if (board) + printk(KERN_CONT "/%s", board); printk(KERN_CONT "\n"); } From 02ca752e4181e219e243cd61a60dd1da47251f11 Mon Sep 17 00:00:00 2001 From: Jan Beulich Date: Thu, 17 Feb 2011 15:51:40 +0000 Subject: [PATCH 310/706] x86: Remove die_nmi() With no caller left, the function and the DIE_NMIWATCHDOG enumerator can both go away. Signed-off-by: Jan Beulich Cc: Peter Zijlstra Cc: Don Zickus LKML-Reference: <4D5D521C0200007800032702@vpn.id2.novell.com> Signed-off-by: Ingo Molnar --- arch/x86/include/asm/kdebug.h | 1 - arch/x86/include/asm/nmi.h | 1 - arch/x86/kernel/dumpstack.c | 25 ------------------------- arch/x86/kernel/kgdb.c | 9 --------- 4 files changed, 36 deletions(-) diff --git a/arch/x86/include/asm/kdebug.h b/arch/x86/include/asm/kdebug.h index ca242d35e873..518bbbb9ee59 100644 --- a/arch/x86/include/asm/kdebug.h +++ b/arch/x86/include/asm/kdebug.h @@ -13,7 +13,6 @@ enum die_val { DIE_PANIC, DIE_NMI, DIE_DIE, - DIE_NMIWATCHDOG, DIE_KERNELDEBUG, DIE_TRAP, DIE_GPF, diff --git a/arch/x86/include/asm/nmi.h b/arch/x86/include/asm/nmi.h index c76f5b92b840..07f46016d3ff 100644 --- a/arch/x86/include/asm/nmi.h +++ b/arch/x86/include/asm/nmi.h @@ -7,7 +7,6 @@ #ifdef CONFIG_X86_LOCAL_APIC -extern void die_nmi(char *str, struct pt_regs *regs, int do_panic); extern int avail_to_resrv_perfctr_nmi_bit(unsigned int); extern int reserve_perfctr_nmi(unsigned int); extern void release_perfctr_nmi(unsigned int); diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c index df20723a6a1b..220a1c11cfde 100644 --- a/arch/x86/kernel/dumpstack.c +++ b/arch/x86/kernel/dumpstack.c @@ -320,31 +320,6 @@ void die(const char *str, struct pt_regs *regs, long err) oops_end(flags, regs, sig); } -void notrace __kprobes -die_nmi(char *str, struct pt_regs *regs, int do_panic) -{ - unsigned long flags; - - if (notify_die(DIE_NMIWATCHDOG, str, regs, 0, 2, SIGINT) == NOTIFY_STOP) - return; - - /* - * We are in trouble anyway, lets at least try - * to get a message out. 
- */ - flags = oops_begin(); - printk(KERN_EMERG "%s", str); - printk(" on CPU%d, ip %08lx, registers:\n", - smp_processor_id(), regs->ip); - show_registers(regs); - oops_end(flags, regs, 0); - if (do_panic || panic_on_oops) - panic("Non maskable interrupt"); - nmi_exit(); - local_irq_enable(); - do_exit(SIGBUS); -} - static int __init oops_setup(char *s) { if (!s) diff --git a/arch/x86/kernel/kgdb.c b/arch/x86/kernel/kgdb.c index a4130005028a..7c64c420a9f6 100644 --- a/arch/x86/kernel/kgdb.c +++ b/arch/x86/kernel/kgdb.c @@ -533,15 +533,6 @@ static int __kgdb_notify(struct die_args *args, unsigned long cmd) } return NOTIFY_DONE; - case DIE_NMIWATCHDOG: - if (atomic_read(&kgdb_active) != -1) { - /* KGDB CPU roundup: */ - kgdb_nmicallback(raw_smp_processor_id(), regs); - return NOTIFY_STOP; - } - /* Enter debugger: */ - break; - case DIE_DEBUG: if (atomic_read(&kgdb_cpu_doing_single_step) != -1) { if (user_mode(regs)) From 58bff947e2d164c7e5cbf7f485e4b3d4884befeb Mon Sep 17 00:00:00 2001 From: Jan Beulich Date: Thu, 17 Feb 2011 15:54:26 +0000 Subject: [PATCH 311/706] x86: Eliminate pointless adjustment attempts in fixup_irqs() Not only when an IRQ's affinity equals cpu_online_mask is there no need to actually try to adjust the affinity, but also when it's a subset thereof. This particularly avoids adjustment attempts during system shutdown to any IRQs bound to CPU#0. Signed-off-by: Jan Beulich Cc: Thomas Gleixner Cc: Eric W. Biederman Cc: Suresh Siddha Cc: Gary Hade LKML-Reference: <4D5D52C2020000780003272C@vpn.id2.novell.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/irq.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c index 387b6a0c9e81..78793efd3180 100644 --- a/arch/x86/kernel/irq.c +++ b/arch/x86/kernel/irq.c @@ -310,7 +310,7 @@ void fixup_irqs(void) data = &desc->irq_data; affinity = data->affinity; if (!irq_has_action(irq) || - cpumask_equal(affinity, cpu_online_mask)) { + cpumask_subset(affinity, cpu_online_mask)) { raw_spin_unlock(&desc->lock); continue; } From 006cdc32618e09ffe228a7a86af044f3cc0dd714 Mon Sep 17 00:00:00 2001 From: Michael Witten Date: Wed, 2 Feb 2011 13:01:41 -0600 Subject: [PATCH 312/706] perf tools: Makefile: Remove vestigial git-specific cruft This commit squashes several commits that remove: NO_SYMLINK_HEAD NO_SVN_TESTS NO_FAST_WORKING_DIRECTORY USE_STDEV SHA1/SSL cruft makefile rules Signed-off-by: Michael Witten LKML-Reference: Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/Makefile | 62 +-------------------------------------------- 1 file changed, 1 insertion(+), 61 deletions(-) diff --git a/tools/perf/Makefile b/tools/perf/Makefile index 7c75f1d45f59..544367cf6d58 100644 --- a/tools/perf/Makefile +++ b/tools/perf/Makefile @@ -21,9 +21,6 @@ endif # Define FREAD_READS_DIRECTORIES if your are on a system which succeeds # when attempting to read from an fopen'ed directory. # -# Define NO_OPENSSL environment variable if you do not have OpenSSL. -# This also implies MOZILLA_SHA1. -# # Define CURLDIR=/foo/bar if your curl header and library files are in # /foo/bar/include and /foo/bar/lib directories. # @@ -56,13 +53,6 @@ endif # # Define NO_SYS_SELECT_H if you don't have sys/select.h. # -# Define NO_SYMLINK_HEAD if you never want .perf/HEAD to be a symbolic link. -# Enable it on Windows. By default, symrefs are still used. -# -# Define NO_SVN_TESTS if you want to skip time-consuming SVN interoperability -# tests. 
These tests take up a significant amount of the total test time -# but are not needed unless you plan to talk to SVN repos. -# # Define NO_FINK if you are building on Darwin/Mac OS X, have Fink # installed in /sw, but don't want PERF to link against any libraries # installed there. If defined you may specify your own (or Fink's) @@ -75,19 +65,6 @@ endif # specify your own (or DarwinPort's) include directories and # library directories by defining CFLAGS and LDFLAGS appropriately. # -# Define PPC_SHA1 environment variable when running make to make use of -# a bundled SHA1 routine optimized for PowerPC. -# -# Define ARM_SHA1 environment variable when running make to make use of -# a bundled SHA1 routine optimized for ARM. -# -# Define MOZILLA_SHA1 environment variable when running make to make use of -# a bundled SHA1 routine coming from Mozilla. It is GPL'd and should be fast -# on non-x86 architectures (e.g. PowerPC), while the OpenSSL version (default -# choice) has very fast version optimized for i586. -# -# Define NEEDS_SSL_WITH_CRYPTO if you need -lcrypto with -lssl (Darwin). -# # Define NEEDS_LIBICONV if linking with libc is not enough (Darwin). # # Define NEEDS_SOCKET if linking with libc is not enough (SunOS, @@ -100,9 +77,6 @@ endif # Define NO_PREAD if you have a problem with pread() system call (e.g. # cygwin.dll before v1.5.22). # -# Define NO_FAST_WORKING_DIRECTORY if accessing objects in pack files is -# generally faster on your platform than accessing the working directory. -# # Define NO_TRUSTABLE_FILEMODE if your filesystem may claim to support # the executable mode bit, but doesn't really do so. # @@ -134,9 +108,6 @@ endif # Define NO_NSEC if your "struct stat" does not have "st_ctim.tv_nsec" # available. This automatically turns USE_NSEC off. # -# Define USE_STDEV below if you want perf to care about the underlying device -# change being considered an inode change from the update-index perspective. -# # Define NO_ST_BLOCKS_IN_STRUCT_STAT if your platform does not have st_blocks # field that counts the on-disk footprint in 512-byte blocks. 
# @@ -771,9 +742,6 @@ ifdef FREAD_READS_DIRECTORIES COMPAT_CFLAGS += -DFREAD_READS_DIRECTORIES COMPAT_OBJS += $(OUTPUT)compat/fopen.o endif -ifdef NO_SYMLINK_HEAD - BASIC_CFLAGS += -DNO_SYMLINK_HEAD -endif ifdef NO_STRCASESTR COMPAT_CFLAGS += -DNO_STRCASESTR COMPAT_OBJS += $(OUTPUT)compat/strcasestr.o @@ -813,9 +781,6 @@ ifdef NO_PREAD COMPAT_CFLAGS += -DNO_PREAD COMPAT_OBJS += $(OUTPUT)compat/pread.o endif -ifdef NO_FAST_WORKING_DIRECTORY - BASIC_CFLAGS += -DNO_FAST_WORKING_DIRECTORY -endif ifdef NO_TRUSTABLE_FILEMODE BASIC_CFLAGS += -DNO_TRUSTABLE_FILEMODE endif @@ -851,23 +816,6 @@ ifdef NO_DEFLATE_BOUND BASIC_CFLAGS += -DNO_DEFLATE_BOUND endif -ifdef PPC_SHA1 - SHA1_HEADER = "ppc/sha1.h" - LIB_OBJS += $(OUTPUT)ppc/sha1.o ppc/sha1ppc.o -else -ifdef ARM_SHA1 - SHA1_HEADER = "arm/sha1.h" - LIB_OBJS += $(OUTPUT)arm/sha1.o $(OUTPUT)arm/sha1_arm.o -else -ifdef MOZILLA_SHA1 - SHA1_HEADER = "mozilla-sha1/sha1.h" - LIB_OBJS += $(OUTPUT)mozilla-sha1/sha1.o -else - SHA1_HEADER = - EXTLIBS += $(LIB_4_CRYPTO) -endif -endif -endif ifdef NO_PERL_MAKEMAKER export NO_PERL_MAKEMAKER endif @@ -930,7 +878,6 @@ endif # Shell quote (do not use $(call) to accommodate ancient setups); -SHA1_HEADER_SQ = $(subst ','\'',$(SHA1_HEADER)) ETC_PERFCONFIG_SQ = $(subst ','\'',$(ETC_PERFCONFIG)) DESTDIR_SQ = $(subst ','\'',$(DESTDIR)) @@ -948,8 +895,7 @@ PERL_PATH_SQ = $(subst ','\'',$(PERL_PATH)) LIBS = -Wl,--whole-archive $(PERFLIBS) -Wl,--no-whole-archive $(EXTLIBS) -BASIC_CFLAGS += -DSHA1_HEADER='$(SHA1_HEADER_SQ)' \ - $(COMPAT_CFLAGS) +BASIC_CFLAGS += $(COMPAT_CFLAGS) LIB_OBJS += $(COMPAT_OBJS) ALL_CFLAGS += $(BASIC_CFLAGS) @@ -1048,9 +994,6 @@ $(OUTPUT)util/exec_cmd.o: util/exec_cmd.c $(OUTPUT)PERF-CFLAGS '-DPREFIX="$(prefix_SQ)"' \ $< -$(OUTPUT)builtin-init-db.o: builtin-init-db.c $(OUTPUT)PERF-CFLAGS - $(QUIET_CC)$(CC) -o $@ -c $(ALL_CFLAGS) -DDEFAULT_PERF_TEMPLATE_DIR='"$(template_dir_SQ)"' $< - $(OUTPUT)util/config.o: util/config.c $(OUTPUT)PERF-CFLAGS $(QUIET_CC)$(CC) -o $@ -c $(ALL_CFLAGS) -DETC_PERFCONFIG='"$(ETC_PERFCONFIG_SQ)"' $< @@ -1089,7 +1032,6 @@ $(OUTPUT)perf-%$X: %.o $(PERFLIBS) $(LIB_OBJS) $(BUILTIN_OBJS): $(LIB_H) $(patsubst perf-%$X,%.o,$(PROGRAMS)): $(LIB_H) $(wildcard */*.h) -builtin-revert.o wt-status.o: wt-status.h # we compile into subdirectories. if the target directory is not the source directory, they might not exists. So # we depend the various files onto their directories. @@ -1192,8 +1134,6 @@ all:: $(TEST_PROGRAMS) # However, the environment gets quite big, and some programs have problems # with that. -export NO_SVN_TESTS - check: $(OUTPUT)common-cmds.h if sparse; \ then \ From 8796cb9d7dc028945af4b2ea858ae8f8f2ecbe8c Mon Sep 17 00:00:00 2001 From: Michael Witten Date: Wed, 2 Feb 2011 11:57:41 -0600 Subject: [PATCH 313/706] perf tools: Makefile: Remove platform-specific cruft While it makes sense that this tool could be used on other platforms at least to parse data, there doesn't appear to be any real support for such usage. 
This commit squashes several commits that remove: SNPRINTF_RETURNS_BOGUS FREAD_READS_DIRECTORIES NO_D_{INO,TYPE}_IN_DIRENT NO_STRCASESTR NO_MEMMEM NO_STRTOUMAX and NO_STRTOULL NO_SETENV NO_UNSETENV NO_MKDTEMP NEEDS_LIBICONV NEEDS_SOCKET NO_MMAP NO_PTHREADS NO_PREAD NO_TRUSTABLE_FILEMODE NO_IPV6 and NO_SOCKADDR_STORAGE NO_ICONV and OLD_ICONV NO_NSEC, USE_NSEC, and USE_ST_TIMESPEC NO_ST_BLOCKS_IN_STRUCT_STAT NO_FINK and NO_DARWIN_PORTS NO_SYS_SELECT_H NO_HSTRERROR DIR_HAS_BSD_GROUP_SEMANTICS and FORCE_DIR_SET_GID NEEDS_NSL, NO_UINTMAX_T, NO_INET_{N,P}TON COMPAT_{CFLAGS,OBJS} Executable extension `X' Signed-off-by: Michael Witten LKML-Reference: Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/Makefile | 232 ++--------------------------------------- tools/perf/util/util.h | 26 ----- 2 files changed, 8 insertions(+), 250 deletions(-) diff --git a/tools/perf/Makefile b/tools/perf/Makefile index 544367cf6d58..53c1e93ac432 100644 --- a/tools/perf/Makefile +++ b/tools/perf/Makefile @@ -14,103 +14,23 @@ endif # Define V=1 to have a more verbose compile. # Define V=2 to have an even more verbose compile. # -# Define SNPRINTF_RETURNS_BOGUS if your are on a system which snprintf() -# or vsnprintf() return -1 instead of number of characters which would -# have been written to the final string if enough space had been available. -# -# Define FREAD_READS_DIRECTORIES if your are on a system which succeeds -# when attempting to read from an fopen'ed directory. -# # Define CURLDIR=/foo/bar if your curl header and library files are in # /foo/bar/include and /foo/bar/lib directories. # # Define EXPATDIR=/foo/bar if your expat header and library files are in # /foo/bar/include and /foo/bar/lib directories. # -# Define NO_D_INO_IN_DIRENT if you don't have d_ino in your struct dirent. -# -# Define NO_D_TYPE_IN_DIRENT if your platform defines DT_UNKNOWN but lacks -# d_type in struct dirent (latest Cygwin -- will be fixed soonish). -# # Define NO_C99_FORMAT if your formatted IO functions (printf/scanf et.al.) # do not support the 'size specifiers' introduced by C99, namely ll, hh, # j, z, t. (representing long long int, char, intmax_t, size_t, ptrdiff_t). # some C compilers supported these specifiers prior to C99 as an extension. # -# Define NO_STRCASESTR if you don't have strcasestr. -# -# Define NO_MEMMEM if you don't have memmem. -# -# Define NO_STRTOUMAX if you don't have strtoumax in the C library. -# If your compiler also does not support long long or does not have -# strtoull, define NO_STRTOULL. -# -# Define NO_SETENV if you don't have setenv in the C library. -# -# Define NO_UNSETENV if you don't have unsetenv in the C library. -# -# Define NO_MKDTEMP if you don't have mkdtemp in the C library. -# -# Define NO_SYS_SELECT_H if you don't have sys/select.h. -# -# Define NO_FINK if you are building on Darwin/Mac OS X, have Fink -# installed in /sw, but don't want PERF to link against any libraries -# installed there. If defined you may specify your own (or Fink's) -# include directories and library directories by defining CFLAGS -# and LDFLAGS appropriately. -# -# Define NO_DARWIN_PORTS if you are building on Darwin/Mac OS X, -# have DarwinPorts installed in /opt/local, but don't want PERF to -# link against any libraries installed there. If defined you may -# specify your own (or DarwinPort's) include directories and -# library directories by defining CFLAGS and LDFLAGS appropriately. -# -# Define NEEDS_LIBICONV if linking with libc is not enough (Darwin). 
-# -# Define NEEDS_SOCKET if linking with libc is not enough (SunOS, -# Patrick Mauritz). -# -# Define NO_MMAP if you want to avoid mmap. -# -# Define NO_PTHREADS if you do not have or do not want to use Pthreads. -# -# Define NO_PREAD if you have a problem with pread() system call (e.g. -# cygwin.dll before v1.5.22). -# -# Define NO_TRUSTABLE_FILEMODE if your filesystem may claim to support -# the executable mode bit, but doesn't really do so. -# -# Define NO_IPV6 if you lack IPv6 support and getaddrinfo(). -# -# Define NO_SOCKADDR_STORAGE if your platform does not have struct -# sockaddr_storage. -# -# Define NO_ICONV if your libc does not properly support iconv. -# -# Define OLD_ICONV if your library has an old iconv(), where the second -# (input buffer pointer) parameter is declared with type (const char **). -# # Define NO_DEFLATE_BOUND if your zlib does not have deflateBound. # # Define NO_R_TO_GCC_LINKER if your gcc does not like "-R/path/lib" # that tells runtime paths to dynamic libraries; # "-Wl,-rpath=/path/lib" is used instead. # -# Define USE_NSEC below if you want perf to care about sub-second file mtimes -# and ctimes. Note that you need recent glibc (at least 2.2.4) for this, and -# it will BREAK YOUR LOCAL DIFFS! show-diff and anything using it will likely -# randomly break unless your underlying filesystem supports those sub-second -# times (my ext3 doesn't). -# -# Define USE_ST_TIMESPEC if your "struct stat" uses "st_ctimespec" instead of -# "st_ctim" -# -# Define NO_NSEC if your "struct stat" does not have "st_ctim.tv_nsec" -# available. This automatically turns USE_NSEC off. -# -# Define NO_ST_BLOCKS_IN_STRUCT_STAT if your platform does not have st_blocks -# field that counts the on-disk footprint in 512-byte blocks. -# # Define ASCIIDOC8 if you want to format documentation with AsciiDoc 8 # # Define DOCBOOK_XSL_172 if you want to format man pages with DocBook XSL v1.72. @@ -282,8 +202,6 @@ BASIC_LDFLAGS = # Guard against environment variables BUILTIN_OBJS = BUILT_INS = -COMPAT_CFLAGS = -COMPAT_OBJS = LIB_H = LIB_OBJS = PYRF_OBJS = @@ -329,7 +247,7 @@ LANG_BINDINGS = ALL_PROGRAMS = $(PROGRAMS) $(SCRIPTS) # what 'all' will build but not install in perfexecdir -OTHER_PROGRAMS = $(OUTPUT)perf$X +OTHER_PROGRAMS = $(OUTPUT)perf # Set paths to tools early so that they can be used for version tests. 
ifndef SHELL_PATH @@ -538,22 +456,6 @@ endif # NO_DWARF -include arch/$(ARCH)/Makefile -ifeq ($(uname_S),Darwin) - ifndef NO_FINK - ifeq ($(shell test -d /sw/lib && echo y),y) - BASIC_CFLAGS += -I/sw/include - BASIC_LDFLAGS += -L/sw/lib - endif - endif - ifndef NO_DARWIN_PORTS - ifeq ($(shell test -d /opt/local/lib && echo y),y) - BASIC_CFLAGS += -I/opt/local/include - BASIC_LDFLAGS += -L/opt/local/lib - endif - endif - PTHREAD_LIBS = -endif - ifneq ($(OUTPUT),) BASIC_CFLAGS += -I$(OUTPUT) endif @@ -707,110 +609,9 @@ ifndef CC_LD_DYNPATH endif endif -ifdef NEEDS_SOCKET - EXTLIBS += -lsocket -endif -ifdef NEEDS_NSL - EXTLIBS += -lnsl -endif -ifdef NO_D_TYPE_IN_DIRENT - BASIC_CFLAGS += -DNO_D_TYPE_IN_DIRENT -endif -ifdef NO_D_INO_IN_DIRENT - BASIC_CFLAGS += -DNO_D_INO_IN_DIRENT -endif -ifdef NO_ST_BLOCKS_IN_STRUCT_STAT - BASIC_CFLAGS += -DNO_ST_BLOCKS_IN_STRUCT_STAT -endif -ifdef USE_NSEC - BASIC_CFLAGS += -DUSE_NSEC -endif -ifdef USE_ST_TIMESPEC - BASIC_CFLAGS += -DUSE_ST_TIMESPEC -endif -ifdef NO_NSEC - BASIC_CFLAGS += -DNO_NSEC -endif ifdef NO_C99_FORMAT BASIC_CFLAGS += -DNO_C99_FORMAT endif -ifdef SNPRINTF_RETURNS_BOGUS - COMPAT_CFLAGS += -DSNPRINTF_RETURNS_BOGUS - COMPAT_OBJS += $(OUTPUT)compat/snprintf.o -endif -ifdef FREAD_READS_DIRECTORIES - COMPAT_CFLAGS += -DFREAD_READS_DIRECTORIES - COMPAT_OBJS += $(OUTPUT)compat/fopen.o -endif -ifdef NO_STRCASESTR - COMPAT_CFLAGS += -DNO_STRCASESTR - COMPAT_OBJS += $(OUTPUT)compat/strcasestr.o -endif -ifdef NO_STRTOUMAX - COMPAT_CFLAGS += -DNO_STRTOUMAX - COMPAT_OBJS += $(OUTPUT)compat/strtoumax.o -endif -ifdef NO_STRTOULL - COMPAT_CFLAGS += -DNO_STRTOULL -endif -ifdef NO_SETENV - COMPAT_CFLAGS += -DNO_SETENV - COMPAT_OBJS += $(OUTPUT)compat/setenv.o -endif -ifdef NO_MKDTEMP - COMPAT_CFLAGS += -DNO_MKDTEMP - COMPAT_OBJS += $(OUTPUT)compat/mkdtemp.o -endif -ifdef NO_UNSETENV - COMPAT_CFLAGS += -DNO_UNSETENV - COMPAT_OBJS += $(OUTPUT)compat/unsetenv.o -endif -ifdef NO_SYS_SELECT_H - BASIC_CFLAGS += -DNO_SYS_SELECT_H -endif -ifdef NO_MMAP - COMPAT_CFLAGS += -DNO_MMAP - COMPAT_OBJS += $(OUTPUT)compat/mmap.o -else - ifdef USE_WIN32_MMAP - COMPAT_CFLAGS += -DUSE_WIN32_MMAP - COMPAT_OBJS += $(OUTPUT)compat/win32mmap.o - endif -endif -ifdef NO_PREAD - COMPAT_CFLAGS += -DNO_PREAD - COMPAT_OBJS += $(OUTPUT)compat/pread.o -endif -ifdef NO_TRUSTABLE_FILEMODE - BASIC_CFLAGS += -DNO_TRUSTABLE_FILEMODE -endif -ifdef NO_IPV6 - BASIC_CFLAGS += -DNO_IPV6 -endif -ifdef NO_UINTMAX_T - BASIC_CFLAGS += -Duintmax_t=uint32_t -endif -ifdef NO_SOCKADDR_STORAGE -ifdef NO_IPV6 - BASIC_CFLAGS += -Dsockaddr_storage=sockaddr_in -else - BASIC_CFLAGS += -Dsockaddr_storage=sockaddr_in6 -endif -endif -ifdef NO_INET_NTOP - LIB_OBJS += $(OUTPUT)compat/inet_ntop.o -endif -ifdef NO_INET_PTON - LIB_OBJS += $(OUTPUT)compat/inet_pton.o -endif - -ifdef NO_ICONV - BASIC_CFLAGS += -DNO_ICONV -endif - -ifdef OLD_ICONV - BASIC_CFLAGS += -DOLD_ICONV -endif ifdef NO_DEFLATE_BOUND BASIC_CFLAGS += -DNO_DEFLATE_BOUND @@ -819,14 +620,6 @@ endif ifdef NO_PERL_MAKEMAKER export NO_PERL_MAKEMAKER endif -ifdef NO_HSTRERROR - COMPAT_CFLAGS += -DNO_HSTRERROR - COMPAT_OBJS += $(OUTPUT)compat/hstrerror.o -endif -ifdef NO_MEMMEM - COMPAT_CFLAGS += -DNO_MEMMEM - COMPAT_OBJS += $(OUTPUT)compat/memmem.o -endif ifdef INTERNAL_QSORT COMPAT_CFLAGS += -DINTERNAL_QSORT COMPAT_OBJS += $(OUTPUT)compat/qsort.o @@ -835,9 +628,6 @@ ifdef RUNTIME_PREFIX COMPAT_CFLAGS += -DRUNTIME_PREFIX endif -ifdef DIR_HAS_BSD_GROUP_SEMANTICS - COMPAT_CFLAGS += -DDIR_HAS_BSD_GROUP_SEMANTICS -endif ifdef NO_EXTERNAL_GREP BASIC_CFLAGS += 
-DNO_EXTERNAL_GREP endif @@ -895,9 +685,6 @@ PERL_PATH_SQ = $(subst ','\'',$(PERL_PATH)) LIBS = -Wl,--whole-archive $(PERFLIBS) -Wl,--no-whole-archive $(EXTLIBS) -BASIC_CFLAGS += $(COMPAT_CFLAGS) -LIB_OBJS += $(COMPAT_OBJS) - ALL_CFLAGS += $(BASIC_CFLAGS) ALL_CFLAGS += $(ARCH_CFLAGS) ALL_LDFLAGS += $(BASIC_LDFLAGS) @@ -910,9 +697,6 @@ export TAR INSTALL DESTDIR SHELL_PATH SHELL = $(SHELL_PATH) all:: shell_compatibility_test $(ALL_PROGRAMS) $(LANG_BINDINGS) $(BUILT_INS) $(OTHER_PROGRAMS) $(OUTPUT)PERF-BUILD-OPTIONS -ifneq (,$X) - $(foreach p,$(patsubst %$X,%,$(filter %$X,$(ALL_PROGRAMS) $(BUILT_INS) perf$X)), test '$p' -ef '$p$X' || $(RM) '$p';) -endif all:: @@ -921,15 +705,15 @@ please_set_SHELL_PATH_to_a_more_modern_shell: shell_compatibility_test: please_set_SHELL_PATH_to_a_more_modern_shell -strip: $(PROGRAMS) $(OUTPUT)perf$X - $(STRIP) $(STRIP_OPTS) $(PROGRAMS) $(OUTPUT)perf$X +strip: $(PROGRAMS) $(OUTPUT)perf + $(STRIP) $(STRIP_OPTS) $(PROGRAMS) $(OUTPUT)perf $(OUTPUT)perf.o: perf.c $(OUTPUT)common-cmds.h $(OUTPUT)PERF-CFLAGS $(QUIET_CC)$(CC) -DPERF_VERSION='"$(PERF_VERSION)"' \ '-DPERF_HTML_PATH="$(htmldir_SQ)"' \ $(ALL_CFLAGS) -c $(filter %.c,$^) -o $@ -$(OUTPUT)perf$X: $(OUTPUT)perf.o $(BUILTIN_OBJS) $(PERFLIBS) +$(OUTPUT)perf: $(OUTPUT)perf.o $(BUILTIN_OBJS) $(PERFLIBS) $(QUIET_LINK)$(CC) $(ALL_CFLAGS) $(ALL_LDFLAGS) $(OUTPUT)perf.o \ $(BUILTIN_OBJS) $(LIBS) -o $@ @@ -1027,11 +811,11 @@ $(OUTPUT)util/scripting-engines/trace-event-python.o: util/scripting-engines/tra $(OUTPUT)scripts/python/Perf-Trace-Util/Context.o: scripts/python/Perf-Trace-Util/Context.c $(OUTPUT)PERF-CFLAGS $(QUIET_CC)$(CC) -o $@ -c $(ALL_CFLAGS) $(PYTHON_EMBED_CCOPTS) -Wno-redundant-decls -Wno-strict-prototypes -Wno-unused-parameter -Wno-nested-externs $< -$(OUTPUT)perf-%$X: %.o $(PERFLIBS) +$(OUTPUT)perf-%: %.o $(PERFLIBS) $(QUIET_LINK)$(CC) $(ALL_CFLAGS) -o $@ $(ALL_LDFLAGS) $(filter %.o,$^) $(LIBS) $(LIB_OBJS) $(BUILTIN_OBJS): $(LIB_H) -$(patsubst perf-%$X,%.o,$(PROGRAMS)): $(LIB_H) $(wildcard */*.h) +$(patsubst perf-%,%.o,$(PROGRAMS)): $(LIB_H) $(wildcard */*.h) # we compile into subdirectories. if the target directory is not the source directory, they might not exists. So # we depend the various files onto their directories. 
@@ -1168,7 +952,7 @@ export perfexec_instdir install: all $(INSTALL) -d -m 755 '$(DESTDIR_SQ)$(bindir_SQ)' - $(INSTALL) $(OUTPUT)perf$X '$(DESTDIR_SQ)$(bindir_SQ)' + $(INSTALL) $(OUTPUT)perf '$(DESTDIR_SQ)$(bindir_SQ)' $(INSTALL) -d -m 755 '$(DESTDIR_SQ)$(perfexec_instdir_SQ)/scripts/perl/Perf-Trace-Util/lib/Perf/Trace' $(INSTALL) -d -m 755 '$(DESTDIR_SQ)$(perfexec_instdir_SQ)/scripts/perl/bin' $(INSTALL) $(OUTPUT)perf-archive -t '$(DESTDIR_SQ)$(perfexec_instdir_SQ)' @@ -1267,7 +1051,7 @@ distclean: clean clean: $(RM) $(OUTPUT){*.o,*/*.o,*/*/*.o,*/*/*/*.o,$(LIB_FILE),perf-archive} - $(RM) $(ALL_PROGRAMS) $(BUILT_INS) perf$X + $(RM) $(ALL_PROGRAMS) $(BUILT_INS) perf $(RM) $(TEST_PROGRAMS) $(RM) *.spec *.pyc *.pyo */*.pyc */*.pyo $(OUTPUT)common-cmds.h TAGS tags cscope* $(RM) -r $(PERF_TARNAME) .doc-tmp-dir diff --git a/tools/perf/util/util.h b/tools/perf/util/util.h index e833f26f3bfc..fc784284ac8b 100644 --- a/tools/perf/util/util.h +++ b/tools/perf/util/util.h @@ -70,9 +70,7 @@ #include #include #include -#ifndef NO_SYS_SELECT_H #include -#endif #include #include #include @@ -83,10 +81,6 @@ #include "types.h" #include -#ifndef NO_ICONV -#include -#endif - extern const char *graph_line; extern const char *graph_dotted_line; extern char buildid_dir[]; @@ -236,26 +230,6 @@ static inline int sane_case(int x, int high) return x; } -#ifndef DIR_HAS_BSD_GROUP_SEMANTICS -# define FORCE_DIR_SET_GID S_ISGID -#else -# define FORCE_DIR_SET_GID 0 -#endif - -#ifdef NO_NSEC -#undef USE_NSEC -#define ST_CTIME_NSEC(st) 0 -#define ST_MTIME_NSEC(st) 0 -#else -#ifdef USE_ST_TIMESPEC -#define ST_CTIME_NSEC(st) ((unsigned int)((st).st_ctimespec.tv_nsec)) -#define ST_MTIME_NSEC(st) ((unsigned int)((st).st_mtimespec.tv_nsec)) -#else -#define ST_CTIME_NSEC(st) ((unsigned int)((st).st_ctim.tv_nsec)) -#define ST_MTIME_NSEC(st) ((unsigned int)((st).st_mtim.tv_nsec)) -#endif -#endif - int mkdir_p(char *path, mode_t mode); int copyfile(const char *from, const char *to); From 0a54fb63600b745e060d24879ed5194382a466c5 Mon Sep 17 00:00:00 2001 From: Michael Witten Date: Wed, 2 Feb 2011 12:04:27 -0600 Subject: [PATCH 314/706] perf tools: Makefile: Remove tool-specific cruft This commit squashes several commits that remove: NO_C99_FORMAT CURLDIR and EXPATDIR NO_DEFLATE_BOUND CC_LD_DYNPATH and NO_R_TO_GCC_LINKER NO_PERL_MAKEMAKER INTERNAL_QSORT NO_EXTERNAL_GREP NO_PERL SCRIPT_PERL PERL_PATH_SQ Signed-off-by: Michael Witten LKML-Reference: Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/Makefile | 73 +-------------------------------------------- 1 file changed, 1 insertion(+), 72 deletions(-) diff --git a/tools/perf/Makefile b/tools/perf/Makefile index 53c1e93ac432..fde196fe9366 100644 --- a/tools/perf/Makefile +++ b/tools/perf/Makefile @@ -14,40 +14,10 @@ endif # Define V=1 to have a more verbose compile. # Define V=2 to have an even more verbose compile. # -# Define CURLDIR=/foo/bar if your curl header and library files are in -# /foo/bar/include and /foo/bar/lib directories. -# -# Define EXPATDIR=/foo/bar if your expat header and library files are in -# /foo/bar/include and /foo/bar/lib directories. -# -# Define NO_C99_FORMAT if your formatted IO functions (printf/scanf et.al.) -# do not support the 'size specifiers' introduced by C99, namely ll, hh, -# j, z, t. (representing long long int, char, intmax_t, size_t, ptrdiff_t). -# some C compilers supported these specifiers prior to C99 as an extension. -# -# Define NO_DEFLATE_BOUND if your zlib does not have deflateBound. 
-# -# Define NO_R_TO_GCC_LINKER if your gcc does not like "-R/path/lib" -# that tells runtime paths to dynamic libraries; -# "-Wl,-rpath=/path/lib" is used instead. -# # Define ASCIIDOC8 if you want to format documentation with AsciiDoc 8 # # Define DOCBOOK_XSL_172 if you want to format man pages with DocBook XSL v1.72. # -# Define NO_PERL_MAKEMAKER if you cannot use Makefiles generated by perl's -# MakeMaker (e.g. using ActiveState under Cygwin). -# -# Define NO_PERL if you do not want Perl scripts or libraries at all. -# -# Define INTERNAL_QSORT to use Git's implementation of qsort(), which -# is a simplified version of the merge sort used in glibc. This is -# recommended if Git triggers O(n^2) behavior in your platform's qsort(). -# -# Define NO_EXTERNAL_GREP if you don't want "perf grep" to ever call -# your external grep (e.g., if your system lacks grep, if its grep is -# broken, or spawning external process is slower than built-in grep perf has). -# # Define LDFLAGS=-static to build a static binary. # # Define EXTRA_CFLAGS=-m64 or EXTRA_CFLAGS=-m32 as appropriate for cross-builds. @@ -205,7 +175,6 @@ BUILT_INS = LIB_H = LIB_OBJS = PYRF_OBJS = -SCRIPT_PERL = SCRIPT_SH = TEST_PROGRAMS = @@ -221,10 +190,7 @@ $(OUTPUT)python/perf.so: $(PYRF_OBJS) # No Perl scripts right now: # -# SCRIPT_PERL += perf-add--interactive.perl - -SCRIPTS = $(patsubst %.sh,%,$(SCRIPT_SH)) \ - $(patsubst %.perl,%,$(SCRIPT_PERL)) +SCRIPTS = $(patsubst %.sh,%,$(SCRIPT_SH)) # Empty... EXTRA_PROGRAMS = @@ -599,43 +565,10 @@ else endif endif -ifndef CC_LD_DYNPATH - ifdef NO_R_TO_GCC_LINKER - # Some gcc does not accept and pass -R to the linker to specify - # the runtime dynamic library path. - CC_LD_DYNPATH = -Wl,-rpath, - else - CC_LD_DYNPATH = -R - endif -endif - -ifdef NO_C99_FORMAT - BASIC_CFLAGS += -DNO_C99_FORMAT -endif - -ifdef NO_DEFLATE_BOUND - BASIC_CFLAGS += -DNO_DEFLATE_BOUND -endif - -ifdef NO_PERL_MAKEMAKER - export NO_PERL_MAKEMAKER -endif -ifdef INTERNAL_QSORT - COMPAT_CFLAGS += -DINTERNAL_QSORT - COMPAT_OBJS += $(OUTPUT)compat/qsort.o -endif ifdef RUNTIME_PREFIX COMPAT_CFLAGS += -DRUNTIME_PREFIX endif -ifdef NO_EXTERNAL_GREP - BASIC_CFLAGS += -DNO_EXTERNAL_GREP -endif - -ifeq ($(PERL_PATH),) -NO_PERL=NoThanks -endif - QUIET_SUBDIR0 = +$(MAKE) -C # space to separate -C and subdir QUIET_SUBDIR1 = @@ -681,7 +614,6 @@ htmldir_SQ = $(subst ','\'',$(htmldir)) prefix_SQ = $(subst ','\'',$(prefix)) SHELL_PATH_SQ = $(subst ','\'',$(SHELL_PATH)) -PERL_PATH_SQ = $(subst ','\'',$(PERL_PATH)) LIBS = -Wl,--whole-archive $(PERFLIBS) -Wl,--no-whole-archive $(EXTLIBS) @@ -744,7 +676,6 @@ $(patsubst %.sh,%,$(SCRIPT_SH)) : % : %.sh $(QUIET_GEN)$(RM) $(OUTPUT)$@ $(OUTPUT)$@+ && \ sed -e '1s|#!.*/sh|#!$(SHELL_PATH_SQ)|' \ -e 's|@SHELL_PATH@|$(SHELL_PATH_SQ)|' \ - -e 's|@@PERL@@|$(PERL_PATH_SQ)|g' \ -e 's/@@PERF_VERSION@@/$(PERF_VERSION)/g' \ -e 's/@@NO_CURL@@/$(NO_CURL)/g' \ $@.sh > $(OUTPUT)$@+ && \ @@ -761,7 +692,6 @@ configure: configure.ac # These can record PERF_VERSION $(OUTPUT)perf.o perf.spec \ $(patsubst %.sh,%,$(SCRIPT_SH)) \ - $(patsubst %.perl,%,$(SCRIPT_PERL)) \ : $(OUTPUT)PERF-VERSION-FILE $(OUTPUT)%.o: %.c $(OUTPUT)PERF-CFLAGS @@ -903,7 +833,6 @@ $(OUTPUT)PERF-BUILD-OPTIONS: .FORCE-PERF-BUILD-OPTIONS @echo SHELL_PATH=\''$(subst ','\'',$(SHELL_PATH_SQ))'\' >$@ @echo TAR=\''$(subst ','\'',$(subst ','\'',$(TAR)))'\' >>$@ @echo NO_CURL=\''$(subst ','\'',$(subst ','\'',$(NO_CURL)))'\' >>$@ - @echo NO_PERL=\''$(subst ','\'',$(subst ','\'',$(NO_PERL)))'\' >>$@ ### Testing rules From 
a3d1ee10d1bf4520af3d44c1aa6cd46956ec4fd7 Mon Sep 17 00:00:00 2001 From: Michael Witten Date: Wed, 2 Feb 2011 14:22:08 -0600 Subject: [PATCH 315/706] perf tools: Makefile: Remove various and sundry cruft This commit squashes several commits that remove: unnecessary uname calls `sh -c' BUILT_INS and QUIET_BUILT_IN They have no effect, and the `fixup-builtins' and `check-builtins.sh' scripts don't even exist. RUNTIME_PREFIX It's currently never anything but unset, and it's apparently only meaningful when Microsoft Windows is the operating system (according to the source for git). TEST_PROGRAMS EXTRA_PROGRAMS unused SHELL_PATH_SQ portions unused test for V=2 useless exports Only when `V' is undefined (that is, only when the value of `V' is empty) is `export V' performed, which just has the effect of placing the empty-valued variable `V' in the environment. The only other script to make use of `V' is `Documentation/Makefile', which only checks whether `V' is undefined (that is, whether the value of `V' is empty); hence, the `export V' has no effect whatsoever. Similarly, `export QUIET_GEN' is useless because it will only have a non-empty value when `V' has an empty-value, and when `V' has an empty-value, `QUIET_GEN' is always explicitly set in every script in which it is used. `DESTDIR' is only ever defined by the user via the environment or the command line, both of which are automatically exported to sub-make processes. Furthermore, no non-make sub-scripts make use of `DESTDIR' as an environment variable. No other scripts use `perfexec_instdir'. unused QUIET_SUBDIR{0,1} TAR and RPMBUILD PTHREAD_LIBS Maintainer's dist rules and commands distclean target Test suite coverage testing PRINT_DIR and NO_SUBDIR `configure' target NO_CURL @@PERF_VERSION@@ substitution Without the sed command, all of the rule's commands can be reduced to a single line that copies a file and sets the permissions properly in the process. `make test' echo line template_instdir PERF-BUILD-OPTIONS double-colon rules The use of double-colon rules seems misguided or vestigial git. Essentially hard-coded $(SCRIPTS) expansion Signed-off-by: Michael Witten LKML-Reference: Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/.gitignore | 1 - tools/perf/Makefile | 213 ++----------------------------------- tools/perf/util/exec_cmd.c | 19 ---- 3 files changed, 10 insertions(+), 223 deletions(-) diff --git a/tools/perf/.gitignore b/tools/perf/.gitignore index cb43289e447f..416684be0ad3 100644 --- a/tools/perf/.gitignore +++ b/tools/perf/.gitignore @@ -1,4 +1,3 @@ -PERF-BUILD-OPTIONS PERF-CFLAGS PERF-GUI-VARS PERF-VERSION-FILE diff --git a/tools/perf/Makefile b/tools/perf/Makefile index fde196fe9366..9b8421805c5c 100644 --- a/tools/perf/Makefile +++ b/tools/perf/Makefile @@ -3,7 +3,7 @@ ifeq ("$(origin O)", "command line") endif # The default target of this Makefile is... -all:: +all: ifneq ($(OUTPUT),) # check that the output directory actually exists @@ -11,8 +11,7 @@ OUTDIR := $(shell cd $(OUTPUT) && /bin/pwd) $(if $(OUTDIR),, $(error output directory "$(OUTPUT)" does not exist)) endif -# Define V=1 to have a more verbose compile. -# Define V=2 to have an even more verbose compile. +# Define V to have a more verbose compile. 
# # Define ASCIIDOC8 if you want to format documentation with AsciiDoc 8 # @@ -28,12 +27,7 @@ $(OUTPUT)PERF-VERSION-FILE: .FORCE-PERF-VERSION-FILE @$(SHELL_PATH) util/PERF-VERSION-GEN $(OUTPUT) -include $(OUTPUT)PERF-VERSION-FILE -uname_S := $(shell sh -c 'uname -s 2>/dev/null || echo not') -uname_M := $(shell sh -c 'uname -m 2>/dev/null || echo not') -uname_O := $(shell sh -c 'uname -o 2>/dev/null || echo not') -uname_R := $(shell sh -c 'uname -r 2>/dev/null || echo not') -uname_P := $(shell sh -c 'uname -p 2>/dev/null || echo not') -uname_V := $(shell sh -c 'uname -v 2>/dev/null || echo not') +uname_M := $(shell uname -m 2>/dev/null || echo not) ARCH ?= $(shell echo $(uname_M) | sed -e s/i.86/i386/ -e s/sun4u/sparc64/ \ -e s/arm.*/arm/ -e s/sa110/arm/ \ @@ -52,8 +46,6 @@ ifeq ($(ARCH),x86_64) ARCH_INCLUDE = ../../arch/x86/lib/memcpy_64.S endif -# CFLAGS and LDFLAGS are for the users to override from the command line. - # # Include saner warnings here, which can catch bugs: # @@ -131,22 +123,13 @@ CC = $(CROSS_COMPILE)gcc AR = $(CROSS_COMPILE)ar RM = rm -f MKDIR = mkdir -TAR = tar FIND = find INSTALL = install -RPMBUILD = rpmbuild -PTHREAD_LIBS = -lpthread # sparse is architecture-neutral, which means that we need to tell it # explicitly what architecture to check for. Fix this up for yours.. SPARSE_FLAGS = -D__BIG_ENDIAN__ -D__powerpc__ -ifeq ($(V), 2) - QUIET_STDERR = ">/dev/null" -else - QUIET_STDERR = ">/dev/null 2>&1" -endif - -include feature-tests.mak ifeq ($(call try-cc,$(SOURCE_HELLO),-Werror -fstack-protector-all),y) @@ -171,12 +154,10 @@ BASIC_LDFLAGS = # Guard against environment variables BUILTIN_OBJS = -BUILT_INS = LIB_H = LIB_OBJS = PYRF_OBJS = SCRIPT_SH = -TEST_PROGRAMS = SCRIPT_SH += perf-archive.sh @@ -192,12 +173,6 @@ $(OUTPUT)python/perf.so: $(PYRF_OBJS) SCRIPTS = $(patsubst %.sh,%,$(SCRIPT_SH)) -# Empty... -EXTRA_PROGRAMS = - -# ... and all the rest that could be moved out of bindir to perfexecdir -PROGRAMS += $(EXTRA_PROGRAMS) - # # Single 'perf' binary right now: # @@ -205,10 +180,6 @@ PROGRAMS += $(OUTPUT)perf LANG_BINDINGS = -# List built-in command $C whose implementation cmd_$C() is not in -# builtin-$C.o but is linked in as part of some other command. 
-# - # what 'all' will build and 'install' will install, in perfexecdir ALL_PROGRAMS = $(PROGRAMS) $(SCRIPTS) @@ -565,33 +536,13 @@ else endif endif -ifdef RUNTIME_PREFIX - COMPAT_CFLAGS += -DRUNTIME_PREFIX -endif - -QUIET_SUBDIR0 = +$(MAKE) -C # space to separate -C and subdir -QUIET_SUBDIR1 = - -ifneq ($(findstring $(MAKEFLAGS),w),w) -PRINT_DIR = --no-print-directory -else # "make -w" -NO_SUBDIR = : -endif - ifneq ($(findstring $(MAKEFLAGS),s),s) ifndef V QUIET_CC = @echo ' ' CC $@; QUIET_AR = @echo ' ' AR $@; QUIET_LINK = @echo ' ' LINK $@; QUIET_MKDIR = @echo ' ' MKDIR $@; - QUIET_BUILT_IN = @echo ' ' BUILTIN $@; QUIET_GEN = @echo ' ' GEN $@; - QUIET_SUBDIR0 = +@subdir= - QUIET_SUBDIR1 = ;$(NO_SUBDIR) echo ' ' SUBDIR $$subdir; \ - $(MAKE) $(PRINT_DIR) -C $$subdir - export V - export QUIET_GEN - export QUIET_BUILT_IN endif endif @@ -621,16 +572,14 @@ ALL_CFLAGS += $(BASIC_CFLAGS) ALL_CFLAGS += $(ARCH_CFLAGS) ALL_LDFLAGS += $(BASIC_LDFLAGS) -export TAR INSTALL DESTDIR SHELL_PATH +export INSTALL SHELL_PATH ### Build rules SHELL = $(SHELL_PATH) -all:: shell_compatibility_test $(ALL_PROGRAMS) $(LANG_BINDINGS) $(BUILT_INS) $(OTHER_PROGRAMS) $(OUTPUT)PERF-BUILD-OPTIONS - -all:: +all: shell_compatibility_test $(ALL_PROGRAMS) $(LANG_BINDINGS) $(OTHER_PROGRAMS) please_set_SHELL_PATH_to_a_more_modern_shell: @$$(:) @@ -661,37 +610,17 @@ $(OUTPUT)builtin-timechart.o: builtin-timechart.c $(OUTPUT)common-cmds.h $(OUTPU '-DPERF_MAN_PATH="$(mandir_SQ)"' \ '-DPERF_INFO_PATH="$(infodir_SQ)"' $< -$(BUILT_INS): $(OUTPUT)perf$X - $(QUIET_BUILT_IN)$(RM) $@ && \ - ln perf$X $@ 2>/dev/null || \ - ln -s perf$X $@ 2>/dev/null || \ - cp perf$X $@ - $(OUTPUT)common-cmds.h: util/generate-cmdlist.sh command-list.txt $(OUTPUT)common-cmds.h: $(wildcard Documentation/perf-*.txt) $(QUIET_GEN). util/generate-cmdlist.sh > $@+ && mv $@+ $@ -$(patsubst %.sh,%,$(SCRIPT_SH)) : % : %.sh - $(QUIET_GEN)$(RM) $(OUTPUT)$@ $(OUTPUT)$@+ && \ - sed -e '1s|#!.*/sh|#!$(SHELL_PATH_SQ)|' \ - -e 's|@SHELL_PATH@|$(SHELL_PATH_SQ)|' \ - -e 's/@@PERF_VERSION@@/$(PERF_VERSION)/g' \ - -e 's/@@NO_CURL@@/$(NO_CURL)/g' \ - $@.sh > $(OUTPUT)$@+ && \ - chmod +x $(OUTPUT)$@+ && \ - mv $(OUTPUT)$@+ $(OUTPUT)$@ - -configure: configure.ac - $(QUIET_GEN)$(RM) $@ $<+ && \ - sed -e 's/@@PERF_VERSION@@/$(PERF_VERSION)/g' \ - $< > $<+ && \ - autoconf -o $@ $<+ && \ - $(RM) $<+ +$(SCRIPTS) : % : %.sh + $(QUIET_GEN)$(INSTALL) '$@.sh' '$(OUTPUT)$@' # These can record PERF_VERSION $(OUTPUT)perf.o perf.spec \ - $(patsubst %.sh,%,$(SCRIPT_SH)) \ + $(SCRIPTS) \ : $(OUTPUT)PERF-VERSION-FILE $(OUTPUT)%.o: %.c $(OUTPUT)PERF-CFLAGS @@ -826,23 +755,8 @@ $(OUTPUT)PERF-CFLAGS: .FORCE-PERF-CFLAGS echo "$$FLAGS" >$(OUTPUT)PERF-CFLAGS; \ fi -# We need to apply sq twice, once to protect from the shell -# that runs $(OUTPUT)PERF-BUILD-OPTIONS, and then again to protect it -# and the first level quoting from the shell that runs "echo". -$(OUTPUT)PERF-BUILD-OPTIONS: .FORCE-PERF-BUILD-OPTIONS - @echo SHELL_PATH=\''$(subst ','\'',$(SHELL_PATH_SQ))'\' >$@ - @echo TAR=\''$(subst ','\'',$(subst ','\'',$(TAR)))'\' >>$@ - @echo NO_CURL=\''$(subst ','\'',$(subst ','\'',$(NO_CURL)))'\' >>$@ - ### Testing rules -# -# None right now: -# -# TEST_PROGRAMS += test-something$X - -all:: $(TEST_PROGRAMS) - # GNU make supports exporting all variables by "export" without parameters. # However, the environment gets quite big, and some programs have problems # with that. 
@@ -855,29 +769,17 @@ check: $(OUTPUT)common-cmds.h sparse $(ALL_CFLAGS) $(SPARSE_FLAGS) $$i || exit; \ done; \ else \ - echo 2>&1 "Did you mean 'make test'?"; \ exit 1; \ fi -remove-dashes: - ./fixup-builtins $(BUILT_INS) $(PROGRAMS) $(SCRIPTS) - ### Installation rules -ifneq ($(filter /%,$(firstword $(template_dir))),) -template_instdir = $(template_dir) -else -template_instdir = $(prefix)/$(template_dir) -endif -export template_instdir - ifneq ($(filter /%,$(firstword $(perfexecdir))),) perfexec_instdir = $(perfexecdir) else perfexec_instdir = $(prefix)/$(perfexecdir) endif perfexec_instdir_SQ = $(subst ','\'',$(perfexec_instdir)) -export perfexec_instdir install: all $(INSTALL) -d -m 755 '$(DESTDIR_SQ)$(bindir_SQ)' @@ -894,14 +796,6 @@ install: all $(INSTALL) scripts/python/*.py -t '$(DESTDIR_SQ)$(perfexec_instdir_SQ)/scripts/python' $(INSTALL) scripts/python/bin/* -t '$(DESTDIR_SQ)$(perfexec_instdir_SQ)/scripts/python/bin' -ifdef BUILT_INS - $(INSTALL) -d -m 755 '$(DESTDIR_SQ)$(perfexec_instdir_SQ)' - $(INSTALL) $(BUILT_INS) '$(DESTDIR_SQ)$(perfexec_instdir_SQ)' -ifneq (,$X) - $(foreach p,$(patsubst %$X,%,$(filter %$X,$(ALL_PROGRAMS) $(BUILT_INS) $(OUTPUT)perf$X)), $(RM) '$(DESTDIR_SQ)$(perfexec_instdir_SQ)/$p';) -endif -endif - install-doc: $(MAKE) -C Documentation install @@ -926,104 +820,17 @@ quick-install-man: quick-install-html: $(MAKE) -C Documentation quick-install-html - -### Maintainer's dist rules -# -# None right now -# -# -# perf.spec: perf.spec.in -# sed -e 's/@@VERSION@@/$(PERF_VERSION)/g' < $< > $@+ -# mv $@+ $@ -# -# PERF_TARNAME=perf-$(PERF_VERSION) -# dist: perf.spec perf-archive$(X) configure -# ./perf-archive --format=tar \ -# --prefix=$(PERF_TARNAME)/ HEAD^{tree} > $(PERF_TARNAME).tar -# @mkdir -p $(PERF_TARNAME) -# @cp perf.spec configure $(PERF_TARNAME) -# @echo $(PERF_VERSION) > $(PERF_TARNAME)/version -# $(TAR) rf $(PERF_TARNAME).tar \ -# $(PERF_TARNAME)/perf.spec \ -# $(PERF_TARNAME)/configure \ -# $(PERF_TARNAME)/version -# @$(RM) -r $(PERF_TARNAME) -# gzip -f -9 $(PERF_TARNAME).tar -# -# htmldocs = perf-htmldocs-$(PERF_VERSION) -# manpages = perf-manpages-$(PERF_VERSION) -# dist-doc: -# $(RM) -r .doc-tmp-dir -# mkdir .doc-tmp-dir -# $(MAKE) -C Documentation WEBDOC_DEST=../.doc-tmp-dir install-webdoc -# cd .doc-tmp-dir && $(TAR) cf ../$(htmldocs).tar . -# gzip -n -9 -f $(htmldocs).tar -# : -# $(RM) -r .doc-tmp-dir -# mkdir -p .doc-tmp-dir/man1 .doc-tmp-dir/man5 .doc-tmp-dir/man7 -# $(MAKE) -C Documentation DESTDIR=./ \ -# man1dir=../.doc-tmp-dir/man1 \ -# man5dir=../.doc-tmp-dir/man5 \ -# man7dir=../.doc-tmp-dir/man7 \ -# install -# cd .doc-tmp-dir && $(TAR) cf ../$(manpages).tar . 
-# gzip -n -9 -f $(manpages).tar
-# $(RM) -r .doc-tmp-dir
-#
-# rpm: dist
-#	$(RPMBUILD) -ta $(PERF_TARNAME).tar.gz
-
 ### Cleaning rules
 
-distclean: clean
-#	$(RM) configure
-
 clean:
 	$(RM) $(OUTPUT){*.o,*/*.o,*/*/*.o,*/*/*/*.o,$(LIB_FILE),perf-archive}
-	$(RM) $(ALL_PROGRAMS) $(BUILT_INS) perf
-	$(RM) $(TEST_PROGRAMS)
+	$(RM) $(ALL_PROGRAMS) perf
 	$(RM) *.spec *.pyc *.pyo */*.pyc */*.pyo $(OUTPUT)common-cmds.h TAGS tags cscope*
-	$(RM) -r $(PERF_TARNAME) .doc-tmp-dir
-	$(RM) $(PERF_TARNAME).tar.gz perf-core_$(PERF_VERSION)-*.tar.gz
-	$(RM) $(htmldocs).tar.gz $(manpages).tar.gz
 	$(MAKE) -C Documentation/ clean
-	$(RM) $(OUTPUT)PERF-VERSION-FILE $(OUTPUT)PERF-CFLAGS $(OUTPUT)PERF-BUILD-OPTIONS
+	$(RM) $(OUTPUT)PERF-VERSION-FILE $(OUTPUT)PERF-CFLAGS
 	@python util/setup.py clean --build-lib='$(OUTPUT)python' \
 		--build-temp='$(OUTPUT)python/temp'
 
 .PHONY: all install clean strip
 .PHONY: shell_compatibility_test please_set_SHELL_PATH_to_a_more_modern_shell
 .PHONY: .FORCE-PERF-VERSION-FILE TAGS tags cscope .FORCE-PERF-CFLAGS
-.PHONY: .FORCE-PERF-BUILD-OPTIONS
-
-### Make sure built-ins do not have dups and listed in perf.c
-#
-check-builtins::
-	./check-builtins.sh
-
-### Test suite coverage testing
-#
-# None right now
-#
-# .PHONY: coverage coverage-clean coverage-build coverage-report
-#
-# coverage:
-#	$(MAKE) coverage-build
-#	$(MAKE) coverage-report
-#
-# coverage-clean:
-#	rm -f *.gcda *.gcno
-#
-# COVERAGE_CFLAGS = $(CFLAGS) -O0 -ftest-coverage -fprofile-arcs
-# COVERAGE_LDFLAGS = $(CFLAGS) -O0 -lgcov
-#
-# coverage-build: coverage-clean
-#	$(MAKE) CFLAGS="$(COVERAGE_CFLAGS)" LDFLAGS="$(COVERAGE_LDFLAGS)" all
-#	$(MAKE) CFLAGS="$(COVERAGE_CFLAGS)" LDFLAGS="$(COVERAGE_LDFLAGS)" \
-#		-j1 test
-#
-# coverage-report:
-#	gcov -b *.c */*.c
-#	grep '^function.*called 0 ' *.c.gcov */*.c.gcov \
-#		| sed -e 's/\([^:]*\)\.gcov: *function \([^ ]*\) called.*/\1: \2/' \
-#		| tee coverage-untested-functions
diff --git a/tools/perf/util/exec_cmd.c b/tools/perf/util/exec_cmd.c
index 67eeff571568..7adf4ad15d8f 100644
--- a/tools/perf/util/exec_cmd.c
+++ b/tools/perf/util/exec_cmd.c
@@ -11,31 +11,12 @@ static const char *argv0_path;
 
 const char *system_path(const char *path)
 {
-#ifdef RUNTIME_PREFIX
-	static const char *prefix;
-#else
 	static const char *prefix = PREFIX;
-#endif
 	struct strbuf d = STRBUF_INIT;
 
 	if (is_absolute_path(path))
 		return path;
 
-#ifdef RUNTIME_PREFIX
-	assert(argv0_path);
-	assert(is_absolute_path(argv0_path));
-
-	if (!prefix &&
-	    !(prefix = strip_path_suffix(argv0_path, PERF_EXEC_PATH)) &&
-	    !(prefix = strip_path_suffix(argv0_path, BINDIR)) &&
-	    !(prefix = strip_path_suffix(argv0_path, "perf"))) {
-		prefix = PREFIX;
-		fprintf(stderr, "RUNTIME_PREFIX requested, "
-			"but prefix computation failed. "
-			"Using static fallback '%s'.\n", prefix);
-	}
-#endif
-
 	strbuf_addf(&d, "%s/%s", prefix, path);
 	path = strbuf_detach(&d, NULL);
 	return path;

From 1d4610527bc71d3f9eea520fc51a02d54f79dcd0 Mon Sep 17 00:00:00 2001
From: Konrad Rzeszutek Wilk
Date: Wed, 16 Feb 2011 13:43:22 -0500
Subject: [PATCH 316/706] xen-pcifront: Sanity check the MSI/MSI-X values

Check the returned vector values for any values that are odd or plain
incorrect (say vector value zero), and if so print a warning. Also fix up
the return values.

This patch was precipitated by the Xen PCIBack returning the incorrect
values due to how it was retrieving PIRQ values. This has been fixed in
xen-pciback by the "xen/pciback: Utilize 'xen_pirq_from_irq' to get PIRQ
value" patch.
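[Editor's illustration, not part of the patch: the sanity check as a standalone sketch. sanitize_vector() is a hypothetical helper; the driver open-codes this logic in pci_frontend_enable_msi() and pci_frontend_enable_msix() below. A vector value of zero or less from the backend is reported with a warning, poisoned with -1 so it cannot be used, and turned into -EINVAL.]

#include <linux/pci.h>

/* Hypothetical helper; assumes vector[] has room for index idx. */
static int sanitize_vector(struct pci_dev *dev, int idx, int value,
			   int vector[])
{
	if (value <= 0) {
		dev_warn(&dev->dev, "MSI/MSI-X entry %d is invalid: %d!\n",
			 idx, value);
		vector[idx] = -1;	/* poison the slot */
		return -EINVAL;		/* backend handed us garbage */
	}
	vector[idx] = value;
	return 0;
}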
Reviewed-by: Ian Campbell
Signed-off-by: Konrad Rzeszutek Wilk
---
 drivers/pci/xen-pcifront.c | 20 ++++++++++++++++----
 1 file changed, 16 insertions(+), 4 deletions(-)

diff --git a/drivers/pci/xen-pcifront.c b/drivers/pci/xen-pcifront.c
index 030ce3743439..5c7b6ad68056 100644
--- a/drivers/pci/xen-pcifront.c
+++ b/drivers/pci/xen-pcifront.c
@@ -277,18 +277,24 @@ static int pci_frontend_enable_msix(struct pci_dev *dev,
 	if (likely(!err)) {
 		if (likely(!op.value)) {
 			/* we get the result */
-			for (i = 0; i < nvec; i++)
+			for (i = 0; i < nvec; i++) {
+				if (op.msix_entries[i].vector <= 0) {
+					dev_warn(&dev->dev, "MSI-X entry %d is invalid: %d!\n",
+						i, op.msix_entries[i].vector);
+					err = -EINVAL;
+					*(*vector+i) = -1;
+					continue;
+				}
 				*(*vector+i) = op.msix_entries[i].vector;
-			return 0;
+			}
 		} else {
 			printk(KERN_DEBUG "enable msix get value %x\n",
 				op.value);
-			return op.value;
 		}
 	} else {
 		dev_err(&dev->dev, "enable msix get err %x\n", err);
-		return err;
 	}
+	return err;
 }
 
 static void pci_frontend_disable_msix(struct pci_dev *dev)
@@ -325,6 +331,12 @@ static int pci_frontend_enable_msi(struct pci_dev *dev, int **vector)
 	err = do_pci_op(pdev, &op);
 	if (likely(!err)) {
 		*(*vector) = op.value;
+		if (op.value <= 0) {
+			dev_warn(&dev->dev, "MSI entry is invalid: %d!\n",
+				op.value);
+			err = -EINVAL;
+			*(*vector) = -1;
+		}
 	} else {
 		dev_err(&dev->dev, "pci frontend enable msi failed for dev "
 				    "%x:%x\n", op.bus, op.devfn);

From 55cb8cd45e0600df1473489518d7f12ce1bbe973 Mon Sep 17 00:00:00 2001
From: Konrad Rzeszutek Wilk
Date: Wed, 16 Feb 2011 13:43:04 -0500
Subject: [PATCH 317/706] pci/xen: Use xen_allocate_pirq_msi instead of xen_allocate_pirq

xen_allocate_pirq -> xen_map_pirq_gsi -> PHYSDEVOP_alloc_irq_vector IFF
xen_initial_domain(), in addition to the kernel-side book-keeping (set
chip and handler, update irq_info, etc.), whereas xen_allocate_pirq_msi
just does the kernel book-keeping. Also, xen_allocate_pirq allocates an
IRQ in the 1-1 GSI space, whereas xen_allocate_pirq_msi allocates a
dynamic one in the >GSI IRQ space.

All of this is unnecessary, as this code path is only executed when we
run as a domU PV guest with an MSI/MSI-X PCI card passed in. Hence we can
jump straight to allocating a dynamic IRQ (and binding it to the proper
PIRQ) and skip the rest.

In short: this change is a cosmetic one.

Reviewed-by: Ian Campbell
Reviewed-by: Stefano Stabellini
Signed-off-by: Konrad Rzeszutek Wilk
---
 arch/x86/pci/xen.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/arch/x86/pci/xen.c b/arch/x86/pci/xen.c
index 25cd4a07d09f..6432f751ee4f 100644
--- a/arch/x86/pci/xen.c
+++ b/arch/x86/pci/xen.c
@@ -157,14 +157,14 @@ static int xen_setup_msi_irqs(struct pci_dev *dev, int nvec, int type)
 		goto error;
 	i = 0;
 	list_for_each_entry(msidesc, &dev->msi_list, list) {
-		irq = xen_allocate_pirq(v[i], 0, /* not sharable */
+		xen_allocate_pirq_msi(
 			(type == PCI_CAP_ID_MSIX) ?
- "pcifront-msi-x" : "pcifront-msi"); + "pcifront-msi-x" : "pcifront-msi", + &irq, &v[i], XEN_ALLOC_IRQ); if (irq < 0) { ret = -1; goto free; } - ret = set_irq_msi(irq, msidesc); if (ret) goto error_while; From 13884c6680973f0ce3483dc59b636b4962d6dafe Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Fri, 17 Dec 2010 16:33:53 +0100 Subject: [PATCH 318/706] x86/pci: Remove unused variable |arch/x86/pci/ce4100.c: In function `ce4100_conf_read': |arch/x86/pci/ce4100.c:257:9: warning: unused variable `retval' Signed-off-by: Sebastian Andrzej Siewior Cc: dirk.brandewie@gmail.com LKML-Reference: <1292600033-12271-16-git-send-email-bigeasy@linutronix.de> Signed-off-by: Thomas Gleixner --- arch/x86/pci/ce4100.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/pci/ce4100.c b/arch/x86/pci/ce4100.c index 85b68ef5e809..c63c6d3dd8e8 100644 --- a/arch/x86/pci/ce4100.c +++ b/arch/x86/pci/ce4100.c @@ -254,7 +254,7 @@ int bridge_read(unsigned int devfn, int reg, int len, u32 *value) static int ce4100_conf_read(unsigned int seg, unsigned int bus, unsigned int devfn, int reg, int len, u32 *value) { - int i, retval = 1; + int i; if (bus == 1) { for (i = 0; i < ARRAY_SIZE(bus1_fixups); i++) { From db1c1cce4a653dcbe6949c72ae7b9f42cab1b929 Mon Sep 17 00:00:00 2001 From: Richard Cochran Date: Fri, 18 Feb 2011 10:07:25 +0100 Subject: [PATCH 319/706] ntp: Remove redundant and incorrect parameter check The ADJ_SETOFFSET code redundantly checks the range of the nanoseconds field of the time value. This field is checked again in the subsequent call to timekeeping_inject_offset(). Also, as is, the check will not detect whether the number of microseconds is out of range. Let timekeeping_inject_offset() do the error checking. Signed-off-by: Richard Cochran Cc: johnstul@us.ibm.com LKML-Reference: <20110218090724.GA2924@riccoc20.at.omicron.at> Signed-off-by: Thomas Gleixner --- kernel/time/ntp.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c index 5ac593267a26..5f1bb8e2008f 100644 --- a/kernel/time/ntp.c +++ b/kernel/time/ntp.c @@ -650,13 +650,13 @@ int do_adjtimex(struct timex *txc) if (txc->modes & ADJ_SETOFFSET) { struct timespec delta; - if ((unsigned long)txc->time.tv_usec >= NSEC_PER_SEC) - return -EINVAL; delta.tv_sec = txc->time.tv_sec; delta.tv_nsec = txc->time.tv_usec; if (!(txc->modes & ADJ_NANO)) delta.tv_nsec *= 1000; - timekeeping_inject_offset(&delta); + result = timekeeping_inject_offset(&delta); + if (result) + return result; } getnstimeofday(&ts); From cc0f89c4a426fcd6400a89e9e34e4a8851abef76 Mon Sep 17 00:00:00 2001 From: Konrad Rzeszutek Wilk Date: Thu, 17 Feb 2011 12:02:23 -0500 Subject: [PATCH 320/706] pci/xen: Cleanup: convert int** to int[] Cleanup code. Cosmetic change to make the code look easier to read. Reviewed-by: Ian Campbell Signed-off-by: Konrad Rzeszutek Wilk --- arch/x86/include/asm/xen/pci.h | 8 ++++---- arch/x86/pci/xen.c | 4 ++-- drivers/pci/xen-pcifront.c | 12 ++++++------ 3 files changed, 12 insertions(+), 12 deletions(-) diff --git a/arch/x86/include/asm/xen/pci.h b/arch/x86/include/asm/xen/pci.h index 2329b3eaf8d3..aa8620989162 100644 --- a/arch/x86/include/asm/xen/pci.h +++ b/arch/x86/include/asm/xen/pci.h @@ -27,16 +27,16 @@ static inline void __init xen_setup_pirqs(void) * its own functions. 
 */
 struct xen_pci_frontend_ops {
-	int (*enable_msi)(struct pci_dev *dev, int **vectors);
+	int (*enable_msi)(struct pci_dev *dev, int vectors[]);
 	void (*disable_msi)(struct pci_dev *dev);
-	int (*enable_msix)(struct pci_dev *dev, int **vectors, int nvec);
+	int (*enable_msix)(struct pci_dev *dev, int vectors[], int nvec);
 	void (*disable_msix)(struct pci_dev *dev);
 };
 
 extern struct xen_pci_frontend_ops *xen_pci_frontend;
 
 static inline int xen_pci_frontend_enable_msi(struct pci_dev *dev,
-					      int **vectors)
+					      int vectors[])
 {
 	if (xen_pci_frontend && xen_pci_frontend->enable_msi)
 		return xen_pci_frontend->enable_msi(dev, vectors);
@@ -48,7 +48,7 @@ static inline void xen_pci_frontend_disable_msi(struct pci_dev *dev)
 		xen_pci_frontend->disable_msi(dev);
 }
 static inline int xen_pci_frontend_enable_msix(struct pci_dev *dev,
-					       int **vectors, int nvec)
+					       int vectors[], int nvec)
 {
 	if (xen_pci_frontend && xen_pci_frontend->enable_msix)
 		return xen_pci_frontend->enable_msix(dev, vectors, nvec);
diff --git a/arch/x86/pci/xen.c b/arch/x86/pci/xen.c
index 6432f751ee4f..30fdd09dea05 100644
--- a/arch/x86/pci/xen.c
+++ b/arch/x86/pci/xen.c
@@ -150,9 +150,9 @@ static int xen_setup_msi_irqs(struct pci_dev *dev, int nvec, int type)
 		return -ENOMEM;
 
 	if (type == PCI_CAP_ID_MSIX)
-		ret = xen_pci_frontend_enable_msix(dev, &v, nvec);
+		ret = xen_pci_frontend_enable_msix(dev, v, nvec);
 	else
-		ret = xen_pci_frontend_enable_msi(dev, &v);
+		ret = xen_pci_frontend_enable_msi(dev, v);
 	if (ret)
 		goto error;
 	i = 0;
diff --git a/drivers/pci/xen-pcifront.c b/drivers/pci/xen-pcifront.c
index 5c7b6ad68056..492b7d807fe8 100644
--- a/drivers/pci/xen-pcifront.c
+++ b/drivers/pci/xen-pcifront.c
@@ -243,7 +243,7 @@ struct pci_ops pcifront_bus_ops = {
 #ifdef CONFIG_PCI_MSI
 static int pci_frontend_enable_msix(struct pci_dev *dev,
-				    int **vector, int nvec)
+				    int vector[], int nvec)
 {
 	int err;
 	int i;
@@ -282,10 +282,10 @@ static int pci_frontend_enable_msix(struct pci_dev *dev,
 				dev_warn(&dev->dev, "MSI-X entry %d is invalid: %d!\n",
 					i, op.msix_entries[i].vector);
 				err = -EINVAL;
-				*(*vector+i) = -1;
+				vector[i] = -1;
 				continue;
 			}
-			*(*vector+i) = op.msix_entries[i].vector;
+			vector[i] = op.msix_entries[i].vector;
 		}
 	} else {
 		printk(KERN_DEBUG "enable msix get value %x\n",
@@ -316,7 +316,7 @@ static void pci_frontend_disable_msix(struct pci_dev *dev)
 		dev_err(&dev->dev, "pci_disable_msix get err %x\n", err);
 }
 
-static int pci_frontend_enable_msi(struct pci_dev *dev, int **vector)
+static int pci_frontend_enable_msi(struct pci_dev *dev, int vector[])
 {
 	int err;
 	struct xen_pci_op op = {
@@ -330,12 +330,12 @@ static int pci_frontend_enable_msi(struct pci_dev *dev, int vector[])
 	err = do_pci_op(pdev, &op);
 	if (likely(!err)) {
-		*(*vector) = op.value;
+		vector[0] = op.value;
 		if (op.value <= 0) {
 			dev_warn(&dev->dev, "MSI entry is invalid: %d!\n",
 				op.value);
 			err = -EINVAL;
-			*(*vector) = -1;
+			vector[0] = -1;
 		}
 	} else {
 		dev_err(&dev->dev, "pci frontend enable msi failed for dev "
 				    "%x:%x\n", op.bus, op.devfn);

From 3d74a539ae07a8f3c061332e426fc07b2310cf05 Mon Sep 17 00:00:00 2001
From: Konrad Rzeszutek Wilk
Date: Thu, 17 Feb 2011 16:12:51 -0500
Subject: [PATCH 321/706] pci/xen: When free-ing MSI-X/MSI irq->desc also use generic code.

This code path is only run when an MSI/MSI-X PCI device is passed in to
PV DomU.

In the 2.6.37 timeframe we overwrote the default cleanup handler for
MSI/MSI-X irq->desc to be "xen_teardown_msi_irqs". That function calls
the xen-pcifront driver, which can tell the backend to clean up/take back
the MSI/MSI-X device.
However, we forgot to continue the process of freeing the MSI/MSI-X device resources (irq->desc) on the PV domU side, which is what the default cleanup handler, default_teardown_msi_irqs, did. Hence we would leak IRQ descriptors.

Without this patch, doing "rmmod igbvf;modprobe igbvf" multiple times ends with abandoned IRQ descriptors:
 28:	5	xen-pirq-pcifront-msi-x
 29:	8	xen-pirq-pcifront-msi-x
...
130:	10	xen-pirq-pcifront-msi-x
with the end result of running out of IRQ descriptors.

Reviewed-by: Ian Campbell
Signed-off-by: Konrad Rzeszutek Wilk
---
 arch/x86/pci/xen.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/arch/x86/pci/xen.c b/arch/x86/pci/xen.c
index 30fdd09dea05..57afd1da491d 100644
--- a/arch/x86/pci/xen.c
+++ b/arch/x86/pci/xen.c
@@ -193,6 +193,9 @@ static void xen_teardown_msi_irqs(struct pci_dev *dev)
		xen_pci_frontend_disable_msix(dev);
	else
		xen_pci_frontend_disable_msi(dev);
+
+	/* Free the IRQ's and the msidesc using the generic code. */
+	default_teardown_msi_irqs(dev);
}
static void xen_teardown_msi_irq(unsigned int irq)

From 5df91509d324d44cfb11e55d9cb02fe18b53b045 Mon Sep 17 00:00:00 2001
From: "jacob.jun.pan@linux.intel.com"
Date: Fri, 18 Feb 2011 13:42:54 -0800
Subject: [PATCH 322/706] x86: mrst: Remove apb timer read workaround

The APB timer current count was unreliable in the earlier silicon, which could result in time going backwards. This problem has been fixed in the current silicon stepping. This patch removes the workaround which was used to check and prevent the timer rolling back when the APB timer is used as a clocksource device.

The workaround code was also flawed by a potential race condition around the cached read value last_read. A fix could be done by assigning last_read to a local variable at the beginning of apbt_read_clocksource(), but this is no longer necessary.

[ tglx: A sane timer on an Intel chip - I can't believe it ]

Signed-off-by: Jacob Pan
Cc: Arjan van de Ven
Cc: Alan Cox
LKML-Reference: <1298065374-25532-1-git-send-email-jacob.jun.pan@linux.intel.com>
Signed-off-by: Thomas Gleixner
---
 arch/x86/kernel/apb_timer.c | 58 ++-----------------------------------
 1 file changed, 3 insertions(+), 55 deletions(-)

diff --git a/arch/x86/kernel/apb_timer.c b/arch/x86/kernel/apb_timer.c
index 7c9ab59653e8..afc406498c9d 100644
--- a/arch/x86/kernel/apb_timer.c
+++ b/arch/x86/kernel/apb_timer.c
@@ -506,64 +506,12 @@ static int apbt_next_event(unsigned long delta,
	return 0;
}
-/*
- * APB timer clock is not in sync with pclk on Langwell, which translates to
- * unreliable read value caused by sampling error. the error does not add up
- * overtime and only happens when sampling a 0 as a 1 by mistake. so the time
- * would go backwards. the following code is trying to prevent time traveling
- * backwards. little bit paranoid.
- */
static cycle_t apbt_read_clocksource(struct clocksource *cs)
{
-	unsigned long t0, t1, t2;
-	static unsigned long last_read;
+	unsigned long current_count;
-bad_count:
-	t1 = apbt_readl(phy_cs_timer_id,
-			APBTMR_N_CURRENT_VALUE);
-	t2 = apbt_readl(phy_cs_timer_id,
-			APBTMR_N_CURRENT_VALUE);
-	if (unlikely(t1 < t2)) {
-		pr_debug("APBT: read current count error %lx:%lx:%lx\n",
-			 t1, t2, t2 - t1);
-		goto bad_count;
-	}
-	/*
-	 * check against cached last read, makes sure time does not go back.
-	 * it could be a normal rollover but we will do tripple check anyway
-	 */
-	if (unlikely(t2 > last_read)) {
-		/* check if we have a normal rollover */
-		unsigned long raw_intr_status =
-			apbt_readl_reg(APBTMRS_RAW_INT_STATUS);
-		/*
-		 * cs timer interrupt is masked but raw intr bit is set if
-		 * rollover occurs. then we read EOI reg to clear it.
-		 */
-		if (raw_intr_status & (1 << phy_cs_timer_id)) {
-			apbt_readl(phy_cs_timer_id, APBTMR_N_EOI);
-			goto out;
-		}
-		pr_debug("APB CS going back %lx:%lx:%lx ",
-			 t2, last_read, t2 - last_read);
-bad_count_x3:
-		pr_debug("triple check enforced\n");
-		t0 = apbt_readl(phy_cs_timer_id,
-				APBTMR_N_CURRENT_VALUE);
-		udelay(1);
-		t1 = apbt_readl(phy_cs_timer_id,
-				APBTMR_N_CURRENT_VALUE);
-		udelay(1);
-		t2 = apbt_readl(phy_cs_timer_id,
-				APBTMR_N_CURRENT_VALUE);
-		if ((t2 > t1) || (t1 > t0)) {
-			printk(KERN_ERR "Error: APB CS tripple check failed\n");
-			goto bad_count_x3;
-		}
-	}
-out:
-	last_read = t2;
-	return (cycle_t)~t2;
+	current_count = apbt_readl(phy_cs_timer_id, APBTMR_N_CURRENT_VALUE);
+	return (cycle_t)~current_count;
}
static int apbt_clocksource_register(void)

From e7bcecb7b1d29b9ad5af939149a945658620ca8f Mon Sep 17 00:00:00 2001
From: Thomas Gleixner
Date: Wed, 16 Feb 2011 17:12:57 +0100
Subject: [PATCH 323/706] genirq: Make nr_irqs runtime expandable

We face more and more the requirement to expand nr_irqs at runtime. The reason is irq expanders which cannot be detected in the early boot stage. So we speculate nr_irqs to have enough room. Further, Xen needs extra irq numbers and we really want to avoid adding more "detection" code into the early boot. There is no real good reason why we need to limit nr_irqs at early boot.

Allow the allocation code to expand nr_irqs. We already have 8k of extra number space in the allocation bitmap, so let's use it.

Signed-off-by: Thomas Gleixner
---
 kernel/irq/irqdesc.c | 22 +++++++++++++++++++---
 1 file changed, 19 insertions(+), 3 deletions(-)

diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c
index a250d3a0af12..6f6644f819dd 100644
--- a/kernel/irq/irqdesc.c
+++ b/kernel/irq/irqdesc.c
@@ -206,6 +206,14 @@ struct irq_desc * __ref irq_to_desc_alloc_node(unsigned int irq, int node)
	return NULL;
}
+static int irq_expand_nr_irqs(unsigned int cnt)
+{
+	if (nr_irqs + cnt > IRQ_BITMAP_BITS)
+		return -ENOMEM;
+	nr_irqs += cnt;
+	return 0;
+}
+
int __init early_irq_init(void)
{
	int i, initcnt, node = first_online_node;
@@ -287,6 +295,12 @@ static inline int alloc_descs(unsigned int start, unsigned int cnt, int node)
{
	return start;
}
+
+static int irq_expand_nr_irqs(unsigned int cnt)
+{
+	return -ENOMEM;
+}
+
#endif /* !CONFIG_SPARSE_IRQ */
/* Dynamic interrupt handling */
@@ -335,9 +349,11 @@ irq_alloc_descs(int irq, unsigned int from, unsigned int cnt, int node)
	if (irq >=0 && start != irq)
		goto err;
-	ret = -ENOMEM;
-	if (start >= nr_irqs)
-		goto err;
+	if (start >= nr_irqs) {
+		ret = irq_expand_nr_irqs(cnt);
+		if (ret)
+			goto err;
+	}
	bitmap_set(allocated_irqs, start, cnt);
	mutex_unlock(&sparse_irq_lock);

From 43abe43ce0619d744c7a5bb15cce075e532b53b7 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner
Date: Sat, 12 Feb 2011 12:10:49 +0100
Subject: [PATCH 324/706] genirq: Add missing buslock to set_irq_type(), set_irq_wake()

Chips behind a slow bus cannot update the chip under desc->lock, but we miss the chip_bus_lock()/chip_bus_sync_unlock() calls around the set type and set wake functions.
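The required nesting, sketched with the set_irq_type() path from the hunk below: the bus lock is taken outside the raw desc->lock section, so that chip_bus_sync_unlock() can talk to the slow bus after the fast path has completed:

	chip_bus_lock(desc);
	raw_spin_lock_irqsave(&desc->lock, flags);
	ret = __irq_set_trigger(desc, irq, type);
	raw_spin_unlock_irqrestore(&desc->lock, flags);
	chip_bus_sync_unlock(desc);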
Signed-off-by: Thomas Gleixner --- kernel/irq/chip.c | 2 ++ kernel/irq/manage.c | 2 ++ 2 files changed, 4 insertions(+) diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index baa5c4acad83..9639ab8bece0 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -65,9 +65,11 @@ int set_irq_type(unsigned int irq, unsigned int type) if (type == IRQ_TYPE_NONE) return 0; + chip_bus_lock(desc); raw_spin_lock_irqsave(&desc->lock, flags); ret = __irq_set_trigger(desc, irq, type); raw_spin_unlock_irqrestore(&desc->lock, flags); + chip_bus_sync_unlock(desc); return ret; } EXPORT_SYMBOL(set_irq_type); diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index ba84307fbf24..a400db220cf3 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -454,6 +454,7 @@ int set_irq_wake(unsigned int irq, unsigned int on) /* wakeup-capable irqs can be shared between drivers that * don't need to have the same sleep mode behaviors. */ + chip_bus_lock(desc); raw_spin_lock_irqsave(&desc->lock, flags); if (on) { if (desc->wake_depth++ == 0) { @@ -476,6 +477,7 @@ int set_irq_wake(unsigned int irq, unsigned int on) } raw_spin_unlock_irqrestore(&desc->lock, flags); + chip_bus_sync_unlock(desc); return ret; } EXPORT_SYMBOL(set_irq_wake); From a0cd9ca2b907d7ee26575e7b63ac92dad768a75e Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 10 Feb 2011 11:36:33 +0100 Subject: [PATCH 325/706] genirq: Namespace cleanup The irq namespace has become quite convoluted. My bad. Clean it up and deprecate the old functions. All new functions follow the scheme: irq number based: irq_set/get/xxx/_xxx(unsigned int irq, ...) irq_data based: irq_data_set/get/xxx/_xxx(struct irq_data *d, ....) irq_desc based: irq_desc_get_xxx(struct irq_desc *desc) Signed-off-by: Thomas Gleixner --- include/linux/interrupt.h | 14 +++++-- include/linux/irq.h | 79 ++++++++++++++++++++++++++++++++------- include/linux/irqdesc.h | 44 ++++++++++++++++++++-- kernel/irq/chip.c | 28 +++++++------- kernel/irq/manage.c | 6 +-- 5 files changed, 133 insertions(+), 38 deletions(-) diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h index 56b7c97aaf0a..7834726dd95b 100644 --- a/include/linux/interrupt.h +++ b/include/linux/interrupt.h @@ -346,16 +346,24 @@ static inline void enable_irq_lockdep_irqrestore(unsigned int irq, unsigned long } /* IRQ wakeup (PM) control: */ -extern int set_irq_wake(unsigned int irq, unsigned int on); +extern int irq_set_irq_wake(unsigned int irq, unsigned int on); + +#ifndef CONFIG_GENERIC_HARDIRQS_NO_COMPAT +/* Please do not use: Use the replacement functions instead */ +static inline int set_irq_wake(unsigned int irq, unsigned int on) +{ + return irq_set_irq_wake(irq, on); +} +#endif static inline int enable_irq_wake(unsigned int irq) { - return set_irq_wake(irq, 1); + return irq_set_irq_wake(irq, 1); } static inline int disable_irq_wake(unsigned int irq) { - return set_irq_wake(irq, 0); + return irq_set_irq_wake(irq, 0); } #else /* !CONFIG_GENERIC_HARDIRQS */ diff --git a/include/linux/irq.h b/include/linux/irq.h index 80fcb53057bc..e9f847d56c4d 100644 --- a/include/linux/irq.h +++ b/include/linux/irq.h @@ -292,8 +292,7 @@ set_irq_handler(unsigned int irq, irq_flow_handler_t handle) * IRQ_NOREQUEST and IRQ_NOPROBE) */ static inline void -set_irq_chained_handler(unsigned int irq, - irq_flow_handler_t handle) +set_irq_chained_handler(unsigned int irq, irq_flow_handler_t handle) { __set_irq_handler(irq, handle, 1, NULL); } @@ -312,12 +311,12 @@ static inline void irq_clear_status_flags(unsigned int irq, unsigned long 
clr) irq_modify_status(irq, clr, 0); } -static inline void set_irq_noprobe(unsigned int irq) +static inline void irq_set_noprobe(unsigned int irq) { irq_modify_status(irq, 0, IRQ_NOPROBE); } -static inline void set_irq_probe(unsigned int irq) +static inline void irq_set_probe(unsigned int irq) { irq_modify_status(irq, IRQ_NOPROBE, 0); } @@ -338,14 +337,14 @@ static inline void dynamic_irq_init(unsigned int irq) } /* Set/get chip/data for an IRQ: */ -extern int set_irq_chip(unsigned int irq, struct irq_chip *chip); -extern int set_irq_data(unsigned int irq, void *data); -extern int set_irq_chip_data(unsigned int irq, void *data); -extern int set_irq_type(unsigned int irq, unsigned int type); -extern int set_irq_msi(unsigned int irq, struct msi_desc *entry); +extern int irq_set_chip(unsigned int irq, struct irq_chip *chip); +extern int irq_set_handler_data(unsigned int irq, void *data); +extern int irq_set_chip_data(unsigned int irq, void *data); +extern int irq_set_irq_type(unsigned int irq, unsigned int type); +extern int irq_set_msi_desc(unsigned int irq, struct msi_desc *entry); extern struct irq_data *irq_get_irq_data(unsigned int irq); -static inline struct irq_chip *get_irq_chip(unsigned int irq) +static inline struct irq_chip *irq_get_chip(unsigned int irq) { struct irq_data *d = irq_get_irq_data(irq); return d ? d->chip : NULL; @@ -356,7 +355,7 @@ static inline struct irq_chip *irq_data_get_irq_chip(struct irq_data *d) return d->chip; } -static inline void *get_irq_chip_data(unsigned int irq) +static inline void *irq_get_chip_data(unsigned int irq) { struct irq_data *d = irq_get_irq_data(irq); return d ? d->chip_data : NULL; @@ -367,18 +366,18 @@ static inline void *irq_data_get_irq_chip_data(struct irq_data *d) return d->chip_data; } -static inline void *get_irq_data(unsigned int irq) +static inline void *irq_get_handler_data(unsigned int irq) { struct irq_data *d = irq_get_irq_data(irq); return d ? d->handler_data : NULL; } -static inline void *irq_data_get_irq_data(struct irq_data *d) +static inline void *irq_data_get_irq_handler_data(struct irq_data *d) { return d->handler_data; } -static inline struct msi_desc *get_irq_msi(unsigned int irq) +static inline struct msi_desc *irq_get_msi_desc(unsigned int irq) { struct irq_data *d = irq_get_irq_data(irq); return d ? 
d->msi_desc : NULL; @@ -389,6 +388,58 @@ static inline struct msi_desc *irq_data_get_msi(struct irq_data *d) return d->msi_desc; } +#ifndef CONFIG_GENERIC_HARDIRQS_NO_COMPAT +/* Please do not use: Use the replacement functions instead */ +static inline int set_irq_chip(unsigned int irq, struct irq_chip *chip) +{ + return irq_set_chip(irq, chip); +} +static inline int set_irq_data(unsigned int irq, void *data) +{ + return irq_set_handler_data(irq, data); +} +static inline int set_irq_chip_data(unsigned int irq, void *data) +{ + return irq_set_chip_data(irq, data); +} +static inline int set_irq_type(unsigned int irq, unsigned int type) +{ + return irq_set_irq_type(irq, type); +} +static inline int set_irq_msi(unsigned int irq, struct msi_desc *entry) +{ + return irq_set_msi_desc(irq, entry); +} +static inline struct irq_chip *get_irq_chip(unsigned int irq) +{ + return irq_get_chip(irq); +} +static inline void *get_irq_chip_data(unsigned int irq) +{ + return irq_get_chip_data(irq); +} +static inline void *get_irq_data(unsigned int irq) +{ + return irq_get_handler_data(irq); +} +static inline void *irq_data_get_irq_data(struct irq_data *d) +{ + return irq_data_get_irq_handler_data(d); +} +static inline struct msi_desc *get_irq_msi(unsigned int irq) +{ + return irq_get_msi_desc(irq); +} +static inline void set_irq_noprobe(unsigned int irq) +{ + irq_set_noprobe(irq); +} +static inline void set_irq_probe(unsigned int irq) +{ + irq_set_probe(irq); +} +#endif + int irq_alloc_descs(int irq, unsigned int from, unsigned int cnt, int node); void irq_free_descs(unsigned int irq, unsigned int cnt); int irq_reserve_irqs(unsigned int from, unsigned int cnt); diff --git a/include/linux/irqdesc.h b/include/linux/irqdesc.h index bfef56dadddb..64794dec93b6 100644 --- a/include/linux/irqdesc.h +++ b/include/linux/irqdesc.h @@ -98,10 +98,46 @@ static inline struct irq_desc *move_irq_desc(struct irq_desc *desc, int node) #ifdef CONFIG_GENERIC_HARDIRQS -#define get_irq_desc_chip(desc) ((desc)->irq_data.chip) -#define get_irq_desc_chip_data(desc) ((desc)->irq_data.chip_data) -#define get_irq_desc_data(desc) ((desc)->irq_data.handler_data) -#define get_irq_desc_msi(desc) ((desc)->irq_data.msi_desc) +static inline struct irq_chip *irq_desc_get_chip(struct irq_desc *desc) +{ + return desc->irq_data.chip; +} + +static inline void *irq_desc_get_chip_data(struct irq_desc *desc) +{ + return desc->irq_data.chip_data; +} + +static inline void *irq_desc_get_handler_data(struct irq_desc *desc) +{ + return desc->irq_data.handler_data; +} + +static inline struct msi_desc *irq_desc_get_msi_desc(struct irq_desc *desc) +{ + return desc->irq_data.msi_desc; +} + +#ifndef CONFIG_GENERIC_HARDIRQS_NO_COMPAT +static inline struct irq_chip *get_irq_desc_chip(struct irq_desc *desc) +{ + return irq_desc_get_chip(desc); +} +static inline void *get_irq_desc_data(struct irq_desc *desc) +{ + return irq_desc_get_handler_data(desc); +} + +static inline void *get_irq_desc_chip_data(struct irq_desc *desc) +{ + return irq_desc_get_chip_data(desc); +} + +static inline struct msi_desc *get_irq_desc_msi(struct irq_desc *desc) +{ + return irq_desc_get_msi_desc(desc); +} +#endif /* * Architectures call this to let the generic IRQ layer diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index 9639ab8bece0..622b55ac0e09 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -19,11 +19,11 @@ #include "internals.h" /** - * set_irq_chip - set the irq chip for an irq + * irq_set_chip - set the irq chip for an irq * @irq: irq number * @chip: pointer to irq 
chip description structure */ -int set_irq_chip(unsigned int irq, struct irq_chip *chip) +int irq_set_chip(unsigned int irq, struct irq_chip *chip) { struct irq_desc *desc = irq_to_desc(irq); unsigned long flags; @@ -43,14 +43,14 @@ int set_irq_chip(unsigned int irq, struct irq_chip *chip) return 0; } -EXPORT_SYMBOL(set_irq_chip); +EXPORT_SYMBOL(irq_set_chip); /** - * set_irq_type - set the irq trigger type for an irq + * irq_set_type - set the irq trigger type for an irq * @irq: irq number * @type: IRQ_TYPE_{LEVEL,EDGE}_* value - see include/linux/irq.h */ -int set_irq_type(unsigned int irq, unsigned int type) +int irq_set_irq_type(unsigned int irq, unsigned int type) { struct irq_desc *desc = irq_to_desc(irq); unsigned long flags; @@ -72,16 +72,16 @@ int set_irq_type(unsigned int irq, unsigned int type) chip_bus_sync_unlock(desc); return ret; } -EXPORT_SYMBOL(set_irq_type); +EXPORT_SYMBOL(irq_set_irq_type); /** - * set_irq_data - set irq type data for an irq + * irq_set_handler_data - set irq handler data for an irq * @irq: Interrupt number * @data: Pointer to interrupt specific data * * Set the hardware irq controller data for an irq */ -int set_irq_data(unsigned int irq, void *data) +int irq_set_handler_data(unsigned int irq, void *data) { struct irq_desc *desc = irq_to_desc(irq); unsigned long flags; @@ -97,16 +97,16 @@ int set_irq_data(unsigned int irq, void *data) raw_spin_unlock_irqrestore(&desc->lock, flags); return 0; } -EXPORT_SYMBOL(set_irq_data); +EXPORT_SYMBOL(irq_set_handler_data); /** - * set_irq_msi - set MSI descriptor data for an irq + * irq_set_msi_desc - set MSI descriptor data for an irq * @irq: Interrupt number * @entry: Pointer to MSI descriptor data * * Set the MSI descriptor entry for an irq */ -int set_irq_msi(unsigned int irq, struct msi_desc *entry) +int irq_set_msi_desc(unsigned int irq, struct msi_desc *entry) { struct irq_desc *desc = irq_to_desc(irq); unsigned long flags; @@ -126,13 +126,13 @@ int set_irq_msi(unsigned int irq, struct msi_desc *entry) } /** - * set_irq_chip_data - set irq chip data for an irq + * irq_set_chip_data - set irq chip data for an irq * @irq: Interrupt number * @data: Pointer to chip specific data * * Set the hardware irq chip data for an irq */ -int set_irq_chip_data(unsigned int irq, void *data) +int irq_set_chip_data(unsigned int irq, void *data) { struct irq_desc *desc = irq_to_desc(irq); unsigned long flags; @@ -154,7 +154,7 @@ int set_irq_chip_data(unsigned int irq, void *data) return 0; } -EXPORT_SYMBOL(set_irq_chip_data); +EXPORT_SYMBOL(irq_set_chip_data); struct irq_data *irq_get_irq_data(unsigned int irq) { diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index a400db220cf3..b1b4da9446e6 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -434,7 +434,7 @@ static int set_irq_wake_real(unsigned int irq, unsigned int on) } /** - * set_irq_wake - control irq power management wakeup + * irq_set_irq_wake - control irq power management wakeup * @irq: interrupt to control * @on: enable/disable power management wakeup * @@ -445,7 +445,7 @@ static int set_irq_wake_real(unsigned int irq, unsigned int on) * Wakeup mode lets this IRQ wake the system from sleep * states like "suspend to RAM". 
 */
-int set_irq_wake(unsigned int irq, unsigned int on)
+int irq_set_irq_wake(unsigned int irq, unsigned int on)
{
	struct irq_desc *desc = irq_to_desc(irq);
	unsigned long flags;
@@ -480,7 +480,7 @@ int set_irq_wake(unsigned int irq, unsigned int on)
	chip_bus_sync_unlock(desc);
	return ret;
}
-EXPORT_SYMBOL(set_irq_wake);
+EXPORT_SYMBOL(irq_set_irq_wake);
/*
 * Internal function that tells the architecture code whether a

From 1fa46f1f070961783661ae640cd2f6b2557f3885 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner
Date: Mon, 7 Feb 2011 16:46:58 +0100
Subject: [PATCH 326/706] genirq: Simplify affinity related code

There is a lot of #ifdef CONFIG_GENERIC_PENDING_IRQ along with duplicated code in the irq core. Move the #ifdeffery into one place and clean up the code so it's readable. No functional change.

Signed-off-by: Thomas Gleixner
---
 kernel/irq/manage.c | 64 +++++++++++++++++++++++++++++----------------
 1 file changed, 41 insertions(+), 23 deletions(-)

diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index b1b4da9446e6..99f3e9a3780c 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -100,47 +100,70 @@ void irq_set_thread_affinity(struct irq_desc *desc)
	}
}
+#ifdef CONFIG_GENERIC_PENDING_IRQ
+static inline bool irq_can_move_pcntxt(struct irq_desc *desc)
+{
+	return desc->status & IRQ_MOVE_PCNTXT;
+}
+static inline bool irq_move_pending(struct irq_desc *desc)
+{
+	return desc->status & IRQ_MOVE_PENDING;
+}
+static inline void
+irq_copy_pending(struct irq_desc *desc, const struct cpumask *mask)
+{
+	cpumask_copy(desc->pending_mask, mask);
+}
+static inline void
+irq_get_pending(struct cpumask *mask, struct irq_desc *desc)
+{
+	cpumask_copy(mask, desc->pending_mask);
+}
+#else
+static inline bool irq_can_move_pcntxt(struct irq_desc *desc) { return true; }
+static inline bool irq_move_pending(struct irq_desc *desc) { return false; }
+static inline void
+irq_copy_pending(struct irq_desc *desc, const struct cpumask *mask) { }
+static inline void
+irq_get_pending(struct cpumask *mask, struct irq_desc *desc) { }
+#endif
+
/**
 *	irq_set_affinity - Set the irq affinity of a given irq
 *	@irq:		Interrupt to set affinity
 *	@cpumask:	cpumask
 *
 */
-int irq_set_affinity(unsigned int irq, const struct cpumask *cpumask)
+int irq_set_affinity(unsigned int irq, const struct cpumask *mask)
{
	struct irq_desc *desc = irq_to_desc(irq);
	struct irq_chip *chip = desc->irq_data.chip;
	unsigned long flags;
+	int ret = 0;
	if (!chip->irq_set_affinity)
		return -EINVAL;
	raw_spin_lock_irqsave(&desc->lock, flags);
-#ifdef CONFIG_GENERIC_PENDING_IRQ
-	if (desc->status & IRQ_MOVE_PCNTXT) {
-		if (!chip->irq_set_affinity(&desc->irq_data, cpumask, false)) {
-			cpumask_copy(desc->irq_data.affinity, cpumask);
+	if (irq_can_move_pcntxt(desc)) {
+		ret = chip->irq_set_affinity(&desc->irq_data, mask, false);
+		if (!ret) {
+			cpumask_copy(desc->irq_data.affinity, mask);
			irq_set_thread_affinity(desc);
		}
-	}
-	else {
+	} else {
		desc->status |= IRQ_MOVE_PENDING;
-		cpumask_copy(desc->pending_mask, cpumask);
+		irq_copy_pending(desc, mask);
	}
-#else
-	if (!chip->irq_set_affinity(&desc->irq_data, cpumask, false)) {
-		cpumask_copy(desc->irq_data.affinity, cpumask);
-		irq_set_thread_affinity(desc);
-	}
-#endif
+
	if (desc->affinity_notify) {
		kref_get(&desc->affinity_notify->kref);
		schedule_work(&desc->affinity_notify->work);
	}
	desc->status |= IRQ_AFFINITY_SET;
	raw_spin_unlock_irqrestore(&desc->lock, flags);
-	return 0;
+	return ret;
}
int irq_set_affinity_hint(unsigned int irq, const struct cpumask *m)
@@ -167,18 +190,13 @@ static void
irq_affinity_notify(struct work_struct *work)
	cpumask_var_t cpumask;
	unsigned long flags;
-	if (!desc)
-		goto out;
-
-	if (!alloc_cpumask_var(&cpumask, GFP_KERNEL))
+	if (!desc || !alloc_cpumask_var(&cpumask, GFP_KERNEL))
		goto out;
	raw_spin_lock_irqsave(&desc->lock, flags);
-#ifdef CONFIG_GENERIC_PENDING_IRQ
-	if (desc->status & IRQ_MOVE_PENDING)
-		cpumask_copy(cpumask, desc->pending_mask);
+	if (irq_move_pending(desc))
+		irq_get_pending(cpumask, desc);
	else
-#endif
		cpumask_copy(cpumask, desc->irq_data.affinity);
	raw_spin_unlock_irqrestore(&desc->lock, flags);

From b008207cbd0d5ce606a1a2ac52826e0ab37d0b99 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner
Date: Mon, 7 Feb 2011 17:30:50 +0100
Subject: [PATCH 327/706] genirq: Remove redundant check

IRQ_NO_BALANCING is already checked in irq_can_set_affinity() above, no need to check it again.

Signed-off-by: Thomas Gleixner
---
 kernel/irq/manage.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index 99f3e9a3780c..591c927b135c 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -256,6 +256,7 @@ EXPORT_SYMBOL_GPL(irq_set_affinity_notifier);
 */
static int setup_affinity(unsigned int irq, struct irq_desc *desc)
{
+	/* Excludes PER_CPU and NO_BALANCE interrupts */
	if (!irq_can_set_affinity(irq))
		return 0;
@@ -263,7 +264,7 @@ static int setup_affinity(unsigned int irq, struct irq_desc *desc)
	 * Preserve an userspace affinity setup, but make sure that
	 * one of the targets is online.
	 */
-	if (desc->status & (IRQ_AFFINITY_SET | IRQ_NO_BALANCING)) {
+	if (desc->status & (IRQ_AFFINITY_SET)) {
		if (cpumask_any_and(desc->irq_data.affinity, cpu_online_mask)
		    < nr_cpu_ids)
			goto set_affinity;

From 569bda8df11effa03e618729293c7961696abb10 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner
Date: Mon, 7 Feb 2011 17:05:08 +0100
Subject: [PATCH 328/706] genirq: Always apply cpu online mask

If the affinity had been set by the user, then a later request_irq() will honour that setting. But online cpus can have changed. So apply the online mask for this case as well.

Signed-off-by: Thomas Gleixner
---
 kernel/irq/manage.c | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index 591c927b135c..ade65bfb466d 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -256,6 +256,8 @@ EXPORT_SYMBOL_GPL(irq_set_affinity_notifier);
 */
static int setup_affinity(unsigned int irq, struct irq_desc *desc)
{
+	struct cpumask *set = irq_default_affinity;
+
	/* Excludes PER_CPU and NO_BALANCE interrupts */
	if (!irq_can_set_affinity(irq))
		return 0;
@@ -265,15 +267,13 @@ static int setup_affinity(unsigned int irq, struct irq_desc *desc)
	 * Preserve an userspace affinity setup, but make sure that
	 * one of the targets is online.
	 */
	if (desc->status & (IRQ_AFFINITY_SET)) {
-		if (cpumask_any_and(desc->irq_data.affinity, cpu_online_mask)
-		    < nr_cpu_ids)
-			goto set_affinity;
+		if (cpumask_intersects(desc->irq_data.affinity,
+				       cpu_online_mask))
+			set = desc->irq_data.affinity;
		else
			desc->status &= ~IRQ_AFFINITY_SET;
	}
-
-	cpumask_and(desc->irq_data.affinity, cpu_online_mask, irq_default_affinity);
-set_affinity:
+	cpumask_and(desc->irq_data.affinity, cpu_online_mask, set);
	desc->irq_data.chip->irq_set_affinity(&desc->irq_data, desc->irq_data.affinity, false);
	return 0;

From 3b8249e759c701c4a82f99d957be651a7657bf6f Mon Sep 17 00:00:00 2001
From: Thomas Gleixner
Date: Mon, 7 Feb 2011 16:02:20 +0100
Subject: [PATCH 329/706] genirq: Do not copy affinity before set

While rummaging through arch code I found that there are a few workarounds which deal with the fact that the initial affinity setting from request_irq() copies the mask into irq_data->affinity before the chip code is called. In the normal path we unconditionally copy the mask when the chip code returns 0.

Copy after the code is called and add a return code IRQ_SET_MASK_OK_NOCOPY for the chip functions, which prevents the copy. That way we see the real mask when the chip function decided to truncate it further as some arches do. IRQ_SET_MASK_OK is 0, which is the current behaviour.

Signed-off-by: Thomas Gleixner
---
 include/linux/irq.h | 11 ++++++++++
 kernel/irq/internals.h | 2 +-
 kernel/irq/manage.c | 47 ++++++++++++++++++++++++++++++------------
 kernel/irq/proc.c | 2 +-
 4 files changed, 47 insertions(+), 15 deletions(-)

diff --git a/include/linux/irq.h b/include/linux/irq.h
index e9f847d56c4d..f5e900309d21 100644
--- a/include/linux/irq.h
+++ b/include/linux/irq.h
@@ -85,6 +85,17 @@ typedef void (*irq_flow_handler_t)(unsigned int irq,
# define IRQ_NO_BALANCING_MASK	IRQ_NO_BALANCING
#endif
+/*
+ * Return value for chip->irq_set_affinity()
+ *
+ * IRQ_SET_MASK_OK	- OK, core updates irq_data.affinity
+ * IRQ_SET_MASK_NOCPY	- OK, chip did update irq_data.affinity
+ */
+enum {
+	IRQ_SET_MASK_OK = 0,
+	IRQ_SET_MASK_OK_NOCOPY,
+};
+
struct msi_desc;
/**
diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h
index 99c3bc8a6fb4..b5bfa24aa6a6 100644
--- a/kernel/irq/internals.h
+++ b/kernel/irq/internals.h
@@ -43,7 +43,7 @@ static inline void
unregister_handler_proc(unsigned int irq, struct irqaction *action) { }
#endif
-extern int irq_select_affinity_usr(unsigned int irq);
+extern int irq_select_affinity_usr(unsigned int irq, struct cpumask *mask);
extern void irq_set_thread_affinity(struct irq_desc *desc);
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index ade65bfb466d..dc95d53df510 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -148,9 +148,12 @@ int irq_set_affinity(unsigned int irq, const struct cpumask *mask)
	if (irq_can_move_pcntxt(desc)) {
		ret = chip->irq_set_affinity(&desc->irq_data, mask, false);
-		if (!ret) {
+		switch (ret) {
+		case IRQ_SET_MASK_OK:
			cpumask_copy(desc->irq_data.affinity, mask);
+		case IRQ_SET_MASK_OK_NOCOPY:
			irq_set_thread_affinity(desc);
+			ret = 0;
		}
	} else {
		desc->status |= IRQ_MOVE_PENDING;
@@ -254,9 +257,12 @@ EXPORT_SYMBOL_GPL(irq_set_affinity_notifier);
/*
 * Generic version of the affinity autoselector.
*/ -static int setup_affinity(unsigned int irq, struct irq_desc *desc) +static int +setup_affinity(unsigned int irq, struct irq_desc *desc, struct cpumask *mask) { + struct irq_chip *chip = get_irq_desc_chip(desc); struct cpumask *set = irq_default_affinity; + int ret; /* Excludes PER_CPU and NO_BALANCE interrupts */ if (!irq_can_set_affinity(irq)) @@ -273,13 +279,20 @@ static int setup_affinity(unsigned int irq, struct irq_desc *desc) else desc->status &= ~IRQ_AFFINITY_SET; } - cpumask_and(desc->irq_data.affinity, cpu_online_mask, set); - desc->irq_data.chip->irq_set_affinity(&desc->irq_data, desc->irq_data.affinity, false); + cpumask_and(mask, cpu_online_mask, set); + ret = chip->irq_set_affinity(&desc->irq_data, mask, false); + switch (ret) { + case IRQ_SET_MASK_OK: + cpumask_copy(desc->irq_data.affinity, mask); + case IRQ_SET_MASK_OK_NOCOPY: + irq_set_thread_affinity(desc); + } return 0; } #else -static inline int setup_affinity(unsigned int irq, struct irq_desc *d) +static inline int +setup_affinity(unsigned int irq, struct irq_desc *d, struct cpumask *mask) { return irq_select_affinity(irq); } @@ -288,23 +301,23 @@ static inline int setup_affinity(unsigned int irq, struct irq_desc *d) /* * Called when affinity is set via /proc/irq */ -int irq_select_affinity_usr(unsigned int irq) +int irq_select_affinity_usr(unsigned int irq, struct cpumask *mask) { struct irq_desc *desc = irq_to_desc(irq); unsigned long flags; int ret; raw_spin_lock_irqsave(&desc->lock, flags); - ret = setup_affinity(irq, desc); + ret = setup_affinity(irq, desc, mask); if (!ret) irq_set_thread_affinity(desc); raw_spin_unlock_irqrestore(&desc->lock, flags); - return ret; } #else -static inline int setup_affinity(unsigned int irq, struct irq_desc *desc) +static inline int +setup_affinity(unsigned int irq, struct irq_desc *desc, struct cpumask *mask) { return 0; } @@ -765,8 +778,8 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) struct irqaction *old, **old_ptr; const char *old_name = NULL; unsigned long flags; - int nested, shared = 0; - int ret; + int ret, nested, shared = 0; + cpumask_var_t mask; if (!desc) return -EINVAL; @@ -831,6 +844,11 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) new->thread = t; } + if (!alloc_cpumask_var(&mask, GFP_KERNEL)) { + ret = -ENOMEM; + goto out_thread; + } + /* * The following block of code has to be executed atomically */ @@ -876,7 +894,7 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) new->flags & IRQF_TRIGGER_MASK); if (ret) - goto out_thread; + goto out_mask; } else compat_irq_chip_set_default_handler(desc); #if defined(CONFIG_IRQ_PER_CPU) @@ -903,7 +921,7 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) desc->status |= IRQ_NO_BALANCING; /* Set default affinity mask once everything is setup */ - setup_affinity(irq, desc); + setup_affinity(irq, desc, mask); } else if ((new->flags & IRQF_TRIGGER_MASK) && (new->flags & IRQF_TRIGGER_MASK) @@ -956,6 +974,9 @@ mismatch: #endif ret = -EBUSY; +out_mask: + free_cpumask_var(mask); + out_thread: raw_spin_unlock_irqrestore(&desc->lock, flags); if (new->thread) { diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c index 6c8a2a9f8a7b..a46bd762db47 100644 --- a/kernel/irq/proc.c +++ b/kernel/irq/proc.c @@ -89,7 +89,7 @@ static ssize_t irq_affinity_proc_write(struct file *file, if (!cpumask_intersects(new_value, cpu_online_mask)) { /* Special case for empty set - allow the architecture code to set default SMP affinity. 
*/ - err = irq_select_affinity_usr(irq) ? -EINVAL : count; + err = irq_select_affinity_usr(irq, new_value) ? -EINVAL : count; } else { irq_set_affinity(irq, new_value); err = count; From 2b879eaf095878430c38cbd95e5c0fc4ce65ad8e Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 14 Feb 2011 11:25:02 +0100 Subject: [PATCH 330/706] genirq: Remove redundant thread affinity setting Thread affinity is already set by setup_affinity(). Signed-off-by: Thomas Gleixner --- kernel/irq/manage.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index dc95d53df510..33a6ee0ac68f 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -309,8 +309,6 @@ int irq_select_affinity_usr(unsigned int irq, struct cpumask *mask) raw_spin_lock_irqsave(&desc->lock, flags); ret = setup_affinity(irq, desc, mask); - if (!ret) - irq_set_thread_affinity(desc); raw_spin_unlock_irqrestore(&desc->lock, flags); return ret; } From 1082687e8d6292a61759eb83358e7db39fed1bf4 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 7 Feb 2011 09:05:05 +0100 Subject: [PATCH 331/706] genirq: Plug race in report_bad_irq() We cannot walk the action chain unlocked. Even if IRQ_INPROGRESS is set an action can be removed and we follow a null pointer. It's safe to take the lock there, because the code which removes the action will call synchronize_irq() which waits unlocked for IRQ_INPROGRESS going away. Signed-off-by: Thomas Gleixner --- kernel/irq/spurious.c | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/kernel/irq/spurious.c b/kernel/irq/spurious.c index 3089d3b9d5f3..2fbfda2716e1 100644 --- a/kernel/irq/spurious.c +++ b/kernel/irq/spurious.c @@ -139,15 +139,13 @@ static void poll_spurious_irqs(unsigned long dummy) * * (The other 100-of-100,000 interrupts may have been a correctly * functioning device sharing an IRQ with the failing one) - * - * Called under desc->lock */ - static void __report_bad_irq(unsigned int irq, struct irq_desc *desc, irqreturn_t action_ret) { struct irqaction *action; + unsigned long flags; if (action_ret != IRQ_HANDLED && action_ret != IRQ_NONE) { printk(KERN_ERR "irq event %d: bogus return value %x\n", @@ -159,6 +157,13 @@ __report_bad_irq(unsigned int irq, struct irq_desc *desc, dump_stack(); printk(KERN_ERR "handlers:\n"); + /* + * We need to take desc->lock here. note_interrupt() is called + * w/o desc->lock held, but IRQ_PROGRESS set. We might race + * with something else removing an action. It's ok to take + * desc->lock here. See synchronize_irq(). + */ + raw_spin_lock_irqsave(&desc->lock, flags); action = desc->action; while (action) { printk(KERN_ERR "[<%p>]", action->handler); @@ -167,6 +172,7 @@ __report_bad_irq(unsigned int irq, struct irq_desc *desc, printk("\n"); action = action->next; } + raw_spin_unlock_irqrestore(&desc->lock, flags); } static void From b738a50a202639614c98b5763b01bf9201779e50 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 2 Feb 2011 23:58:19 +0100 Subject: [PATCH 332/706] genirq: Warn when handler enables interrupts We run all handlers with interrupts disabled and expect them not to enable them. Warn when we catch one who does. 
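The check itself is cheap: a one-shot warning right after the primary handler returns, followed by a fixup so the rest of the flow still runs with interrupts off. Condensed from the handle_IRQ_event() hunk below:

	ret = action->handler(irq, action->dev_id);
	trace_irq_handler_exit(irq, action, ret);

	/* Handler returned with interrupts enabled: warn once, then fix up. */
	if (WARN_ON_ONCE(!irqs_disabled()))
		local_irq_disable();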
Signed-off-by: Thomas Gleixner --- kernel/irq/handle.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c index 3540a7190122..cdd6fbbe771c 100644 --- a/kernel/irq/handle.c +++ b/kernel/irq/handle.c @@ -68,6 +68,9 @@ irqreturn_t handle_IRQ_event(unsigned int irq, struct irqaction *action) ret = action->handler(irq, action->dev_id); trace_irq_handler_exit(irq, action, ret); + if (WARN_ON_ONCE(!irqs_disabled())) + local_irq_disable(); + switch (ret) { case IRQ_WAKE_THREAD: /* @@ -114,7 +117,6 @@ irqreturn_t handle_IRQ_event(unsigned int irq, struct irqaction *action) if (status & IRQF_SAMPLE_RANDOM) add_interrupt_randomness(irq); - local_irq_disable(); return retval; } From fa27271bc8d230355c1f24ddea103824fdc12de6 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 7 Feb 2011 09:10:39 +0100 Subject: [PATCH 333/706] genirq: Fixup poll handling try_one_irq() contains redundant code and lots of useless checks for shared interrupts. Check for shared before setting IRQ_INPROGRESS and then call handle_IRQ_event() while pending. Shorter version with the same functionality. Signed-off-by: Thomas Gleixner --- kernel/irq/spurious.c | 52 +++++++++++++++++-------------------------- 1 file changed, 20 insertions(+), 32 deletions(-) diff --git a/kernel/irq/spurious.c b/kernel/irq/spurious.c index 2fbfda2716e1..0af9e59c82eb 100644 --- a/kernel/irq/spurious.c +++ b/kernel/irq/spurious.c @@ -42,48 +42,36 @@ static int try_one_irq(int irq, struct irq_desc *desc) raw_spin_unlock(&desc->lock); return ok; } + /* + * All handlers must agree on IRQF_SHARED, so we test just the + * first. Check for action->next as well. + */ + action = desc->action; + if (!action || !(action->flags & IRQF_SHARED) || !action->next) + goto out; + /* Honour the normal IRQ locking */ desc->status |= IRQ_INPROGRESS; - action = desc->action; - raw_spin_unlock(&desc->lock); - - while (action) { - /* Only shared IRQ handlers are safe to call */ - if (action->flags & IRQF_SHARED) { - if (action->handler(irq, action->dev_id) == - IRQ_HANDLED) - ok = 1; - } - action = action->next; - } - local_irq_disable(); - /* Now clean up the flags */ - raw_spin_lock(&desc->lock); - action = desc->action; - - /* - * While we were looking for a fixup someone queued a real - * IRQ clashing with our walk: - */ - while ((desc->status & IRQ_PENDING) && action) { - /* - * Perform real IRQ processing for the IRQ we deferred - */ - work = 1; - raw_spin_unlock(&desc->lock); - handle_IRQ_event(irq, action); - raw_spin_lock(&desc->lock); + do { + work++; desc->status &= ~IRQ_PENDING; - } + raw_spin_unlock(&desc->lock); + if (handle_IRQ_event(irq, action) != IRQ_NONE) + ok = 1; + raw_spin_lock(&desc->lock); + action = desc->action; + } while ((desc->status & IRQ_PENDING) && action); + desc->status &= ~IRQ_INPROGRESS; /* * If we did actual work for the real IRQ line we must let the * IRQ controller clean up too */ - if (work) + if (work > 1) irq_end(irq, desc); - raw_spin_unlock(&desc->lock); +out: + raw_spin_unlock(&desc->lock); return ok; } From c7259cd7af757ddcd65701c37099dcddae2054f0 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 7 Feb 2011 09:52:27 +0100 Subject: [PATCH 334/706] genirq: Do not poll disabled, percpu and timer interrupts There is no point in polling disabled lines. percpu does not make sense at all because we only poll on the cpu we're currently running on. Also polling per_cpu interrupts is racy as hell. The handler runs without locking so we might get a huge surprise. 
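The filter added at the top of try_one_irq() therefore rejects these classes outright; condensed from the hunk below (the timer case is handled via the __IRQF_TIMER test in the action check):

	/* PER_CPU and nested thread interrupts are never polled */
	if (desc->status & (IRQ_PER_CPU | IRQ_NESTED_THREAD))
		goto out;

	/* Do not poll disabled lines unless the poller explicitly asks */
	if ((desc->status & IRQ_DISABLED) && !force)
		goto out;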
If the timer interrupt needs polling, then we won't get there anyway.

Signed-off-by: Thomas Gleixner
---
 kernel/irq/spurious.c | 40 ++++++++++++++++++++++++++--------------
 1 file changed, 26 insertions(+), 14 deletions(-)

diff --git a/kernel/irq/spurious.c b/kernel/irq/spurious.c
index 0af9e59c82eb..bd0e42d3e0ba 100644
--- a/kernel/irq/spurious.c
+++ b/kernel/irq/spurious.c
@@ -25,30 +25,42 @@ static DEFINE_TIMER(poll_spurious_irq_timer, poll_spurious_irqs, 0, 0);
/*
 * Recovery handler for misrouted interrupts.
 */
-static int try_one_irq(int irq, struct irq_desc *desc)
+static int try_one_irq(int irq, struct irq_desc *desc, bool force)
{
	struct irqaction *action;
	int ok = 0, work = 0;
	raw_spin_lock(&desc->lock);
+
+	/* PER_CPU and nested thread interrupts are never polled */
+	if (desc->status & (IRQ_PER_CPU | IRQ_NESTED_THREAD))
+		goto out;
+
+	/*
+	 * Do not poll disabled interrupts unless the spurious
+	 * disabled poller asks explicitely.
+	 */
+	if ((desc->status & IRQ_DISABLED) && !force)
+		goto out;
+
+	/*
+	 * All handlers must agree on IRQF_SHARED, so we test just the
+	 * first. Check for action->next as well.
+	 */
+	action = desc->action;
+	if (!action || !(action->flags & IRQF_SHARED) ||
+	    (action->flags & __IRQF_TIMER) || !action->next)
+		goto out;
+
	/* Already running on another processor */
	if (desc->status & IRQ_INPROGRESS) {
		/*
		 * Already running: If it is shared get the other
		 * CPU to go looking for our mystery interrupt too
		 */
-		if (desc->action && (desc->action->flags & IRQF_SHARED))
-			desc->status |= IRQ_PENDING;
-		raw_spin_unlock(&desc->lock);
-		return ok;
-	}
-	/*
-	 * All handlers must agree on IRQF_SHARED, so we test just the
-	 * first. Check for action->next as well.
-	 */
-	action = desc->action;
-	if (!action || !(action->flags & IRQF_SHARED) || !action->next)
+		desc->status |= IRQ_PENDING;
		goto out;
+	}
	/* Honour the normal IRQ locking */
	desc->status |= IRQ_INPROGRESS;
@@ -87,7 +99,7 @@ static int misrouted_irq(int irq)
		if (i == irq)	/* Already tried */
			continue;
-		if (try_one_irq(i, desc))
+		if (try_one_irq(i, desc, false))
			ok = 1;
	}
	/* So the caller can adjust the irq error counts */
	return ok;
}
@@ -112,7 +124,7 @@ static void poll_spurious_irqs(unsigned long dummy)
			continue;
		local_irq_disable();
-		try_one_irq(i, desc);
+		try_one_irq(i, desc, true);
		local_irq_enable();
	}

From d05c65fff0ef672be75429266751f0e015b54d94 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner
Date: Mon, 7 Feb 2011 14:31:37 +0100
Subject: [PATCH 335/706] genirq: spurious: Run only one poller at a time

No point in running concurrent pollers which confuse each other by setting PENDING.

Signed-off-by: Thomas Gleixner
---
 kernel/irq/spurious.c | 16 +++++++++++++++-
 1 file changed, 15 insertions(+), 1 deletion(-)

diff --git a/kernel/irq/spurious.c b/kernel/irq/spurious.c
index bd0e42d3e0ba..56ff8fffb8b0 100644
--- a/kernel/irq/spurious.c
+++ b/kernel/irq/spurious.c
@@ -21,6 +21,8 @@ static int irqfixup __read_mostly;
#define POLL_SPURIOUS_IRQ_INTERVAL (HZ/10)
static void poll_spurious_irqs(unsigned long dummy);
static DEFINE_TIMER(poll_spurious_irq_timer, poll_spurious_irqs, 0, 0);
+static int irq_poll_cpu;
+static atomic_t irq_poll_active;
/*
 * Recovery handler for misrouted interrupts.
@@ -92,6 +94,11 @@ static int misrouted_irq(int irq)
	struct irq_desc *desc;
	int i, ok = 0;
+	if (atomic_inc_return(&irq_poll_active) != 1)
+		goto out;
+
+	irq_poll_cpu = smp_processor_id();
+
	for_each_irq_desc(i, desc) {
		if (!i)
			 continue;
@@ -102,6 +109,8 @@ static int misrouted_irq(int irq)
		if (try_one_irq(i, desc, false))
			ok = 1;
	}
+out:
+	atomic_dec(&irq_poll_active);
	/* So the caller can adjust the irq error counts */
	return ok;
}
@@ -111,6 +120,10 @@ static void poll_spurious_irqs(unsigned long dummy)
	struct irq_desc *desc;
	int i;
+	if (atomic_inc_return(&irq_poll_active) != 1)
+		goto out;
+	irq_poll_cpu = smp_processor_id();
+
	for_each_irq_desc(i, desc) {
		unsigned int status;
@@ -127,7 +140,8 @@ static void poll_spurious_irqs(unsigned long dummy)
		try_one_irq(i, desc, true);
		local_irq_enable();
	}
-
+out:
+	atomic_dec(&irq_poll_active);
	mod_timer(&poll_spurious_irq_timer,
		  jiffies + POLL_SPURIOUS_IRQ_INTERVAL);
}

From fe200ae48ef5c79bf7941fe8046ff9505c570ff6 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner
Date: Mon, 7 Feb 2011 10:34:30 +0100
Subject: [PATCH 336/706] genirq: Mark polled irqs and defer the real handler

With the chip.end() function gone we might run into a situation where a poll call runs and the real interrupt comes in, sees IRQ_INPROGRESS and disables the line. That might be a perfectly working one, which will then be masked forever.

So mark them polled while the poll runs. When the real handler sees IRQ_INPROGRESS it checks the poll flag and waits for the polling to complete. Add the necessary amount of sanity checks to it to avoid deadlocks.

Signed-off-by: Thomas Gleixner
---
 include/linux/irq.h | 1 +
 kernel/irq/chip.c | 26 ++++++++++++++++-----
 kernel/irq/internals.h | 11 +--------
 kernel/irq/spurious.c | 51 ++++++++++++++++++++++++++++++----------
 4 files changed, 61 insertions(+), 28 deletions(-)

diff --git a/include/linux/irq.h b/include/linux/irq.h
index f5e900309d21..e32b64ccdc89 100644
--- a/include/linux/irq.h
+++ b/include/linux/irq.h
@@ -71,6 +71,7 @@ typedef	void (*irq_flow_handler_t)(unsigned int irq,
#define IRQ_SUSPENDED		0x04000000	/* IRQ has gone through suspend sequence */
#define IRQ_ONESHOT		0x08000000	/* IRQ is not unmasked after hardirq */
#define IRQ_NESTED_THREAD	0x10000000	/* IRQ is nested into another, no own handler thread */
+#define IRQ_POLL_INPROGRESS	0x20000000	/* IRQ poll is in progress */
#define IRQF_MODIFY_MASK	\
	(IRQ_TYPE_SENSE_MASK | IRQ_NOPROBE | IRQ_NOREQUEST | \
diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c
index 622b55ac0e09..31258782742c 100644
--- a/kernel/irq/chip.c
+++ b/kernel/irq/chip.c
@@ -448,6 +448,13 @@ out_unlock:
}
EXPORT_SYMBOL_GPL(handle_nested_irq);
+static bool irq_check_poll(struct irq_desc *desc)
+{
+	if (!(desc->status & IRQ_POLL_INPROGRESS))
+		return false;
+	return irq_wait_for_poll(desc);
+}
+
/**
 *	handle_simple_irq - Simple and software-decoded IRQs.
* @irq: the interrupt number @@ -469,7 +476,9 @@ handle_simple_irq(unsigned int irq, struct irq_desc *desc) raw_spin_lock(&desc->lock); if (unlikely(desc->status & IRQ_INPROGRESS)) - goto out_unlock; + if (!irq_check_poll(desc)) + goto out_unlock; + desc->status &= ~(IRQ_REPLAY | IRQ_WAITING); kstat_incr_irqs_this_cpu(irq, desc); @@ -510,7 +519,9 @@ handle_level_irq(unsigned int irq, struct irq_desc *desc) mask_ack_irq(desc); if (unlikely(desc->status & IRQ_INPROGRESS)) - goto out_unlock; + if (!irq_check_poll(desc)) + goto out_unlock; + desc->status &= ~(IRQ_REPLAY | IRQ_WAITING); kstat_incr_irqs_this_cpu(irq, desc); @@ -558,7 +569,8 @@ handle_fasteoi_irq(unsigned int irq, struct irq_desc *desc) raw_spin_lock(&desc->lock); if (unlikely(desc->status & IRQ_INPROGRESS)) - goto out; + if (!irq_check_poll(desc)) + goto out; desc->status &= ~(IRQ_REPLAY | IRQ_WAITING); kstat_incr_irqs_this_cpu(irq, desc); @@ -620,9 +632,11 @@ handle_edge_irq(unsigned int irq, struct irq_desc *desc) */ if (unlikely((desc->status & (IRQ_INPROGRESS | IRQ_DISABLED)) || !desc->action)) { - desc->status |= (IRQ_PENDING | IRQ_MASKED); - mask_ack_irq(desc); - goto out_unlock; + if (!irq_check_poll(desc)) { + desc->status |= (IRQ_PENDING | IRQ_MASKED); + mask_ack_irq(desc); + goto out_unlock; + } } kstat_incr_irqs_this_cpu(irq, desc); diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h index b5bfa24aa6a6..0eff7e92b1a9 100644 --- a/kernel/irq/internals.h +++ b/kernel/irq/internals.h @@ -28,6 +28,7 @@ extern void init_kstat_irqs(struct irq_desc *desc, int node, int nr); /* Resending of interrupts :*/ void check_irq_resend(struct irq_desc *desc, unsigned int irq); +bool irq_wait_for_poll(struct irq_desc *desc); #ifdef CONFIG_PROC_FS extern void register_irq_proc(unsigned int irq, struct irq_desc *desc); @@ -47,16 +48,6 @@ extern int irq_select_affinity_usr(unsigned int irq, struct cpumask *mask); extern void irq_set_thread_affinity(struct irq_desc *desc); -#ifndef CONFIG_GENERIC_HARDIRQS_NO_DEPRECATED -static inline void irq_end(unsigned int irq, struct irq_desc *desc) -{ - if (desc->irq_data.chip && desc->irq_data.chip->end) - desc->irq_data.chip->end(irq); -} -#else -static inline void irq_end(unsigned int irq, struct irq_desc *desc) { } -#endif - /* Inline functions for support of irq chips on slow busses */ static inline void chip_bus_lock(struct irq_desc *desc) { diff --git a/kernel/irq/spurious.c b/kernel/irq/spurious.c index 56ff8fffb8b0..f749d29bfd81 100644 --- a/kernel/irq/spurious.c +++ b/kernel/irq/spurious.c @@ -24,13 +24,45 @@ static DEFINE_TIMER(poll_spurious_irq_timer, poll_spurious_irqs, 0, 0); static int irq_poll_cpu; static atomic_t irq_poll_active; +/* + * We wait here for a poller to finish. + * + * If the poll runs on this CPU, then we yell loudly and return + * false. That will leave the interrupt line disabled in the worst + * case, but it should never happen. + * + * We wait until the poller is done and then recheck disabled and + * action (about to be disabled). Only if it's still active, we return + * true and let the handler run. 
+ */
+bool irq_wait_for_poll(struct irq_desc *desc)
+{
+	if (WARN_ONCE(irq_poll_cpu == smp_processor_id(),
+		      "irq poll in progress on cpu %d for irq %d\n",
+		      smp_processor_id(), desc->irq_data.irq))
+		return false;
+
+#ifdef CONFIG_SMP
+	do {
+		raw_spin_unlock(&desc->lock);
+		while (desc->status & IRQ_INPROGRESS)
+			cpu_relax();
+		raw_spin_lock(&desc->lock);
+	} while (desc->status & IRQ_INPROGRESS);
+	/* Might have been disabled in meantime */
+	return !(desc->status & IRQ_DISABLED) && desc->action;
+#else
+	return false;
+#endif
+}
+
/*
 * Recovery handler for misrouted interrupts.
 */
static int try_one_irq(int irq, struct irq_desc *desc, bool force)
{
	struct irqaction *action;
-	int ok = 0, work = 0;
+	int ok = 0;
	raw_spin_lock(&desc->lock);
@@ -64,10 +96,9 @@ static int try_one_irq(int irq, struct irq_desc *desc, bool force)
		goto out;
	}
-	/* Honour the normal IRQ locking */
-	desc->status |= IRQ_INPROGRESS;
+	/* Honour the normal IRQ locking and mark it poll in progress */
+	desc->status |= IRQ_INPROGRESS | IRQ_POLL_INPROGRESS;
	do {
-		work++;
		desc->status &= ~IRQ_PENDING;
		raw_spin_unlock(&desc->lock);
		if (handle_IRQ_event(irq, action) != IRQ_NONE)
@@ -76,14 +107,7 @@ static int try_one_irq(int irq, struct irq_desc *desc, bool force)
		action = desc->action;
	} while ((desc->status & IRQ_PENDING) && action);
-	desc->status &= ~IRQ_INPROGRESS;
-	/*
-	 * If we did actual work for the real IRQ line we must let the
-	 * IRQ controller clean up too
-	 */
-	if (work > 1)
-		irq_end(irq, desc);
-
+	desc->status &= ~(IRQ_INPROGRESS | IRQ_POLL_INPROGRESS);
out:
	raw_spin_unlock(&desc->lock);
	return ok;
@@ -238,6 +262,9 @@ try_misrouted_irq(unsigned int irq, struct irq_desc *desc,
void note_interrupt(unsigned int irq, struct irq_desc *desc,
		    irqreturn_t action_ret)
{
+	if (desc->status & IRQ_POLL_INPROGRESS)
+		return;
+
	if (unlikely(action_ret != IRQ_HANDLED)) {
		/*
		 * If we are seeing only the odd spurious IRQ caused by

From 1535dfacbf21c4da1b73fcf07c39913da5bd5581 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner
Date: Mon, 7 Feb 2011 01:55:43 +0100
Subject: [PATCH 337/706] genirq: Move irq thread flags to core

Solely used in core code.

Signed-off-by: Thomas Gleixner
---
 include/linux/interrupt.h | 14 --------------
 kernel/irq/internals.h | 14 ++++++++++++++
 2 files changed, 14 insertions(+), 14 deletions(-)

diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h
index 7834726dd95b..de97b958f478 100644
--- a/include/linux/interrupt.h
+++ b/include/linux/interrupt.h
@@ -73,20 +73,6 @@
#define IRQF_TIMER		(__IRQF_TIMER | IRQF_NO_SUSPEND)
-/*
- * Bits used by threaded handlers:
- * IRQTF_RUNTHREAD - signals that the interrupt handler thread should run
- * IRQTF_DIED      - handler thread died
- * IRQTF_WARNED    - warning "IRQ_WAKE_THREAD w/o thread_fn" has been printed
- * IRQTF_AFFINITY  - irq thread is requested to adjust affinity
- */
-enum {
-	IRQTF_RUNTHREAD,
-	IRQTF_DIED,
-	IRQTF_WARNED,
-	IRQTF_AFFINITY,
-};
-
/*
 * These values can be returned by request_any_context_irq() and
 * describe the context the interrupt will be run in.
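These bits are only manipulated by the core's threaded-handler machinery, with the regular atomic bitops on the irqaction's thread_flags word; a typical use looks like the sketch below (handle_event() is a hypothetical stand-in for the real invocation in the irq thread):

	/* Consume the run request which the hard irq handler posted. */
	if (test_and_clear_bit(IRQTF_RUNTHREAD, &action->thread_flags))
		handle_event(action);	/* hypothetical stand-in */

Nothing outside kernel/irq/ needs that, which is why the enum can move from the public interrupt.h into kernel/irq/internals.h.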
diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h index 0eff7e92b1a9..b17c98440400 100644 --- a/kernel/irq/internals.h +++ b/kernel/irq/internals.h @@ -11,6 +11,20 @@ extern int noirqdebug; +/* + * Bits used by threaded handlers: + * IRQTF_RUNTHREAD - signals that the interrupt handler thread should run + * IRQTF_DIED - handler thread died + * IRQTF_WARNED - warning "IRQ_WAKE_THREAD w/o thread_fn" has been printed + * IRQTF_AFFINITY - irq thread is requested to adjust affinity + */ +enum { + IRQTF_RUNTHREAD, + IRQTF_DIED, + IRQTF_WARNED, + IRQTF_AFFINITY, +}; + #define irq_data_to_desc(data) container_of(data, struct irq_desc, irq_data) /* Set default functions for irq_chip structures: */ From 3b56f0585fd4c02d047dc406668cb40159b2d340 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 2 Feb 2011 21:41:12 +0000 Subject: [PATCH 338/706] genirq: Remove bogus conditional The if (chip->irq_shutdown) check will always evaluate to true, as we fill in chip->irq_shutdown with default_shutdown in irq_chip_set_defaults() if the chip does not provide its own function. Signed-off-by: Thomas Gleixner Cc: Peter Zijlstra LKML-Reference: <20110202212551.667607458@linutronix.de> --- kernel/irq/manage.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 33a6ee0ac68f..30bc8de40905 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -1057,10 +1057,7 @@ static struct irqaction *__free_irq(unsigned int irq, void *dev_id) /* If this was the last handler, shut down the IRQ line: */ if (!desc->action) { desc->status |= IRQ_DISABLED; - if (desc->irq_data.chip->irq_shutdown) - desc->irq_data.chip->irq_shutdown(&desc->irq_data); - else - desc->irq_data.chip->irq_disable(&desc->irq_data); + desc->irq_data.chip->irq_shutdown(&desc->irq_data); } #ifdef CONFIG_SMP From 4699923861513671d3f6ade8efb4e56a9a7ecadf Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 2 Feb 2011 21:41:14 +0000 Subject: [PATCH 339/706] genirq: Consolidate startup/shutdown of interrupts Aside of duplicated code some of the startup/shutdown sites do not handle the MASKED/DISABLED flags and the depth field at all. Move that to a helper function and take care of it there. Signed-off-by: Thomas Gleixner Cc: Peter Zijlstra LKML-Reference: <20110202212551.787481468@linutronix.de> --- kernel/irq/autoprobe.c | 10 +++++----- kernel/irq/chip.c | 37 ++++++++++++++++++++----------------- kernel/irq/internals.h | 3 +++ kernel/irq/manage.c | 14 +++++--------- 4 files changed, 33 insertions(+), 31 deletions(-) diff --git a/kernel/irq/autoprobe.c b/kernel/irq/autoprobe.c index 505798f86c36..08947cb61725 100644 --- a/kernel/irq/autoprobe.c +++ b/kernel/irq/autoprobe.c @@ -60,7 +60,7 @@ unsigned long probe_irq_on(void) if (desc->irq_data.chip->irq_set_type) desc->irq_data.chip->irq_set_type(&desc->irq_data, IRQ_TYPE_PROBE); - desc->irq_data.chip->irq_startup(&desc->irq_data); + irq_startup(desc); } raw_spin_unlock_irq(&desc->lock); } @@ -77,7 +77,7 @@ unsigned long probe_irq_on(void) raw_spin_lock_irq(&desc->lock); if (!desc->action && !(desc->status & IRQ_NOPROBE)) { desc->status |= IRQ_AUTODETECT | IRQ_WAITING; - if (desc->irq_data.chip->irq_startup(&desc->irq_data)) + if (irq_startup(desc)) desc->status |= IRQ_PENDING; } raw_spin_unlock_irq(&desc->lock); @@ -99,7 +99,7 @@ unsigned long probe_irq_on(void) /* It triggered already - consider it spurious. 
*/ if (!(status & IRQ_WAITING)) { desc->status = status & ~IRQ_AUTODETECT; - desc->irq_data.chip->irq_shutdown(&desc->irq_data); + irq_shutdown(desc); } else if (i < 32) mask |= 1 << i; @@ -138,7 +138,7 @@ unsigned int probe_irq_mask(unsigned long val) mask |= 1 << i; desc->status = status & ~IRQ_AUTODETECT; - desc->irq_data.chip->irq_shutdown(&desc->irq_data); + irq_shutdown(desc); } raw_spin_unlock_irq(&desc->lock); } @@ -182,7 +182,7 @@ int probe_irq_off(unsigned long val) nr_of_irqs++; } desc->status = status & ~IRQ_AUTODETECT; - desc->irq_data.chip->irq_shutdown(&desc->irq_data); + irq_shutdown(desc); } raw_spin_unlock_irq(&desc->lock); } diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index 31258782742c..988fe7a24282 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -192,6 +192,25 @@ void set_irq_nested_thread(unsigned int irq, int nest) } EXPORT_SYMBOL_GPL(set_irq_nested_thread); +int irq_startup(struct irq_desc *desc) +{ + desc->status &= ~(IRQ_MASKED | IRQ_DISABLED); + desc->depth = 0; + + if (desc->irq_data.chip->irq_startup) + return desc->irq_data.chip->irq_startup(&desc->irq_data); + + desc->irq_data.chip->irq_enable(&desc->irq_data); + return 0; +} + +void irq_shutdown(struct irq_desc *desc) +{ + desc->status |= IRQ_MASKED | IRQ_DISABLED; + desc->depth = 1; + desc->irq_data.chip->irq_shutdown(&desc->irq_data); +} + /* * default enable function */ @@ -210,17 +229,6 @@ static void default_disable(struct irq_data *data) { } -/* - * default startup function - */ -static unsigned int default_startup(struct irq_data *data) -{ - struct irq_desc *desc = irq_data_to_desc(data); - - desc->irq_data.chip->irq_enable(data); - return 0; -} - /* * default shutdown function */ @@ -229,7 +237,6 @@ static void default_shutdown(struct irq_data *data) struct irq_desc *desc = irq_data_to_desc(data); desc->irq_data.chip->irq_mask(&desc->irq_data); - desc->status |= IRQ_MASKED; } #ifndef CONFIG_GENERIC_HARDIRQS_NO_DEPRECATED @@ -337,8 +344,6 @@ void irq_chip_set_defaults(struct irq_chip *chip) chip->irq_enable = default_enable; if (!chip->irq_disable) chip->irq_disable = default_disable; - if (!chip->irq_startup) - chip->irq_startup = default_startup; /* * We use chip->irq_disable, when the user provided its own. 
When * we have default_disable set for chip->irq_disable, then we need @@ -747,10 +752,8 @@ __set_irq_handler(unsigned int irq, irq_flow_handler_t handle, int is_chained, desc->name = name; if (handle != handle_bad_irq && is_chained) { - desc->status &= ~IRQ_DISABLED; desc->status |= IRQ_NOREQUEST | IRQ_NOPROBE; - desc->depth = 0; - desc->irq_data.chip->irq_startup(&desc->irq_data); + irq_startup(desc); } raw_spin_unlock_irqrestore(&desc->lock, flags); chip_bus_sync_unlock(desc); diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h index b17c98440400..5cbfc93ed7b1 100644 --- a/kernel/irq/internals.h +++ b/kernel/irq/internals.h @@ -38,6 +38,9 @@ extern int __irq_set_trigger(struct irq_desc *desc, unsigned int irq, extern void __disable_irq(struct irq_desc *desc, unsigned int irq, bool susp); extern void __enable_irq(struct irq_desc *desc, unsigned int irq, bool resume); +extern int irq_startup(struct irq_desc *desc); +extern void irq_shutdown(struct irq_desc *desc); + extern void init_kstat_irqs(struct irq_desc *desc, int node, int nr); /* Resending of interrupts :*/ diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 30bc8de40905..9c562477e28b 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -906,11 +906,9 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) if (new->flags & IRQF_ONESHOT) desc->status |= IRQ_ONESHOT; - if (!(desc->status & IRQ_NOAUTOEN)) { - desc->depth = 0; - desc->status &= ~IRQ_DISABLED; - desc->irq_data.chip->irq_startup(&desc->irq_data); - } else + if (!(desc->status & IRQ_NOAUTOEN)) + irq_startup(desc); + else /* Undo nested disables: */ desc->depth = 1; @@ -1055,10 +1053,8 @@ static struct irqaction *__free_irq(unsigned int irq, void *dev_id) #endif /* If this was the last handler, shut down the IRQ line: */ - if (!desc->action) { - desc->status |= IRQ_DISABLED; - desc->irq_data.chip->irq_shutdown(&desc->irq_data); - } + if (!desc->action) + irq_shutdown(desc); #ifdef CONFIG_SMP /* make sure affinity_hint is cleaned up */ From 87923470c712dff00b101ffb6b6fbc27bd7a6df5 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 3 Feb 2011 12:27:44 +0100 Subject: [PATCH 340/706] genirq: Consolidate disable/enable Create irq_disable/enable and use them to keep the flags consistent. 
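The idea can be modeled in a few lines of plain userspace C. The sketch below uses invented toy_* names and is not kernel code; it only shows why a single wrapper that does both the bookkeeping and the hardware callback keeps the flag and the hardware state from drifting apart:

    #include <stdio.h>

    #define FLAG_DISABLED 0x1

    struct toy_chip {
            void (*disable)(void);          /* "hardware" callback */
    };

    struct toy_desc {
            unsigned int status;            /* software bookkeeping */
            struct toy_chip *chip;
    };

    static void toy_hw_disable(void) { printf("line masked in hardware\n"); }

    /* The wrapper owns both halves: no call site can update the flag
     * without also poking the hardware, or vice versa. */
    static void toy_irq_disable(struct toy_desc *desc)
    {
            desc->status |= FLAG_DISABLED;
            desc->chip->disable();
    }

    int main(void)
    {
            struct toy_chip chip = { .disable = toy_hw_disable };
            struct toy_desc desc = { .status = 0, .chip = &chip };

            toy_irq_disable(&desc);
            printf("status now %#x\n", desc.status);
            return 0;
    }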
Signed-off-by: Thomas Gleixner --- kernel/irq/chip.c | 12 +++++++++++- kernel/irq/internals.h | 2 ++ kernel/irq/manage.c | 2 +- kernel/irq/resend.c | 10 +++++----- kernel/irq/spurious.c | 2 +- 5 files changed, 20 insertions(+), 8 deletions(-) diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index 988fe7a24282..86c8e42f7fe4 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -200,7 +200,7 @@ int irq_startup(struct irq_desc *desc) if (desc->irq_data.chip->irq_startup) return desc->irq_data.chip->irq_startup(&desc->irq_data); - desc->irq_data.chip->irq_enable(&desc->irq_data); + irq_enable(desc); return 0; } @@ -211,6 +211,16 @@ void irq_shutdown(struct irq_desc *desc) desc->irq_data.chip->irq_shutdown(&desc->irq_data); } +void irq_enable(struct irq_desc *desc) +{ + desc->irq_data.chip->irq_enable(&desc->irq_data); +} + +void irq_disable(struct irq_desc *desc) +{ + desc->irq_data.chip->irq_disable(&desc->irq_data); +} + /* * default enable function */ diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h index 5cbfc93ed7b1..c71fc4de0371 100644 --- a/kernel/irq/internals.h +++ b/kernel/irq/internals.h @@ -40,6 +40,8 @@ extern void __enable_irq(struct irq_desc *desc, unsigned int irq, bool resume); extern int irq_startup(struct irq_desc *desc); extern void irq_shutdown(struct irq_desc *desc); +extern void irq_enable(struct irq_desc *desc); +extern void irq_disable(struct irq_desc *desc); extern void init_kstat_irqs(struct irq_desc *desc, int node, int nr); diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 9c562477e28b..007802399697 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -331,7 +331,7 @@ void __disable_irq(struct irq_desc *desc, unsigned int irq, bool suspend) if (!desc->depth++) { desc->status |= IRQ_DISABLED; - desc->irq_data.chip->irq_disable(&desc->irq_data); + irq_disable(desc); } } diff --git a/kernel/irq/resend.c b/kernel/irq/resend.c index dc49358b73fa..4bfe268dffe5 100644 --- a/kernel/irq/resend.c +++ b/kernel/irq/resend.c @@ -55,20 +55,20 @@ static DECLARE_TASKLET(resend_tasklet, resend_irqs, 0); */ void check_irq_resend(struct irq_desc *desc, unsigned int irq) { - unsigned int status = desc->status; - /* * Make sure the interrupt is enabled, before resending it: */ - desc->irq_data.chip->irq_enable(&desc->irq_data); + irq_enable(desc); /* * We do not resend level type interrupts. Level type * interrupts are resent by hardware when they are still * active. 
*/ - if ((status & (IRQ_LEVEL | IRQ_PENDING | IRQ_REPLAY)) == IRQ_PENDING) { - desc->status = (status & ~IRQ_PENDING) | IRQ_REPLAY; + if (desc->status & IRQ_LEVEL) + return; + if ((desc->status & (IRQ_PENDING | IRQ_REPLAY)) == IRQ_PENDING) { + desc->status = (desc->status & ~IRQ_PENDING) | IRQ_REPLAY; if (!desc->irq_data.chip->irq_retrigger || !desc->irq_data.chip->irq_retrigger(&desc->irq_data)) { diff --git a/kernel/irq/spurious.c b/kernel/irq/spurious.c index f749d29bfd81..c300b8f6008d 100644 --- a/kernel/irq/spurious.c +++ b/kernel/irq/spurious.c @@ -303,7 +303,7 @@ void note_interrupt(unsigned int irq, struct irq_desc *desc, printk(KERN_EMERG "Disabling IRQ #%d\n", irq); desc->status |= IRQ_DISABLED | IRQ_SPURIOUS_DISABLED; desc->depth++; - desc->irq_data.chip->irq_disable(&desc->irq_data); + irq_disable(desc); mod_timer(&poll_spurious_irq_timer, jiffies + POLL_SPURIOUS_IRQ_INTERVAL); From 50f7c0327513d5acefbe26fd33498af18d1ffac5 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 3 Feb 2011 13:23:54 +0100 Subject: [PATCH 341/706] genirq: Remove default magic Now that everything uses the wrappers, we can remove the default functions. None of those functions is performance critical. That makes the IRQ_MASKED flag tracking fully consistent. Signed-off-by: Thomas Gleixner --- kernel/irq/chip.c | 73 ++++++++++----------------------------------- kernel/irq/manage.c | 4 +-- 2 files changed, 17 insertions(+), 60 deletions(-) diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index 86c8e42f7fe4..1a239a83f925 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -208,45 +208,29 @@ void irq_shutdown(struct irq_desc *desc) { desc->status |= IRQ_MASKED | IRQ_DISABLED; desc->depth = 1; - desc->irq_data.chip->irq_shutdown(&desc->irq_data); + if (desc->irq_data.chip->irq_shutdown) + desc->irq_data.chip->irq_shutdown(&desc->irq_data); + if (desc->irq_data.chip->irq_disable) + desc->irq_data.chip->irq_disable(&desc->irq_data); + else + desc->irq_data.chip->irq_mask(&desc->irq_data); } void irq_enable(struct irq_desc *desc) { - desc->irq_data.chip->irq_enable(&desc->irq_data); + if (desc->irq_data.chip->irq_enable) + desc->irq_data.chip->irq_enable(&desc->irq_data); + else + desc->irq_data.chip->irq_unmask(&desc->irq_data); + desc->status &= ~IRQ_MASKED; } void irq_disable(struct irq_desc *desc) { - desc->irq_data.chip->irq_disable(&desc->irq_data); -} - -/* - * default enable function - */ -static void default_enable(struct irq_data *data) -{ - struct irq_desc *desc = irq_data_to_desc(data); - - desc->irq_data.chip->irq_unmask(&desc->irq_data); - desc->status &= ~IRQ_MASKED; -} - -/* - * default disable function - */ -static void default_disable(struct irq_data *data) -{ -} - -/* - * default shutdown function - */ -static void default_shutdown(struct irq_data *data) -{ - struct irq_desc *desc = irq_data_to_desc(data); - - desc->irq_data.chip->irq_mask(&desc->irq_data); + if (desc->irq_data.chip->irq_disable) { + desc->irq_data.chip->irq_disable(&desc->irq_data); + desc->status |= IRQ_MASKED; + } } #ifndef CONFIG_GENERIC_HARDIRQS_NO_DEPRECATED @@ -334,10 +318,6 @@ static void compat_bus_sync_unlock(struct irq_data *data) void irq_chip_set_defaults(struct irq_chip *chip) { #ifndef CONFIG_GENERIC_HARDIRQS_NO_DEPRECATED - /* - * Compat fixup functions need to be before we set the - * defaults for enable/disable/startup/shutdown - */ if (chip->enable) chip->irq_enable = compat_irq_enable; if (chip->disable) @@ -346,31 +326,8 @@ void irq_chip_set_defaults(struct irq_chip *chip) chip->irq_shutdown 
= compat_irq_shutdown; if (chip->startup) chip->irq_startup = compat_irq_startup; -#endif - /* - * The real defaults - */ - if (!chip->irq_enable) - chip->irq_enable = default_enable; - if (!chip->irq_disable) - chip->irq_disable = default_disable; - /* - * We use chip->irq_disable, when the user provided its own. When - * we have default_disable set for chip->irq_disable, then we need - * to use default_shutdown, otherwise the irq line is not - * disabled on free_irq(): - */ - if (!chip->irq_shutdown) - chip->irq_shutdown = chip->irq_disable != default_disable ? - chip->irq_disable : default_shutdown; - -#ifndef CONFIG_GENERIC_HARDIRQS_NO_DEPRECATED if (!chip->end) chip->end = dummy_irq_chip.end; - - /* - * Now fix up the remaining compat handlers - */ if (chip->bus_lock) chip->irq_bus_lock = compat_bus_lock; if (chip->bus_sync_unlock) chip->irq_bus_sync_unlock = compat_bus_sync_unlock; diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 007802399697..2a6c6ee11a3f 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -440,8 +440,8 @@ void enable_irq(unsigned int irq) if (!desc) return; - if (WARN(!desc->irq_data.chip || !desc->irq_data.chip->irq_enable, - KERN_ERR "enable_irq before setup/request_irq: irq %u\n", irq)) + if (WARN(!desc->irq_data.chip, + KERN_ERR "enable_irq before setup/request_irq: irq %u\n", irq)) return; chip_bus_lock(desc); From 3aae994fb0f43f6d94a31c33536a83869504abdf Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 4 Feb 2011 10:17:52 +0100 Subject: [PATCH 342/706] genirq: Consolidate IRQ_DISABLED Handle IRQ_DISABLED consistently. Signed-off-by: Thomas Gleixner --- kernel/irq/chip.c | 14 ++++++++++---- kernel/irq/manage.c | 9 +++------ kernel/irq/resend.c | 5 ----- kernel/irq/spurious.c | 2 +- 4 files changed, 14 insertions(+), 16 deletions(-) diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index 1a239a83f925..43c62ca68c11 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -194,11 +194,14 @@ EXPORT_SYMBOL_GPL(set_irq_nested_thread); int irq_startup(struct irq_desc *desc) { - desc->status &= ~(IRQ_MASKED | IRQ_DISABLED); + desc->status &= ~IRQ_DISABLED; desc->depth = 0; - if (desc->irq_data.chip->irq_startup) - return desc->irq_data.chip->irq_startup(&desc->irq_data); + if (desc->irq_data.chip->irq_startup) { + int ret = desc->irq_data.chip->irq_startup(&desc->irq_data); + desc->status &= ~IRQ_MASKED; + return ret; + } irq_enable(desc); return 0; @@ -206,7 +209,7 @@ int irq_startup(struct irq_desc *desc) void irq_shutdown(struct irq_desc *desc) { - desc->status |= IRQ_MASKED | IRQ_DISABLED; + desc->status |= IRQ_DISABLED; desc->depth = 1; if (desc->irq_data.chip->irq_shutdown) desc->irq_data.chip->irq_shutdown(&desc->irq_data); @@ -214,10 +217,12 @@ void irq_shutdown(struct irq_desc *desc) desc->irq_data.chip->irq_disable(&desc->irq_data); else desc->irq_data.chip->irq_mask(&desc->irq_data); + desc->status |= IRQ_MASKED; } void irq_enable(struct irq_desc *desc) { + desc->status &= ~IRQ_DISABLED; if (desc->irq_data.chip->irq_enable) desc->irq_data.chip->irq_enable(&desc->irq_data); else @@ -227,6 +232,7 @@ void irq_enable(struct irq_desc *desc) void irq_disable(struct irq_desc *desc) { + desc->status |= IRQ_DISABLED; if (desc->irq_data.chip->irq_disable) { desc->irq_data.chip->irq_disable(&desc->irq_data); desc->status |= IRQ_MASKED; diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 2a6c6ee11a3f..78a566a9b39f 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -329,10 +329,8 @@ void __disable_irq(struct irq_desc *desc, unsigned int irq, bool suspend) desc->status |=
IRQ_SUSPENDED; } - if (!desc->depth++) { - desc->status |= IRQ_DISABLED; + if (!desc->depth++) irq_disable(desc); - } } /** @@ -407,12 +405,11 @@ void __enable_irq(struct irq_desc *desc, unsigned int irq, bool resume) WARN(1, KERN_WARNING "Unbalanced enable for IRQ %d\n", irq); break; case 1: { - unsigned int status = desc->status & ~IRQ_DISABLED; - if (desc->status & IRQ_SUSPENDED) goto err_out; /* Prevent probing on this irq: */ - desc->status = status | IRQ_NOPROBE; + desc->status |= IRQ_NOPROBE; + irq_enable(desc); check_irq_resend(desc, irq); /* fall-through */ } diff --git a/kernel/irq/resend.c b/kernel/irq/resend.c index 4bfe268dffe5..60b20261041b 100644 --- a/kernel/irq/resend.c +++ b/kernel/irq/resend.c @@ -55,11 +55,6 @@ static DECLARE_TASKLET(resend_tasklet, resend_irqs, 0); */ void check_irq_resend(struct irq_desc *desc, unsigned int irq) { - /* - * Make sure the interrupt is enabled, before resending it: - */ - irq_enable(desc); - /* * We do not resend level type interrupts. Level type * interrupts are resent by hardware when they are still diff --git a/kernel/irq/spurious.c b/kernel/irq/spurious.c index c300b8f6008d..89e5e16aca39 100644 --- a/kernel/irq/spurious.c +++ b/kernel/irq/spurious.c @@ -301,7 +301,7 @@ void note_interrupt(unsigned int irq, struct irq_desc *desc, * Now kill the IRQ */ printk(KERN_EMERG "Disabling IRQ #%d\n", irq); - desc->status |= IRQ_DISABLED | IRQ_SPURIOUS_DISABLED; + desc->status |= IRQ_SPURIOUS_DISABLED; desc->depth++; irq_disable(desc); From d78f8dd36b90626106ce19cb2e6828b0dc39447e Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 2 Feb 2011 21:41:17 +0000 Subject: [PATCH 343/706] genirq: Do not fiddle with IRQ_MASKED in handle_edge_irq() IRQ_MASKED is set in mask_ack_irq() anyway. Remove it from handle_edge_irq() to allow simpler ab^HHreuse of that function. Signed-off-by: Thomas Gleixner Cc: Peter Zijlstra LKML-Reference: <20110202212551.918484270@linutronix.de> --- kernel/irq/chip.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index 43c62ca68c11..2c30b7844595 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -611,7 +611,7 @@ handle_edge_irq(unsigned int irq, struct irq_desc *desc) if (unlikely((desc->status & (IRQ_INPROGRESS | IRQ_DISABLED)) || !desc->action)) { if (!irq_check_poll(desc)) { - desc->status |= (IRQ_PENDING | IRQ_MASKED); + desc->status |= IRQ_PENDING; mask_ack_irq(desc); goto out_unlock; } From 4912609f228da4a3d2bfbdf0f31de3d9eab2b7f8 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 7 Feb 2011 01:08:49 +0100 Subject: [PATCH 344/706] genirq: Implement handle_irq_event() Core code replacement for the ugly camel case. It contains all the code which is shared in all handlers. 
    clear status flags
    set INPROGRESS flag
    unlock
    call action chain
    note_interrupt
    lock
    clr INPROGRESS flag

Signed-off-by: Thomas Gleixner --- kernel/irq/handle.c | 47 +++++++++++++++++++++++++++++++++++------- kernel/irq/internals.h | 3 +++ 2 files changed, 42 insertions(+), 8 deletions(-) diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c index cdd6fbbe771c..4ef059478ebf 100644 --- a/kernel/irq/handle.c +++ b/kernel/irq/handle.c @@ -51,14 +51,7 @@ static void warn_no_thread(unsigned int irq, struct irqaction *action) "but no thread function available.", irq, action->name); } -/** - * handle_IRQ_event - irq action chain handler - * @irq: the interrupt number - * @action: the interrupt action chain for this irq - * - * Handles the action chain of an irq event - */ -irqreturn_t handle_IRQ_event(unsigned int irq, struct irqaction *action) +static irqreturn_t __handle_irq_event(unsigned int irq, struct irqaction *action) { irqreturn_t ret, retval = IRQ_NONE; unsigned int status = 0; @@ -120,3 +113,41 @@ irqreturn_t handle_IRQ_event(unsigned int irq, struct irqaction *action) return retval; } + +irqreturn_t +handle_irq_event_percpu(struct irq_desc *desc, struct irqaction *action) +{ + irqreturn_t ret = __handle_irq_event(desc->irq_data.irq, action); + + if (!noirqdebug) + note_interrupt(desc->irq_data.irq, desc, ret); + return ret; +} + +irqreturn_t handle_irq_event(struct irq_desc *desc) +{ + struct irqaction *action = desc->action; + irqreturn_t ret; + + desc->status &= ~IRQ_PENDING; + desc->status |= IRQ_INPROGRESS; + raw_spin_unlock(&desc->lock); + + ret = handle_irq_event_percpu(desc, action); + + raw_spin_lock(&desc->lock); + desc->status &= ~IRQ_INPROGRESS; + return ret; +} + +/** + * handle_IRQ_event - irq action chain handler + * @irq: the interrupt number + * @action: the interrupt action chain for this irq + * + * Handles the action chain of an irq event + */ +irqreturn_t handle_IRQ_event(unsigned int irq, struct irqaction *action) +{ + return __handle_irq_event(irq, action); +} diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h index c71fc4de0371..b61824cdadc6 100644 --- a/kernel/irq/internals.h +++ b/kernel/irq/internals.h @@ -45,6 +45,9 @@ extern void irq_disable(struct irq_desc *desc); extern void init_kstat_irqs(struct irq_desc *desc, int node, int nr); +irqreturn_t handle_irq_event_percpu(struct irq_desc *desc, struct irqaction *action); +irqreturn_t handle_irq_event(struct irq_desc *desc); + /* Resending of interrupts :*/ void check_irq_resend(struct irq_desc *desc, unsigned int irq); bool irq_wait_for_poll(struct irq_desc *desc); From 107781e72192067b95a7d373bfa460434a13c6ae Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 7 Feb 2011 01:21:02 +0100 Subject: [PATCH 345/706] genirq: Use handle_irq_event() in handle_simple_irq() Signed-off-by: Thomas Gleixner --- kernel/irq/chip.c | 15 ++------------- 1 file changed, 2 insertions(+), 13 deletions(-) diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index 2c30b7844595..809a03fe7e07 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -448,9 +448,6 @@ static bool irq_check_poll(struct irq_desc *desc) void handle_simple_irq(unsigned int irq, struct irq_desc *desc) { - struct irqaction *action; - irqreturn_t action_ret; - raw_spin_lock(&desc->lock); if (unlikely(desc->status & IRQ_INPROGRESS)) @@ -460,19 +457,11 @@ handle_simple_irq(unsigned int irq, struct irq_desc *desc) desc->status &= ~(IRQ_REPLAY | IRQ_WAITING); kstat_incr_irqs_this_cpu(irq, desc); - action = desc->action; - if (unlikely(!action ||
(desc->status & IRQ_DISABLED))) + if (unlikely(!desc->action || (desc->status & IRQ_DISABLED))) goto out_unlock; - desc->status |= IRQ_INPROGRESS; - raw_spin_unlock(&desc->lock); + handle_irq_event(desc); - action_ret = handle_IRQ_event(irq, action); - if (!noirqdebug) - note_interrupt(irq, desc, action_ret); - - raw_spin_lock(&desc->lock); - desc->status &= ~IRQ_INPROGRESS; out_unlock: raw_spin_unlock(&desc->lock); } From 1529866c63d789925de9b4250646d82d033e4b95 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 7 Feb 2011 01:22:17 +0100 Subject: [PATCH 346/706] genirq: Use handle_irq_event() in handle_level_irq() Signed-off-by: Thomas Gleixner --- kernel/irq/chip.c | 16 ++-------------- 1 file changed, 2 insertions(+), 14 deletions(-) diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index 809a03fe7e07..2d2ba4ace0ec 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -479,9 +479,6 @@ out_unlock: void handle_level_irq(unsigned int irq, struct irq_desc *desc) { - struct irqaction *action; - irqreturn_t action_ret; - raw_spin_lock(&desc->lock); mask_ack_irq(desc); @@ -496,19 +493,10 @@ handle_level_irq(unsigned int irq, struct irq_desc *desc) * If its disabled or no action available * keep it masked and get out of here */ - action = desc->action; - if (unlikely(!action || (desc->status & IRQ_DISABLED))) + if (unlikely(!desc->action || (desc->status & IRQ_DISABLED))) goto out_unlock; - desc->status |= IRQ_INPROGRESS; - raw_spin_unlock(&desc->lock); - - action_ret = handle_IRQ_event(irq, action); - if (!noirqdebug) - note_interrupt(irq, desc, action_ret); - - raw_spin_lock(&desc->lock); - desc->status &= ~IRQ_INPROGRESS; + handle_irq_event(desc); if (!(desc->status & (IRQ_DISABLED | IRQ_ONESHOT))) unmask_irq(desc); From a7ae4de5c8ae8110556f0f9c7241093ef984605c Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 7 Feb 2011 01:23:07 +0100 Subject: [PATCH 347/706] genirq: Use handle_irq_event() in handle_fasteoi_irq() Signed-off-by: Thomas Gleixner --- kernel/irq/chip.c | 19 ++----------------- 1 file changed, 2 insertions(+), 17 deletions(-) diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index 2d2ba4ace0ec..a499ca5b11aa 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -518,9 +518,6 @@ EXPORT_SYMBOL_GPL(handle_level_irq); void handle_fasteoi_irq(unsigned int irq, struct irq_desc *desc) { - struct irqaction *action; - irqreturn_t action_ret; - raw_spin_lock(&desc->lock); if (unlikely(desc->status & IRQ_INPROGRESS)) @@ -534,26 +531,14 @@ handle_fasteoi_irq(unsigned int irq, struct irq_desc *desc) * If its disabled or no action available * then mask it and get out of here: */ - action = desc->action; - if (unlikely(!action || (desc->status & IRQ_DISABLED))) { + if (unlikely(!desc->action || (desc->status & IRQ_DISABLED))) { desc->status |= IRQ_PENDING; mask_irq(desc); goto out; } - - desc->status |= IRQ_INPROGRESS; - desc->status &= ~IRQ_PENDING; - raw_spin_unlock(&desc->lock); - - action_ret = handle_IRQ_event(irq, action); - if (!noirqdebug) - note_interrupt(irq, desc, action_ret); - - raw_spin_lock(&desc->lock); - desc->status &= ~IRQ_INPROGRESS; + handle_irq_event(desc); out: desc->irq_data.chip->irq_eoi(&desc->irq_data); - raw_spin_unlock(&desc->lock); } From a60a5dc2db3b08b3c2900614c43b1262410c2d8c Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 7 Feb 2011 01:24:07 +0100 Subject: [PATCH 348/706] genirq: Use handle_irq_event() in handle_edge_irq() It's safe to drop the IRQ_INPROGRESS flag between action chain walks as we are protected by desc->lock. 
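The resulting replay loop can be sketched in isolation. This is a standalone toy model with invented names, not the kernel code; it mirrors the while condition in the hunk below: keep running the action chain while a new edge was latched as PENDING and the line was not DISABLED in the meantime.

    #include <stdio.h>

    #define F_PENDING  0x1
    #define F_DISABLED 0x2

    /* Stand-in for handle_irq_event(): consumes the pending event. In the
     * real code the lock is dropped while the action chain runs, so a new
     * edge may set F_PENDING again before the loop condition is tested. */
    static void toy_handle_event(unsigned int *status)
    {
            *status &= ~F_PENDING;
            printf("action chain ran\n");
    }

    int main(void)
    {
            unsigned int status = F_PENDING;    /* an edge was latched */

            do {
                    toy_handle_event(&status);
            } while ((status & (F_PENDING | F_DISABLED)) == F_PENDING);

            return 0;
    }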
Signed-off-by: Thomas Gleixner --- kernel/irq/chip.c | 16 ++-------------- 1 file changed, 2 insertions(+), 14 deletions(-) diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index a499ca5b11aa..3ccff4d55b39 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -583,14 +583,8 @@ handle_edge_irq(unsigned int irq, struct irq_desc *desc) /* Start handling the irq */ desc->irq_data.chip->irq_ack(&desc->irq_data); - /* Mark the IRQ currently in progress.*/ - desc->status |= IRQ_INPROGRESS; - do { - struct irqaction *action = desc->action; - irqreturn_t action_ret; - - if (unlikely(!action)) { + if (unlikely(!desc->action)) { mask_irq(desc); goto out_unlock; } @@ -606,16 +600,10 @@ handle_edge_irq(unsigned int irq, struct irq_desc *desc) unmask_irq(desc); } - desc->status &= ~IRQ_PENDING; - raw_spin_unlock(&desc->lock); - action_ret = handle_IRQ_event(irq, action); - if (!noirqdebug) - note_interrupt(irq, desc, action_ret); - raw_spin_lock(&desc->lock); + handle_irq_event(desc); } while ((desc->status & (IRQ_PENDING | IRQ_DISABLED)) == IRQ_PENDING); - desc->status &= ~IRQ_INPROGRESS; out_unlock: raw_spin_unlock(&desc->lock); } From 849f061c25f8951d11c7dd88f44950ccde296392 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 7 Feb 2011 01:25:41 +0100 Subject: [PATCH 349/706] genirq: Use handle_irq_event_percpu() in handle_percpu_irq() Signed-off-by: Thomas Gleixner --- kernel/irq/chip.c | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index 3ccff4d55b39..52b10ad7bd59 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -618,19 +618,17 @@ out_unlock: void handle_percpu_irq(unsigned int irq, struct irq_desc *desc) { - irqreturn_t action_ret; + struct irq_chip *chip = get_irq_desc_chip(desc); kstat_incr_irqs_this_cpu(irq, desc); - if (desc->irq_data.chip->irq_ack) - desc->irq_data.chip->irq_ack(&desc->irq_data); + if (chip->irq_ack) + chip->irq_ack(&desc->irq_data); - action_ret = handle_IRQ_event(irq, desc->action); - if (!noirqdebug) - note_interrupt(irq, desc, action_ret); + handle_irq_event_percpu(desc, desc->action); - if (desc->irq_data.chip->irq_eoi) - desc->irq_data.chip->irq_eoi(&desc->irq_data); + if (chip->irq_eoi) + chip->irq_eoi(&desc->irq_data); } void From 0877d66257082ce86fca8f9826b91870575b272c Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 7 Feb 2011 01:29:15 +0100 Subject: [PATCH 350/706] genirq: Use handle_irq_event() in the spurious poll code Signed-off-by: Thomas Gleixner --- kernel/irq/spurious.c | 21 +++++++++------------ 1 file changed, 9 insertions(+), 12 deletions(-) diff --git a/kernel/irq/spurious.c b/kernel/irq/spurious.c index 89e5e16aca39..bc0620745d5f 100644 --- a/kernel/irq/spurious.c +++ b/kernel/irq/spurious.c @@ -56,13 +56,14 @@ bool irq_wait_for_poll(struct irq_desc *desc) #endif } + /* * Recovery handler for misrouted interrupts.
*/ static int try_one_irq(int irq, struct irq_desc *desc, bool force) { + irqreturn_t ret = IRQ_NONE; struct irqaction *action; - int ok = 0; raw_spin_lock(&desc->lock); @@ -96,21 +97,17 @@ static int try_one_irq(int irq, struct irq_desc *desc, bool force) goto out; } - /* Honour the normal IRQ locking and mark it poll in progress */ - desc->status |= IRQ_INPROGRESS | IRQ_POLL_INPROGRESS; + /* Mark it poll in progress */ + desc->status |= IRQ_POLL_INPROGRESS; do { - desc->status &= ~IRQ_PENDING; - raw_spin_unlock(&desc->lock); - if (handle_IRQ_event(irq, action) != IRQ_NONE) - ok = 1; - raw_spin_lock(&desc->lock); + if (handle_irq_event(desc) == IRQ_HANDLED) + ret = IRQ_HANDLED; action = desc->action; - } while ((desc->status & IRQ_PENDING) && action); - - desc->status &= ~(IRQ_INPROGRESS | IRQ_POLL_INPROGRESS); + } while ((desc->status & IRQ_PENDING) && action); + desc->status &= ~IRQ_POLL_INPROGRESS; out: raw_spin_unlock(&desc->lock); - return ok; + return ret == IRQ_HANDLED; } static int misrouted_irq(int irq) From 1277a5325adfc53caac7dd3dac5d3d2fd2a125b4 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 7 Feb 2011 01:40:27 +0100 Subject: [PATCH 351/706] genirq: Simplify handle_irq_event() Now that all core users are converted one layer can go. Signed-off-by: Thomas Gleixner --- kernel/irq/handle.c | 19 ++++++------------- 1 file changed, 6 insertions(+), 13 deletions(-) diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c index 4ef059478ebf..ff40e0f5e2e2 100644 --- a/kernel/irq/handle.c +++ b/kernel/irq/handle.c @@ -51,10 +51,11 @@ static void warn_no_thread(unsigned int irq, struct irqaction *action) "but no thread function available.", irq, action->name); } -static irqreturn_t __handle_irq_event(unsigned int irq, struct irqaction *action) +irqreturn_t +handle_irq_event_percpu(struct irq_desc *desc, struct irqaction *action) { irqreturn_t ret, retval = IRQ_NONE; - unsigned int status = 0; + unsigned int status = 0, irq = desc->irq_data.irq; do { trace_irq_handler_entry(irq, action); @@ -111,17 +112,9 @@ static irqreturn_t __handle_irq_event(unsigned int irq, struct irqaction *action if (status & IRQF_SAMPLE_RANDOM) add_interrupt_randomness(irq); - return retval; -} - -irqreturn_t -handle_irq_event_percpu(struct irq_desc *desc, struct irqaction *action) -{ - irqreturn_t ret = __handle_irq_event(desc->irq_data.irq, action); if (!noirqdebug) - note_interrupt(desc->irq_data.irq, desc, ret); - return ret; + note_interrupt(irq, desc, retval); + return retval; } irqreturn_t handle_irq_event(struct irq_desc *desc) @@ -149,5 +142,5 @@ irqreturn_t handle_irq_event(struct irq_desc *desc) */ irqreturn_t handle_IRQ_event(unsigned int irq, struct irqaction *action) { - return __handle_irq_event(irq, action); + return handle_irq_event_percpu(irq_to_desc(irq), action); } From c78b9b65faa291def628dbd8539649f58299f0f3 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 16 Dec 2010 17:21:47 +0100 Subject: [PATCH 352/706] genirq: Implement generic irq_show_interrupts() All archs implement show_interrupts() in more or less the same way. That's tons of duplicated code with different bugs and no value. Implement a generic version and deprecate show_interrupts(). Unfortunately we need some ifdeffery for !GENERIC_HARDIRQS archs.
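An architecture opts in by selecting GENERIC_IRQ_SHOW in its Kconfig, and may additionally override the __weak stub added below to print arch-specific rows underneath the per-IRQ table. The snippet here is a hypothetical example of such an override (no real arch is quoted); it only relies on the arch_show_interrupts() signature declared in this patch, with prec being the column width computed by the generic code:

    /* arch/myarch/kernel/irq.c -- hypothetical example override */
    int arch_show_interrupts(struct seq_file *p, int prec)
    {
            /* keep the label aligned with the generic IRQ number column */
            seq_printf(p, "%*s: %10u  Spurious counter\n", prec, "ERR", 0U);
            return 0;
    }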
Signed-off-by: Thomas Gleixner --- include/linux/interrupt.h | 1 + kernel/irq/Kconfig | 3 ++ kernel/irq/proc.c | 63 +++++++++++++++++++++++++++++++++++++++ 3 files changed, 67 insertions(+) diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h index de97b958f478..8da6643e39a6 100644 --- a/include/linux/interrupt.h +++ b/include/linux/interrupt.h @@ -671,6 +671,7 @@ static inline void init_irq_proc(void) struct seq_file; int show_interrupts(struct seq_file *p, void *v); +int arch_show_interrupts(struct seq_file *p, int prec); extern int early_irq_init(void); extern int arch_probe_nr_irqs(void); diff --git a/kernel/irq/Kconfig b/kernel/irq/Kconfig index 8e42fec7686d..4cd5d7135e0f 100644 --- a/kernel/irq/Kconfig +++ b/kernel/irq/Kconfig @@ -20,6 +20,9 @@ config HAVE_SPARSE_IRQ config GENERIC_IRQ_PROBE def_bool n +config GENERIC_IRQ_SHOW + def_bool n + config GENERIC_PENDING_IRQ def_bool n diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c index a46bd762db47..26449239bb46 100644 --- a/kernel/irq/proc.c +++ b/kernel/irq/proc.c @@ -11,6 +11,7 @@ #include #include #include +#include #include "internals.h" @@ -357,3 +358,65 @@ void init_irq_proc(void) } } +#ifdef CONFIG_GENERIC_IRQ_SHOW + +int __weak arch_show_interrupts(struct seq_file *p, int prec) +{ + return 0; +} + +int show_interrupts(struct seq_file *p, void *v) +{ + static int prec; + + unsigned long flags, any_count = 0; + int i = *(loff_t *) v, j; + struct irqaction *action; + struct irq_desc *desc; + + if (i > nr_irqs) + return 0; + + if (i == nr_irqs) + return arch_show_interrupts(p, prec); + + /* print header and calculate the width of the first column */ + if (i == 0) { + for (prec = 3, j = 1000; prec < 10 && j <= nr_irqs; ++prec) + j *= 10; + + seq_printf(p, "%*s", prec + 8, ""); + for_each_online_cpu(j) + seq_printf(p, "CPU%-8d", j); + seq_putc(p, '\n'); + } + + desc = irq_to_desc(i); + if (!desc) + return 0; + + raw_spin_lock_irqsave(&desc->lock, flags); + for_each_online_cpu(j) + any_count |= kstat_irqs_cpu(i, j); + action = desc->action; + if (!action && !any_count) + goto out; + + seq_printf(p, "%*d: ", prec, i); + for_each_online_cpu(j) + seq_printf(p, "%10u ", kstat_irqs_cpu(i, j)); + seq_printf(p, " %8s", desc->irq_data.chip->name); + seq_printf(p, "-%-8s", desc->name); + + if (action) { + seq_printf(p, " %s", action->name); + while ((action = action->next) != NULL) + seq_printf(p, ", %s", action->name); + } + + seq_putc(p, '\n'); +out: + raw_spin_unlock_irqrestore(&desc->lock, flags); + return 0; +} +#endif From 35e857cbeb24e75c6f9a9312ac30454eee8c5950 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 10 Feb 2011 12:20:23 +0100 Subject: [PATCH 353/706] genirq: Fixup core code namespace fallout Signed-off-by: Thomas Gleixner --- kernel/irq/chip.c | 6 +++--- kernel/irq/manage.c | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index 52b10ad7bd59..143eb2a9fa4e 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -618,7 +618,7 @@ out_unlock: void handle_percpu_irq(unsigned int irq, struct irq_desc *desc) { - struct irq_chip *chip = get_irq_desc_chip(desc); + struct irq_chip *chip = irq_desc_get_chip(desc); kstat_incr_irqs_this_cpu(irq, desc); @@ -685,7 +685,7 @@ void set_irq_chip_and_handler(unsigned int irq, struct irq_chip *chip, irq_flow_handler_t handle) { - set_irq_chip(irq, chip); + irq_set_chip(irq, chip); __set_irq_handler(irq, handle, 0, NULL); } @@ -693,7 +693,7 @@ void set_irq_chip_and_handler_name(unsigned int irq, struct irq_chip 
*chip, irq_flow_handler_t handle, const char *name) { - set_irq_chip(irq, chip); + irq_set_chip(irq, chip); __set_irq_handler(irq, handle, 0, name); } diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 78a566a9b39f..dd4e5c21b9e7 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -260,7 +260,7 @@ EXPORT_SYMBOL_GPL(irq_set_affinity_notifier); static int setup_affinity(unsigned int irq, struct irq_desc *desc, struct cpumask *mask) { - struct irq_chip *chip = get_irq_desc_chip(desc); + struct irq_chip *chip = irq_desc_get_chip(desc); struct cpumask *set = irq_default_affinity; int ret; From dbec07bac614a61e3392c1e7c08cc6a49ad43f7a Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 7 Feb 2011 20:19:55 +0100 Subject: [PATCH 354/706] genirq: Add internal state field to irq_desc That field will contain internal state information which is not going to be exposed to anything outside the core code - except via accessor functions. I'm tired of everyone fiddling in irq_desc.status. core_internal_state__do_not_mess_with_it is clear enough, annoying to type and easy to grep for. Offenders will be tracked down and slapped with stinking trouts. Signed-off-by: Thomas Gleixner --- include/linux/irqdesc.h | 3 ++- kernel/irq/internals.h | 6 ++++++ 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/include/linux/irqdesc.h b/include/linux/irqdesc.h index 64794dec93b6..782bf9851a9f 100644 --- a/include/linux/irqdesc.h +++ b/include/linux/irqdesc.h @@ -19,6 +19,7 @@ struct timer_rand_state; * @handle_irq: highlevel irq-events handler [if NULL, __do_IRQ()] * @action: the irq action chain * @status: status information + * @core_internal_state__do_not_mess_with_it: core internal status information * @depth: disable-depth, for nested irq_disable() calls * @wake_depth: enable depth, for multiple set_irq_wake() callers * @irq_count: stats field to detect stalled irqs @@ -63,7 +64,7 @@ struct irq_desc { irq_flow_handler_t handle_irq; struct irqaction *action; /* IRQ action list */ unsigned int status; /* IRQ status */ - + unsigned int core_internal_state__do_not_mess_with_it; unsigned int depth; /* nested irq disables */ unsigned int wake_depth; /* nested wake enables */ unsigned int irq_count; /* For detecting broken IRQs */ diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h index b61824cdadc6..ae96e688f4e1 100644 --- a/kernel/irq/internals.h +++ b/kernel/irq/internals.h @@ -1,5 +1,9 @@ /* * IRQ subsystem internal functions and variables: + * + * Do not ever include this file from anything else than + * kernel/irq/. Do not even think about using any information outside + * of this file for your non core code. */ #include @@ -9,6 +13,8 @@ # define IRQ_BITMAP_BITS NR_IRQS #endif +#define istate core_internal_state__do_not_mess_with_it + extern int noirqdebug; /* From e6bea9c404699223322d7411c6f2ceaec02fa83c Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 9 Feb 2011 13:16:52 +0100 Subject: [PATCH 355/706] genirq: Protect tglx from tripping over his own feet The irq_desc.status field will either go away or renamed to settings. Anyway we need to maintain compatibility to avoid breaking the world and some more. While moving bits into the core, I need to avoid that I use any of the still existing IRQ_ bits in the core code by typos. So that file will hold the inline wrappers and some nasty CPP tricks to break the build when typoed. 
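Concretely, the CPP trick is the one applied a few patches later when IRQ_INPROGRESS moves into the core; the settings.h hunk there reads:

    /* kernel/irq/settings.h */
    #undef IRQ_INPROGRESS
    #define IRQ_INPROGRESS GOT_YOU_MORON

Any leftover use of the old name inside kernel/irq/ then expands to the undeclared identifier GOT_YOU_MORON and breaks the build at the offending line, instead of silently fiddling with desc->status.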
Signed-off-by: Thomas Gleixner --- kernel/irq/internals.h | 2 ++ kernel/irq/irqdesc.c | 4 ++-- kernel/irq/settings.h | 7 +++++++ 3 files changed, 11 insertions(+), 2 deletions(-) create mode 100644 kernel/irq/settings.h diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h index ae96e688f4e1..8f200310a952 100644 --- a/kernel/irq/internals.h +++ b/kernel/irq/internals.h @@ -13,6 +13,8 @@ # define IRQ_BITMAP_BITS NR_IRQS #endif +#include "settings.h" + #define istate core_internal_state__do_not_mess_with_it extern int noirqdebug; diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c index 6f6644f819dd..8b87f2ce0203 100644 --- a/kernel/irq/irqdesc.c +++ b/kernel/irq/irqdesc.c @@ -79,7 +79,7 @@ static void desc_set_defaults(unsigned int irq, struct irq_desc *desc, int node) desc->irq_data.chip_data = NULL; desc->irq_data.handler_data = NULL; desc->irq_data.msi_desc = NULL; - desc->status = IRQ_DEFAULT_INIT_FLAGS; + desc->status = _IRQ_DEFAULT_INIT_FLAGS; desc->handle_irq = handle_bad_irq; desc->depth = 1; desc->irq_count = 0; @@ -246,7 +246,7 @@ int __init early_irq_init(void) struct irq_desc irq_desc[NR_IRQS] __cacheline_aligned_in_smp = { [0 ... NR_IRQS-1] = { - .status = IRQ_DEFAULT_INIT_FLAGS, + .status = _IRQ_DEFAULT_INIT_FLAGS, .handle_irq = handle_bad_irq, .depth = 1, .lock = __RAW_SPIN_LOCK_UNLOCKED(irq_desc->lock), diff --git a/kernel/irq/settings.h b/kernel/irq/settings.h new file mode 100644 index 000000000000..610f55597ce7 --- /dev/null +++ b/kernel/irq/settings.h @@ -0,0 +1,7 @@ +/* + * Internal header to deal with irq_desc->status which will be renamed + * to irq_desc->settings. + */ +enum { + _IRQ_DEFAULT_INIT_FLAGS = IRQ_DEFAULT_INIT_FLAGS, +}; From bd062e7667ac173afef57fbfe9327f3b914a9d4c Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 7 Feb 2011 20:25:25 +0100 Subject: [PATCH 356/706] genirq: Move IRQ_AUTODETECT to internal state No users outside of core Signed-off-by: Thomas Gleixner --- include/linux/irq.h | 1 - kernel/irq/autoprobe.c | 29 ++++++++++++----------------- kernel/irq/internals.h | 15 +++++++++++++-- kernel/irq/manage.c | 3 ++- 4 files changed, 27 insertions(+), 21 deletions(-) diff --git a/include/linux/irq.h b/include/linux/irq.h index e32b64ccdc89..d1f9c352cd1b 100644 --- a/include/linux/irq.h +++ b/include/linux/irq.h @@ -54,7 +54,6 @@ typedef void (*irq_flow_handler_t)(unsigned int irq, #define IRQ_DISABLED 0x00000200 /* IRQ disabled - do not enter! 
*/ #define IRQ_PENDING 0x00000400 /* IRQ pending - replay on enable */ #define IRQ_REPLAY 0x00000800 /* IRQ has been replayed but not acked yet */ -#define IRQ_AUTODETECT 0x00001000 /* IRQ is being autodetected */ #define IRQ_WAITING 0x00002000 /* IRQ not yet seen - for autodetection */ #define IRQ_LEVEL 0x00004000 /* IRQ level triggered */ #define IRQ_MASKED 0x00008000 /* IRQ masked - shouldn't be seen again */ diff --git a/kernel/irq/autoprobe.c b/kernel/irq/autoprobe.c index 08947cb61725..916e56e10a2e 100644 --- a/kernel/irq/autoprobe.c +++ b/kernel/irq/autoprobe.c @@ -32,7 +32,6 @@ unsigned long probe_irq_on(void) { struct irq_desc *desc; unsigned long mask = 0; - unsigned int status; int i; /* @@ -76,7 +75,8 @@ unsigned long probe_irq_on(void) for_each_irq_desc_reverse(i, desc) { raw_spin_lock_irq(&desc->lock); if (!desc->action && !(desc->status & IRQ_NOPROBE)) { - desc->status |= IRQ_AUTODETECT | IRQ_WAITING; + desc->istate |= IRQS_AUTODETECT; + desc->status |= IRQ_WAITING; if (irq_startup(desc)) desc->status |= IRQ_PENDING; } @@ -93,12 +93,11 @@ unsigned long probe_irq_on(void) */ for_each_irq_desc(i, desc) { raw_spin_lock_irq(&desc->lock); - status = desc->status; - if (status & IRQ_AUTODETECT) { + if (desc->istate & IRQS_AUTODETECT) { /* It triggered already - consider it spurious. */ - if (!(status & IRQ_WAITING)) { - desc->status = status & ~IRQ_AUTODETECT; + if (!(desc->status & IRQ_WAITING)) { + desc->istate &= ~IRQS_AUTODETECT; irq_shutdown(desc); } else if (i < 32) @@ -125,19 +124,17 @@ EXPORT_SYMBOL(probe_irq_on); */ unsigned int probe_irq_mask(unsigned long val) { - unsigned int status, mask = 0; + unsigned int mask = 0; struct irq_desc *desc; int i; for_each_irq_desc(i, desc) { raw_spin_lock_irq(&desc->lock); - status = desc->status; - - if (status & IRQ_AUTODETECT) { - if (i < 16 && !(status & IRQ_WAITING)) + if (desc->istate & IRQS_AUTODETECT) { + if (i < 16 && !(desc->status & IRQ_WAITING)) mask |= 1 << i; - desc->status = status & ~IRQ_AUTODETECT; + desc->istate &= ~IRQS_AUTODETECT; irq_shutdown(desc); } raw_spin_unlock_irq(&desc->lock); @@ -169,19 +166,17 @@ int probe_irq_off(unsigned long val) { int i, irq_found = 0, nr_of_irqs = 0; struct irq_desc *desc; - unsigned int status; for_each_irq_desc(i, desc) { raw_spin_lock_irq(&desc->lock); - status = desc->status; - if (status & IRQ_AUTODETECT) { - if (!(status & IRQ_WAITING)) { + if (desc->istate & IRQS_AUTODETECT) { + if (!(desc->status & IRQ_WAITING)) { if (!nr_of_irqs) irq_found = i; nr_of_irqs++; } - desc->status = status & ~IRQ_AUTODETECT; + desc->istate &= ~IRQS_AUTODETECT; irq_shutdown(desc); } raw_spin_unlock_irq(&desc->lock); diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h index 8f200310a952..7ffd4f439b92 100644 --- a/kernel/irq/internals.h +++ b/kernel/irq/internals.h @@ -33,6 +33,15 @@ enum { IRQTF_AFFINITY, }; +/* + * Bit masks for desc->state + * + * IRQS_AUTODETECT - autodetection in progress + */ +enum { + IRQS_AUTODETECT = 0x00000001, +}; + #define irq_data_to_desc(data) container_of(data, struct irq_desc, irq_data) /* Set default functions for irq_chip structures: */ @@ -98,6 +107,7 @@ static inline void chip_bus_sync_unlock(struct irq_desc *desc) #include #define P(f) if (desc->status & f) printk("%14s set\n", #f) +#define PS(f) if (desc->istate & f) printk("%14s set\n", #f) static inline void print_irq_desc(unsigned int irq, struct irq_desc *desc) { @@ -117,7 +127,6 @@ static inline void print_irq_desc(unsigned int irq, struct irq_desc *desc) P(IRQ_DISABLED); P(IRQ_PENDING); 
P(IRQ_REPLAY); - P(IRQ_AUTODETECT); P(IRQ_WAITING); P(IRQ_LEVEL); P(IRQ_MASKED); @@ -127,7 +136,9 @@ static inline void print_irq_desc(unsigned int irq, struct irq_desc *desc) P(IRQ_NOPROBE); P(IRQ_NOREQUEST); P(IRQ_NOAUTOEN); + + PS(IRQS_AUTODETECT); } #undef P - +#undef PS diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index dd4e5c21b9e7..abe852c9449d 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -897,8 +897,9 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) desc->status |= IRQ_PER_CPU; #endif - desc->status &= ~(IRQ_AUTODETECT | IRQ_WAITING | IRQ_ONESHOT | + desc->status &= ~(IRQ_WAITING | IRQ_ONESHOT | IRQ_INPROGRESS | IRQ_SPURIOUS_DISABLED); + desc->istate &= ~IRQS_AUTODETECT; if (new->flags & IRQF_ONESHOT) desc->status |= IRQ_ONESHOT; From 7acdd53e5b2c55b6f7e3427e85e2f91fa814a4f9 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 7 Feb 2011 20:40:54 +0100 Subject: [PATCH 357/706] genirq: Move IRQ_SPURIOUS_DISABLED to core state No users outside. Signed-off-by: Thomas Gleixner --- include/linux/irq.h | 1 - kernel/irq/internals.h | 3 +++ kernel/irq/manage.c | 9 ++++----- kernel/irq/spurious.c | 8 ++++---- 4 files changed, 11 insertions(+), 10 deletions(-) diff --git a/include/linux/irq.h b/include/linux/irq.h index d1f9c352cd1b..a900741b43ea 100644 --- a/include/linux/irq.h +++ b/include/linux/irq.h @@ -64,7 +64,6 @@ typedef void (*irq_flow_handler_t)(unsigned int irq, #define IRQ_WAKEUP 0x00100000 /* IRQ triggers system wakeup */ #define IRQ_MOVE_PENDING 0x00200000 /* need to re-target IRQ destination */ #define IRQ_NO_BALANCING 0x00400000 /* IRQ is excluded from balancing */ -#define IRQ_SPURIOUS_DISABLED 0x00800000 /* IRQ was disabled by the spurious trap */ #define IRQ_MOVE_PCNTXT 0x01000000 /* IRQ migration from process context */ #define IRQ_AFFINITY_SET 0x02000000 /* IRQ affinity was set from userspace*/ #define IRQ_SUSPENDED 0x04000000 /* IRQ has gone through suspend sequence */ diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h index 7ffd4f439b92..dc5e21b84f9e 100644 --- a/kernel/irq/internals.h +++ b/kernel/irq/internals.h @@ -37,9 +37,12 @@ enum { * Bit masks for desc->state * * IRQS_AUTODETECT - autodetection in progress + * IRQS_SPURIOUS_DISABLED - was disabled due to spurious interrupt + * detection */ enum { IRQS_AUTODETECT = 0x00000001, + IRQS_SPURIOUS_DISABLED = 0x00000002, }; #define irq_data_to_desc(data) container_of(data, struct irq_desc, irq_data) diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index abe852c9449d..5b918ffa46af 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -897,9 +897,8 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) desc->status |= IRQ_PER_CPU; #endif - desc->status &= ~(IRQ_WAITING | IRQ_ONESHOT | - IRQ_INPROGRESS | IRQ_SPURIOUS_DISABLED); - desc->istate &= ~IRQS_AUTODETECT; + desc->status &= ~(IRQ_WAITING | IRQ_ONESHOT | IRQ_INPROGRESS); + desc->istate &= ~(IRQS_AUTODETECT | IRQS_SPURIOUS_DISABLED); if (new->flags & IRQF_ONESHOT) desc->status |= IRQ_ONESHOT; @@ -937,8 +936,8 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) * Check whether we disabled the irq via the spurious handler * before. Reenable it and give it another chance. 
*/ - if (shared && (desc->status & IRQ_SPURIOUS_DISABLED)) { - desc->status &= ~IRQ_SPURIOUS_DISABLED; + if (shared && (desc->istate & IRQS_SPURIOUS_DISABLED)) { + desc->istate &= ~IRQS_SPURIOUS_DISABLED; __enable_irq(desc, irq, false); } diff --git a/kernel/irq/spurious.c b/kernel/irq/spurious.c index bc0620745d5f..2941d8a22df7 100644 --- a/kernel/irq/spurious.c +++ b/kernel/irq/spurious.c @@ -146,15 +146,15 @@ static void poll_spurious_irqs(unsigned long dummy) irq_poll_cpu = smp_processor_id(); for_each_irq_desc(i, desc) { - unsigned int status; + unsigned int state; if (!i) continue; /* Racy but it doesn't matter */ - status = desc->status; + state = desc->istate; barrier(); - if (!(status & IRQ_SPURIOUS_DISABLED)) + if (!(state & IRQS_SPURIOUS_DISABLED)) continue; local_irq_disable(); @@ -298,7 +298,7 @@ void note_interrupt(unsigned int irq, struct irq_desc *desc, * Now kill the IRQ */ printk(KERN_EMERG "Disabling IRQ #%d\n", irq); - desc->status |= IRQ_SPURIOUS_DISABLED; + desc->istate |= IRQS_SPURIOUS_DISABLED; desc->depth++; irq_disable(desc); From 6f91a52d9bb28396177662f1da0f2e2cef9cf5d0 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 14 Feb 2011 13:33:16 +0100 Subject: [PATCH 358/706] genirq: Use modify_status for set_irq_nested_thread No need for a separate function in the core code. Signed-off-by: Thomas Gleixner --- include/linux/irq.h | 16 +++++++++++++--- kernel/irq/chip.c | 28 ---------------------------- 2 files changed, 13 insertions(+), 31 deletions(-) diff --git a/include/linux/irq.h b/include/linux/irq.h index a900741b43ea..67b77cfb2a34 100644 --- a/include/linux/irq.h +++ b/include/linux/irq.h @@ -74,7 +74,7 @@ typedef void (*irq_flow_handler_t)(unsigned int irq, #define IRQF_MODIFY_MASK \ (IRQ_TYPE_SENSE_MASK | IRQ_NOPROBE | IRQ_NOREQUEST | \ IRQ_NOAUTOEN | IRQ_MOVE_PCNTXT | IRQ_LEVEL | IRQ_NO_BALANCING | \ - IRQ_PER_CPU) + IRQ_PER_CPU | IRQ_NESTED_THREAD) #ifdef CONFIG_IRQ_PER_CPU # define CHECK_IRQ_PER_CPU(var) ((var) & IRQ_PER_CPU) @@ -307,8 +307,6 @@ set_irq_chained_handler(unsigned int irq, irq_flow_handler_t handle) __set_irq_handler(irq, handle, 1, NULL); } -extern void set_irq_nested_thread(unsigned int irq, int nest); - void irq_modify_status(unsigned int irq, unsigned long clr, unsigned long set); static inline void irq_set_status_flags(unsigned int irq, unsigned long set) @@ -331,6 +329,14 @@ static inline void irq_set_probe(unsigned int irq) irq_modify_status(irq, IRQ_NOPROBE, 0); } +static inline void irq_set_nested_thread(unsigned int irq, bool nest) +{ + if (nest) + irq_set_status_flags(irq, IRQ_NESTED_THREAD); + else + irq_clear_status_flags(irq, IRQ_NESTED_THREAD); +} + /* Handle dynamic irq creation and destruction */ extern unsigned int create_irq_nr(unsigned int irq_want, int node); extern int create_irq(void); @@ -448,6 +454,10 @@ static inline void set_irq_probe(unsigned int irq) { irq_set_probe(irq); } +static inline void set_irq_nested_thread(unsigned int irq, int nest) +{ + irq_set_nested_thread(irq, nest); +} #endif int irq_alloc_descs(int irq, unsigned int from, unsigned int cnt, int node); diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index 143eb2a9fa4e..bff21f233a02 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -164,34 +164,6 @@ struct irq_data *irq_get_irq_data(unsigned int irq) } EXPORT_SYMBOL_GPL(irq_get_irq_data); -/** - * set_irq_nested_thread - Set/Reset the IRQ_NESTED_THREAD flag of an irq - * - * @irq: Interrupt number - * @nest: 0 to clear / 1 to set the IRQ_NESTED_THREAD flag - * - * The 
IRQ_NESTED_THREAD flag indicates that on - * request_threaded_irq() no separate interrupt thread should be - * created for the irq as the handler are called nested in the - * context of a demultiplexing interrupt handler thread. - */ -void set_irq_nested_thread(unsigned int irq, int nest) -{ - struct irq_desc *desc = irq_to_desc(irq); - unsigned long flags; - - if (!desc) - return; - - raw_spin_lock_irqsave(&desc->lock, flags); - if (nest) - desc->status |= IRQ_NESTED_THREAD; - else - desc->status &= ~IRQ_NESTED_THREAD; - raw_spin_unlock_irqrestore(&desc->lock, flags); -} -EXPORT_SYMBOL_GPL(set_irq_nested_thread); - int irq_startup(struct irq_desc *desc) { desc->status &= ~IRQ_DISABLED; From 6954b75b488dd740950573f244ddd66fd28620aa Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 7 Feb 2011 20:55:35 +0100 Subject: [PATCH 359/706] genirq: Move IRQ_POLL_INPROGRESS to core No users outside of core. Signed-off-by: Thomas Gleixner --- include/linux/irq.h | 1 - kernel/irq/chip.c | 2 +- kernel/irq/internals.h | 2 ++ kernel/irq/spurious.c | 6 +++--- 4 files changed, 6 insertions(+), 5 deletions(-) diff --git a/include/linux/irq.h b/include/linux/irq.h index 67b77cfb2a34..047a695511df 100644 --- a/include/linux/irq.h +++ b/include/linux/irq.h @@ -69,7 +69,6 @@ typedef void (*irq_flow_handler_t)(unsigned int irq, #define IRQ_SUSPENDED 0x04000000 /* IRQ has gone through suspend sequence */ #define IRQ_ONESHOT 0x08000000 /* IRQ is not unmasked after hardirq */ #define IRQ_NESTED_THREAD 0x10000000 /* IRQ is nested into another, no own handler thread */ -#define IRQ_POLL_INPROGRESS 0x20000000 /* IRQ poll is in progress */ #define IRQF_MODIFY_MASK \ (IRQ_TYPE_SENSE_MASK | IRQ_NOPROBE | IRQ_NOREQUEST | \ diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index bff21f233a02..34245e7d1213 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -400,7 +400,7 @@ EXPORT_SYMBOL_GPL(handle_nested_irq); static bool irq_check_poll(struct irq_desc *desc) { - if (!(desc->status & IRQ_POLL_INPROGRESS)) + if (!(desc->istate & IRQS_POLL_INPROGRESS)) return false; return irq_wait_for_poll(desc); } diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h index dc5e21b84f9e..f5d28e1e1eda 100644 --- a/kernel/irq/internals.h +++ b/kernel/irq/internals.h @@ -39,10 +39,12 @@ enum { * IRQS_AUTODETECT - autodetection in progress * IRQS_SPURIOUS_DISABLED - was disabled due to spurious interrupt * detection + * IRQS_POLL_INPROGRESS - polling in progress */ enum { IRQS_AUTODETECT = 0x00000001, IRQS_SPURIOUS_DISABLED = 0x00000002, + IRQS_POLL_INPROGRESS = 0x00000008, }; #define irq_data_to_desc(data) container_of(data, struct irq_desc, irq_data) diff --git a/kernel/irq/spurious.c b/kernel/irq/spurious.c index 2941d8a22df7..21c46178b1a6 100644 --- a/kernel/irq/spurious.c +++ b/kernel/irq/spurious.c @@ -98,13 +98,13 @@ static int try_one_irq(int irq, struct irq_desc *desc, bool force) } /* Mark it poll in progress */ - desc->status |= IRQ_POLL_INPROGRESS; + desc->istate |= IRQS_POLL_INPROGRESS; do { if (handle_irq_event(desc) == IRQ_HANDLED) ret = IRQ_HANDLED; action = desc->action; } while ((desc->status & IRQ_PENDING) && action); - desc->status &= ~IRQ_POLL_INPROGRESS; + desc->istate &= ~IRQS_POLL_INPROGRESS; out: raw_spin_unlock(&desc->lock); return ret == IRQ_HANDLED; @@ -259,7 +259,7 @@ try_misrouted_irq(unsigned int irq, struct irq_desc *desc, void note_interrupt(unsigned int irq, struct irq_desc *desc, irqreturn_t action_ret) { - if (desc->status & IRQ_POLL_INPROGRESS) + if (desc->istate & IRQS_POLL_INPROGRESS) 
return; if (unlikely(action_ret != IRQ_HANDLED)) { From 009b4c3b8ad584b3462734127a5bec680d5d6af4 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 7 Feb 2011 21:48:49 +0100 Subject: [PATCH 360/706] genirq: Add IRQ_INPROGRESS to core We need to maintain the flag for now in both fields status and istate. Add a CONFIG_GENERIC_HARDIRQS_NO_COMPAT switch to allow testing w/o the status one. Wrap the access to status IRQ_INPROGRESS in a inline which can be turned of with CONFIG_GENERIC_HARDIRQS_NO_COMPAT along with the define. There is no reason that anything outside of core looks at this. That needs some modifications, but we'll get there. Signed-off-by: Thomas Gleixner --- include/linux/irq.h | 6 +++++- kernel/irq/Kconfig | 3 +++ kernel/irq/chip.c | 16 +++++++++------- kernel/irq/compat.h | 17 +++++++++++++++++ kernel/irq/handle.c | 6 ++++-- kernel/irq/internals.h | 5 ++++- kernel/irq/manage.c | 17 +++++++++-------- kernel/irq/settings.h | 3 +++ kernel/irq/spurious.c | 6 +++--- 9 files changed, 57 insertions(+), 22 deletions(-) create mode 100644 kernel/irq/compat.h diff --git a/include/linux/irq.h b/include/linux/irq.h index 047a695511df..274590fc55a3 100644 --- a/include/linux/irq.h +++ b/include/linux/irq.h @@ -50,7 +50,11 @@ typedef void (*irq_flow_handler_t)(unsigned int irq, #define IRQ_TYPE_PROBE 0x00000010 /* Probing in progress */ /* Internal flags */ -#define IRQ_INPROGRESS 0x00000100 /* IRQ handler active - do not enter! */ + +#ifndef CONFIG_GENERIC_HARDIRQS_NO_COMPAT +#define IRQ_INPROGRESS 0x00000100 /* DEPRECATED */ +#endif + #define IRQ_DISABLED 0x00000200 /* IRQ disabled - do not enter! */ #define IRQ_PENDING 0x00000400 /* IRQ pending - replay on enable */ #define IRQ_REPLAY 0x00000800 /* IRQ has been replayed but not acked yet */ diff --git a/kernel/irq/Kconfig b/kernel/irq/Kconfig index 4cd5d7135e0f..9e2256de1d1a 100644 --- a/kernel/irq/Kconfig +++ b/kernel/irq/Kconfig @@ -13,6 +13,9 @@ config GENERIC_HARDIRQS config GENERIC_HARDIRQS_NO_DEPRECATED def_bool n +config GENERIC_HARDIRQS_NO_COMPAT + def_bool n + # Options selectable by the architecture code config HAVE_SPARSE_IRQ def_bool n diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index 34245e7d1213..075385549dcd 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -383,7 +383,8 @@ void handle_nested_irq(unsigned int irq) if (unlikely(!action || (desc->status & IRQ_DISABLED))) goto out_unlock; - desc->status |= IRQ_INPROGRESS; + irq_compat_set_progress(desc); + desc->istate |= IRQS_INPROGRESS; raw_spin_unlock_irq(&desc->lock); action_ret = action->thread_fn(action->irq, action->dev_id); @@ -391,7 +392,8 @@ void handle_nested_irq(unsigned int irq) note_interrupt(irq, desc, action_ret); raw_spin_lock_irq(&desc->lock); - desc->status &= ~IRQ_INPROGRESS; + desc->istate &= ~IRQS_INPROGRESS; + irq_compat_clr_progress(desc); out_unlock: raw_spin_unlock_irq(&desc->lock); @@ -422,7 +424,7 @@ handle_simple_irq(unsigned int irq, struct irq_desc *desc) { raw_spin_lock(&desc->lock); - if (unlikely(desc->status & IRQ_INPROGRESS)) + if (unlikely(desc->istate & IRQS_INPROGRESS)) if (!irq_check_poll(desc)) goto out_unlock; @@ -454,7 +456,7 @@ handle_level_irq(unsigned int irq, struct irq_desc *desc) raw_spin_lock(&desc->lock); mask_ack_irq(desc); - if (unlikely(desc->status & IRQ_INPROGRESS)) + if (unlikely(desc->istate & IRQS_INPROGRESS)) if (!irq_check_poll(desc)) goto out_unlock; @@ -492,7 +494,7 @@ handle_fasteoi_irq(unsigned int irq, struct irq_desc *desc) { raw_spin_lock(&desc->lock); - if (unlikely(desc->status & 
IRQ_INPROGRESS)) + if (unlikely(desc->istate & IRQS_INPROGRESS)) if (!irq_check_poll(desc)) goto out; @@ -542,8 +544,8 @@ handle_edge_irq(unsigned int irq, struct irq_desc *desc) * we shouldn't process the IRQ. Mark it pending, handle * the necessary masking and go out */ - if (unlikely((desc->status & (IRQ_INPROGRESS | IRQ_DISABLED)) || - !desc->action)) { + if (unlikely((desc->istate & (IRQS_INPROGRESS) || + (desc->status & IRQ_DISABLED) || !desc->action))) { if (!irq_check_poll(desc)) { desc->status |= IRQ_PENDING; mask_ack_irq(desc); diff --git a/kernel/irq/compat.h b/kernel/irq/compat.h new file mode 100644 index 000000000000..aac6e400e608 --- /dev/null +++ b/kernel/irq/compat.h @@ -0,0 +1,17 @@ +/* + * Compat layer for transition period + */ +#ifndef CONFIG_GENERIC_HARDIRQS_NO_COMPAT +static inline void irq_compat_set_progress(struct irq_desc *desc) +{ + desc->status |= IRQ_INPROGRESS; +} + +static inline void irq_compat_clr_progress(struct irq_desc *desc) +{ + desc->status &= ~IRQ_INPROGRESS; +} +#else +static inline void irq_compat_set_progress(struct irq_desc *desc) { } +static inline void irq_compat_clr_progress(struct irq_desc *desc) { } +#endif diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c index ff40e0f5e2e2..d4ae0b1ccc00 100644 --- a/kernel/irq/handle.c +++ b/kernel/irq/handle.c @@ -123,13 +123,15 @@ irqreturn_t handle_irq_event(struct irq_desc *desc) irqreturn_t ret; desc->status &= ~IRQ_PENDING; - desc->status |= IRQ_INPROGRESS; + irq_compat_set_progress(desc); + desc->istate |= IRQS_INPROGRESS; raw_spin_unlock(&desc->lock); ret = handle_irq_event_percpu(desc, action); raw_spin_lock(&desc->lock); - desc->status &= ~IRQ_INPROGRESS; + desc->istate &= ~IRQS_INPROGRESS; + irq_compat_clr_progress(desc); return ret; } diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h index f5d28e1e1eda..d1cb1f8df6fe 100644 --- a/kernel/irq/internals.h +++ b/kernel/irq/internals.h @@ -13,6 +13,7 @@ # define IRQ_BITMAP_BITS NR_IRQS #endif +#include "compat.h" #include "settings.h" #define istate core_internal_state__do_not_mess_with_it @@ -40,11 +41,13 @@ enum { * IRQS_SPURIOUS_DISABLED - was disabled due to spurious interrupt * detection * IRQS_POLL_INPROGRESS - polling in progress + * IRQS_INPROGRESS - Interrupt in progress */ enum { IRQS_AUTODETECT = 0x00000001, IRQS_SPURIOUS_DISABLED = 0x00000002, IRQS_POLL_INPROGRESS = 0x00000008, + IRQS_INPROGRESS = 0x00000010, }; #define irq_data_to_desc(data) container_of(data, struct irq_desc, irq_data) @@ -128,7 +131,6 @@ static inline void print_irq_desc(unsigned int irq, struct irq_desc *desc) print_symbol("%s\n", (unsigned long)desc->action->handler); } - P(IRQ_INPROGRESS); P(IRQ_DISABLED); P(IRQ_PENDING); P(IRQ_REPLAY); @@ -143,6 +145,7 @@ static inline void print_irq_desc(unsigned int irq, struct irq_desc *desc) P(IRQ_NOAUTOEN); PS(IRQS_AUTODETECT); + PS(IRQS_INPROGRESS); } #undef P diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 5b918ffa46af..7e5a50825088 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -30,7 +30,7 @@ void synchronize_irq(unsigned int irq) { struct irq_desc *desc = irq_to_desc(irq); - unsigned int status; + unsigned int state; if (!desc) return; @@ -42,16 +42,16 @@ void synchronize_irq(unsigned int irq) * Wait until we're out of the critical section. This might * give the wrong answer due to the lack of memory barriers. */ - while (desc->status & IRQ_INPROGRESS) + while (desc->istate & IRQS_INPROGRESS) cpu_relax(); /* Ok, that indicated we're done: double-check carefully. 
*/ raw_spin_lock_irqsave(&desc->lock, flags); - status = desc->status; + state = desc->istate; raw_spin_unlock_irqrestore(&desc->lock, flags); /* Oops, that failed? */ - } while (status & IRQ_INPROGRESS); + } while (state & IRQS_INPROGRESS); /* * We made sure that no hardirq handler is running. Now verify @@ -637,9 +637,9 @@ again: * The thread is faster done than the hard interrupt handler * on the other CPU. If we unmask the irq line then the * interrupt can come in again and masks the line, leaves due - * to IRQ_INPROGRESS and the irq line is masked forever. + * to IRQS_INPROGRESS and the irq line is masked forever. */ - if (unlikely(desc->status & IRQ_INPROGRESS)) { + if (unlikely(desc->istate & IRQS_INPROGRESS)) { raw_spin_unlock_irq(&desc->lock); chip_bus_sync_unlock(desc); cpu_relax(); @@ -897,8 +897,9 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) desc->status |= IRQ_PER_CPU; #endif - desc->status &= ~(IRQ_WAITING | IRQ_ONESHOT | IRQ_INPROGRESS); - desc->istate &= ~(IRQS_AUTODETECT | IRQS_SPURIOUS_DISABLED); + desc->status &= ~(IRQ_WAITING | IRQ_ONESHOT); + desc->istate &= ~(IRQS_AUTODETECT | IRQS_SPURIOUS_DISABLED | \ + IRQS_INPROGRESS); if (new->flags & IRQF_ONESHOT) desc->status |= IRQ_ONESHOT; diff --git a/kernel/irq/settings.h b/kernel/irq/settings.h index 610f55597ce7..a96140eea409 100644 --- a/kernel/irq/settings.h +++ b/kernel/irq/settings.h @@ -5,3 +5,6 @@ enum { _IRQ_DEFAULT_INIT_FLAGS = IRQ_DEFAULT_INIT_FLAGS, }; + +#undef IRQ_INPROGRESS +#define IRQ_INPROGRESS GOT_YOU_MORON diff --git a/kernel/irq/spurious.c b/kernel/irq/spurious.c index 21c46178b1a6..51504837d8cc 100644 --- a/kernel/irq/spurious.c +++ b/kernel/irq/spurious.c @@ -45,10 +45,10 @@ bool irq_wait_for_poll(struct irq_desc *desc) #ifdef CONFIG_SMP do { raw_spin_unlock(&desc->lock); - while (desc->status & IRQ_INPROGRESS) + while (desc->istate & IRQS_INPROGRESS) cpu_relax(); raw_spin_lock(&desc->lock); - } while (desc->status & IRQ_INPROGRESS); + } while (desc->istate & IRQS_INPROGRESS); /* Might have been disabled in meantime */ return !(desc->status & IRQ_DISABLED) && desc->action; #else @@ -88,7 +88,7 @@ static int try_one_irq(int irq, struct irq_desc *desc, bool force) goto out; /* Already running on another processor */ - if (desc->status & IRQ_INPROGRESS) { + if (desc->istate & IRQS_INPROGRESS) { /* * Already running: If it is shared get the other * CPU to go looking for our mystery interrupt too From 3d67baec7f1b01fc289ac1a2f1a7e6d5e43391c6 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 7 Feb 2011 21:02:10 +0100 Subject: [PATCH 361/706] genirq: Move IRQ_ONESHOT to core No users outside of core. 
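For illustration, the conversion pattern at a typical use site: the unmask check in handle_level_irq(), condensed here from the diff below (before/after, nothing beyond what the patch itself changes):

        /* before: ONESHOT tracked in the public status field */
        if (!(desc->status & (IRQ_DISABLED | IRQ_ONESHOT)))
                unmask_irq(desc);

        /* after: ONESHOT lives in the core-only istate field */
        if (!(desc->status & IRQ_DISABLED) && !(desc->istate & IRQS_ONESHOT))
                unmask_irq(desc);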
Signed-off-by: Thomas Gleixner --- include/linux/irq.h | 1 - kernel/irq/chip.c | 2 +- kernel/irq/internals.h | 2 ++ kernel/irq/manage.c | 8 ++++---- 4 files changed, 7 insertions(+), 6 deletions(-) diff --git a/include/linux/irq.h b/include/linux/irq.h index 274590fc55a3..1a4c723e74e1 100644 --- a/include/linux/irq.h +++ b/include/linux/irq.h @@ -71,7 +71,6 @@ typedef void (*irq_flow_handler_t)(unsigned int irq, #define IRQ_MOVE_PCNTXT 0x01000000 /* IRQ migration from process context */ #define IRQ_AFFINITY_SET 0x02000000 /* IRQ affinity was set from userspace*/ #define IRQ_SUSPENDED 0x04000000 /* IRQ has gone through suspend sequence */ -#define IRQ_ONESHOT 0x08000000 /* IRQ is not unmasked after hardirq */ #define IRQ_NESTED_THREAD 0x10000000 /* IRQ is nested into another, no own handler thread */ #define IRQF_MODIFY_MASK \ diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index 075385549dcd..420fa6bdb117 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -472,7 +472,7 @@ handle_level_irq(unsigned int irq, struct irq_desc *desc) handle_irq_event(desc); - if (!(desc->status & (IRQ_DISABLED | IRQ_ONESHOT))) + if (!(desc->status & IRQ_DISABLED) && !(desc->istate & IRQS_ONESHOT)) unmask_irq(desc); out_unlock: raw_spin_unlock(&desc->lock); diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h index d1cb1f8df6fe..36563f731ff8 100644 --- a/kernel/irq/internals.h +++ b/kernel/irq/internals.h @@ -42,12 +42,14 @@ enum { * detection * IRQS_POLL_INPROGRESS - polling in progress * IRQS_INPROGRESS - Interrupt in progress + * IRQS_ONESHOT - irq is not unmasked in primary handler */ enum { IRQS_AUTODETECT = 0x00000001, IRQS_SPURIOUS_DISABLED = 0x00000002, IRQS_POLL_INPROGRESS = 0x00000008, IRQS_INPROGRESS = 0x00000010, + IRQS_ONESHOT = 0x00000020, }; #define irq_data_to_desc(data) container_of(data, struct irq_desc, irq_data) diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 7e5a50825088..aca4208a03ac 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -697,7 +697,7 @@ static int irq_thread(void *data) }; struct irqaction *action = data; struct irq_desc *desc = irq_to_desc(action->irq); - int wake, oneshot = desc->status & IRQ_ONESHOT; + int wake, oneshot = desc->istate & IRQS_ONESHOT; sched_setscheduler(current, SCHED_FIFO, ¶m); current->irqaction = action; @@ -897,12 +897,12 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) desc->status |= IRQ_PER_CPU; #endif - desc->status &= ~(IRQ_WAITING | IRQ_ONESHOT); + desc->status &= ~IRQ_WAITING; desc->istate &= ~(IRQS_AUTODETECT | IRQS_SPURIOUS_DISABLED | \ - IRQS_INPROGRESS); + IRQS_INPROGRESS | IRQS_ONESHOT); if (new->flags & IRQF_ONESHOT) - desc->status |= IRQ_ONESHOT; + desc->istate |= IRQS_ONESHOT; if (!(desc->status & IRQ_NOAUTOEN)) irq_startup(desc); From 163ef3091195f514a06f064b12914597d2644c55 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 8 Feb 2011 11:39:15 +0100 Subject: [PATCH 362/706] genirq: Move IRQ_REPLAY and IRQ_WAITING to core No users outside of core. 
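The autoprobe code is the main consumer of the WAITING bit. Condensed from the autoprobe.c hunks below, the detection sequence now reads roughly:

        /* probe_irq_on(): arm detection on unclaimed lines */
        desc->istate |= IRQS_AUTODETECT | IRQS_WAITING;

        /* probe_irq_off(): an incoming interrupt clears IRQS_WAITING,
         * so a cleared bit identifies the line that fired */
        if (desc->istate & IRQS_AUTODETECT) {
                if (!(desc->istate & IRQS_WAITING)) {
                        if (!nr_of_irqs)
                                irq_found = i;
                        nr_of_irqs++;
                }
        }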
Signed-off-by: Thomas Gleixner --- include/linux/irq.h | 9 +++++---- kernel/irq/autoprobe.c | 11 +++++------ kernel/irq/chip.c | 9 ++++----- kernel/irq/internals.h | 8 ++++++-- kernel/irq/manage.c | 4 ++-- kernel/irq/resend.c | 7 +++++-- kernel/irq/settings.h | 4 ++++ 7 files changed, 31 insertions(+), 21 deletions(-) diff --git a/include/linux/irq.h b/include/linux/irq.h index 1a4c723e74e1..c38dbd506656 100644 --- a/include/linux/irq.h +++ b/include/linux/irq.h @@ -53,12 +53,13 @@ typedef void (*irq_flow_handler_t)(unsigned int irq, #ifndef CONFIG_GENERIC_HARDIRQS_NO_COMPAT #define IRQ_INPROGRESS 0x00000100 /* DEPRECATED */ +#define IRQ_REPLAY 0x00000200 /* DEPRECATED */ +#define IRQ_WAITING 0x00000400 /* DEPRECATED */ #endif -#define IRQ_DISABLED 0x00000200 /* IRQ disabled - do not enter! */ -#define IRQ_PENDING 0x00000400 /* IRQ pending - replay on enable */ -#define IRQ_REPLAY 0x00000800 /* IRQ has been replayed but not acked yet */ -#define IRQ_WAITING 0x00002000 /* IRQ not yet seen - for autodetection */ +#define IRQ_DISABLED 0x00000800 /* IRQ disabled - do not enter! */ +#define IRQ_PENDING 0x00001000 /* IRQ pending - replay on enable */ + #define IRQ_LEVEL 0x00004000 /* IRQ level triggered */ #define IRQ_MASKED 0x00008000 /* IRQ masked - shouldn't be seen again */ #define IRQ_PER_CPU 0x00010000 /* IRQ is per CPU */ diff --git a/kernel/irq/autoprobe.c b/kernel/irq/autoprobe.c index 916e56e10a2e..9ea8bb99f7c1 100644 --- a/kernel/irq/autoprobe.c +++ b/kernel/irq/autoprobe.c @@ -17,7 +17,7 @@ /* * Autodetection depends on the fact that any interrupt that * comes in on to an unassigned handler will get stuck with - * "IRQ_WAITING" cleared and the interrupt disabled. + * "IRQS_WAITING" cleared and the interrupt disabled. */ static DEFINE_MUTEX(probing_active); @@ -75,8 +75,7 @@ unsigned long probe_irq_on(void) for_each_irq_desc_reverse(i, desc) { raw_spin_lock_irq(&desc->lock); if (!desc->action && !(desc->status & IRQ_NOPROBE)) { - desc->istate |= IRQS_AUTODETECT; - desc->status |= IRQ_WAITING; + desc->istate |= IRQS_AUTODETECT | IRQS_WAITING; if (irq_startup(desc)) desc->status |= IRQ_PENDING; } @@ -96,7 +95,7 @@ unsigned long probe_irq_on(void) if (desc->istate & IRQS_AUTODETECT) { /* It triggered already - consider it spurious. 
*/ - if (!(desc->status & IRQ_WAITING)) { + if (!(desc->istate & IRQS_WAITING)) { desc->istate &= ~IRQS_AUTODETECT; irq_shutdown(desc); } else @@ -131,7 +130,7 @@ unsigned int probe_irq_mask(unsigned long val) for_each_irq_desc(i, desc) { raw_spin_lock_irq(&desc->lock); if (desc->istate & IRQS_AUTODETECT) { - if (i < 16 && !(desc->status & IRQ_WAITING)) + if (i < 16 && !(desc->istate & IRQS_WAITING)) mask |= 1 << i; desc->istate &= ~IRQS_AUTODETECT; @@ -171,7 +170,7 @@ int probe_irq_off(unsigned long val) raw_spin_lock_irq(&desc->lock); if (desc->istate & IRQS_AUTODETECT) { - if (!(desc->status & IRQ_WAITING)) { + if (!(desc->istate & IRQS_WAITING)) { if (!nr_of_irqs) irq_found = i; nr_of_irqs++; diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index 420fa6bdb117..59ae14527ecd 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -428,7 +428,7 @@ handle_simple_irq(unsigned int irq, struct irq_desc *desc) if (!irq_check_poll(desc)) goto out_unlock; - desc->status &= ~(IRQ_REPLAY | IRQ_WAITING); + desc->istate &= ~(IRQS_REPLAY | IRQS_WAITING); kstat_incr_irqs_this_cpu(irq, desc); if (unlikely(!desc->action || (desc->status & IRQ_DISABLED))) @@ -460,7 +460,7 @@ handle_level_irq(unsigned int irq, struct irq_desc *desc) if (!irq_check_poll(desc)) goto out_unlock; - desc->status &= ~(IRQ_REPLAY | IRQ_WAITING); + desc->istate &= ~(IRQS_REPLAY | IRQS_WAITING); kstat_incr_irqs_this_cpu(irq, desc); /* @@ -498,7 +498,7 @@ handle_fasteoi_irq(unsigned int irq, struct irq_desc *desc) if (!irq_check_poll(desc)) goto out; - desc->status &= ~(IRQ_REPLAY | IRQ_WAITING); + desc->istate &= ~(IRQS_REPLAY | IRQS_WAITING); kstat_incr_irqs_this_cpu(irq, desc); /* @@ -537,8 +537,7 @@ handle_edge_irq(unsigned int irq, struct irq_desc *desc) { raw_spin_lock(&desc->lock); - desc->status &= ~(IRQ_REPLAY | IRQ_WAITING); - + desc->istate &= ~(IRQS_REPLAY | IRQS_WAITING); /* * If we're currently running this IRQ, or its disabled, * we shouldn't process the IRQ. 
Mark it pending, handle diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h index 36563f731ff8..54037533af7a 100644 --- a/kernel/irq/internals.h +++ b/kernel/irq/internals.h @@ -43,6 +43,8 @@ enum { * IRQS_POLL_INPROGRESS - polling in progress * IRQS_INPROGRESS - Interrupt in progress * IRQS_ONESHOT - irq is not unmasked in primary handler + * IRQS_REPLAY - irq is replayed + * IRQS_WAITING - irq is waiting */ enum { IRQS_AUTODETECT = 0x00000001, @@ -50,6 +52,8 @@ enum { IRQS_POLL_INPROGRESS = 0x00000008, IRQS_INPROGRESS = 0x00000010, IRQS_ONESHOT = 0x00000020, + IRQS_REPLAY = 0x00000040, + IRQS_WAITING = 0x00000080, }; #define irq_data_to_desc(data) container_of(data, struct irq_desc, irq_data) @@ -135,8 +139,6 @@ static inline void print_irq_desc(unsigned int irq, struct irq_desc *desc) P(IRQ_DISABLED); P(IRQ_PENDING); - P(IRQ_REPLAY); - P(IRQ_WAITING); P(IRQ_LEVEL); P(IRQ_MASKED); #ifdef CONFIG_IRQ_PER_CPU @@ -148,6 +150,8 @@ static inline void print_irq_desc(unsigned int irq, struct irq_desc *desc) PS(IRQS_AUTODETECT); PS(IRQS_INPROGRESS); + PS(IRQS_REPLAY); + PS(IRQS_WAITING); } #undef P diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index aca4208a03ac..7971df53d6a9 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -897,9 +897,9 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) desc->status |= IRQ_PER_CPU; #endif - desc->status &= ~IRQ_WAITING; desc->istate &= ~(IRQS_AUTODETECT | IRQS_SPURIOUS_DISABLED | \ - IRQS_INPROGRESS | IRQS_ONESHOT); + IRQS_INPROGRESS | IRQS_ONESHOT | \ + IRQS_WAITING); if (new->flags & IRQF_ONESHOT) desc->istate |= IRQS_ONESHOT; diff --git a/kernel/irq/resend.c b/kernel/irq/resend.c index 60b20261041b..f83387cd11f3 100644 --- a/kernel/irq/resend.c +++ b/kernel/irq/resend.c @@ -62,8 +62,11 @@ void check_irq_resend(struct irq_desc *desc, unsigned int irq) */ if (desc->status & IRQ_LEVEL) return; - if ((desc->status & (IRQ_PENDING | IRQ_REPLAY)) == IRQ_PENDING) { - desc->status = (desc->status & ~IRQ_PENDING) | IRQ_REPLAY; + if (desc->istate & IRQS_REPLAY) + return; + if (desc->status & IRQ_PENDING) { + desc->status &= ~IRQ_PENDING; + desc->istate |= IRQS_REPLAY; if (!desc->irq_data.chip->irq_retrigger || !desc->irq_data.chip->irq_retrigger(&desc->irq_data)) { diff --git a/kernel/irq/settings.h b/kernel/irq/settings.h index a96140eea409..2e7d08fff0ce 100644 --- a/kernel/irq/settings.h +++ b/kernel/irq/settings.h @@ -8,3 +8,7 @@ enum { #undef IRQ_INPROGRESS #define IRQ_INPROGRESS GOT_YOU_MORON +#undef IRQ_REPLAY +#define IRQ_REPLAY GOT_YOU_MORON +#undef IRQ_WAITING +#define IRQ_WAITING GOT_YOU_MORON From c1594b77e46124bb462f961e536120e471c67446 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 7 Feb 2011 22:11:30 +0100 Subject: [PATCH 363/706] genirq: Move IRQ_DISABLED to core Keep status in sync until all abusers are fixed. 
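The status field is kept in sync through small state helpers which flip the core bit and mirror it via the compat inlines; from the chip.c hunk below:

        static void irq_state_set_disabled(struct irq_desc *desc)
        {
                desc->istate |= IRQS_DISABLED;
                irq_compat_set_disabled(desc);  /* mirror into desc->status */
        }

Core code now tests IRQS_DISABLED everywhere, while unfixed users outside of it still see a correct IRQ_DISABLED.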
Signed-off-by: Thomas Gleixner --- include/linux/irq.h | 4 ++-- kernel/irq/chip.c | 48 +++++++++++++++++++++++++++--------------- kernel/irq/compat.h | 12 +++++++++++ kernel/irq/internals.h | 4 +++- kernel/irq/irqdesc.c | 2 ++ kernel/irq/manage.c | 4 ++-- kernel/irq/migration.c | 2 +- kernel/irq/settings.h | 2 ++ kernel/irq/spurious.c | 4 ++-- 9 files changed, 57 insertions(+), 25 deletions(-) diff --git a/include/linux/irq.h b/include/linux/irq.h index c38dbd506656..32efca71ce88 100644 --- a/include/linux/irq.h +++ b/include/linux/irq.h @@ -55,9 +55,9 @@ typedef void (*irq_flow_handler_t)(unsigned int irq, #define IRQ_INPROGRESS 0x00000100 /* DEPRECATED */ #define IRQ_REPLAY 0x00000200 /* DEPRECATED */ #define IRQ_WAITING 0x00000400 /* DEPRECATED */ +#define IRQ_DISABLED 0x00000800 /* DEPRECATED */ #endif -#define IRQ_DISABLED 0x00000800 /* IRQ disabled - do not enter! */ #define IRQ_PENDING 0x00001000 /* IRQ pending - replay on enable */ #define IRQ_LEVEL 0x00004000 /* IRQ level triggered */ @@ -231,7 +231,7 @@ struct irq_chip { # define ARCH_IRQ_INIT_FLAGS 0 #endif -#define IRQ_DEFAULT_INIT_FLAGS (IRQ_DISABLED | ARCH_IRQ_INIT_FLAGS) +#define IRQ_DEFAULT_INIT_FLAGS ARCH_IRQ_INIT_FLAGS struct irqaction; extern int setup_irq(unsigned int irq, struct irqaction *new); diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index 59ae14527ecd..527df7ab1b05 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -164,9 +164,21 @@ struct irq_data *irq_get_irq_data(unsigned int irq) } EXPORT_SYMBOL_GPL(irq_get_irq_data); +static void irq_state_clr_disabled(struct irq_desc *desc) +{ + desc->istate &= ~IRQS_DISABLED; + irq_compat_clr_disabled(desc); +} + +static void irq_state_set_disabled(struct irq_desc *desc) +{ + desc->istate |= IRQS_DISABLED; + irq_compat_set_disabled(desc); +} + int irq_startup(struct irq_desc *desc) { - desc->status &= ~IRQ_DISABLED; + irq_state_clr_disabled(desc); desc->depth = 0; if (desc->irq_data.chip->irq_startup) { @@ -181,7 +193,7 @@ int irq_startup(struct irq_desc *desc) void irq_shutdown(struct irq_desc *desc) { - desc->status |= IRQ_DISABLED; + irq_state_set_disabled(desc); desc->depth = 1; if (desc->irq_data.chip->irq_shutdown) desc->irq_data.chip->irq_shutdown(&desc->irq_data); @@ -194,7 +206,7 @@ void irq_shutdown(struct irq_desc *desc) void irq_enable(struct irq_desc *desc) { - desc->status &= ~IRQ_DISABLED; + irq_state_clr_disabled(desc); if (desc->irq_data.chip->irq_enable) desc->irq_data.chip->irq_enable(&desc->irq_data); else @@ -204,7 +216,7 @@ void irq_enable(struct irq_desc *desc) void irq_disable(struct irq_desc *desc) { - desc->status |= IRQ_DISABLED; + irq_state_set_disabled(desc); if (desc->irq_data.chip->irq_disable) { desc->irq_data.chip->irq_disable(&desc->irq_data); desc->status |= IRQ_MASKED; @@ -380,7 +392,7 @@ void handle_nested_irq(unsigned int irq) kstat_incr_irqs_this_cpu(irq, desc); action = desc->action; - if (unlikely(!action || (desc->status & IRQ_DISABLED))) + if (unlikely(!action || (desc->istate & IRQS_DISABLED))) goto out_unlock; irq_compat_set_progress(desc); @@ -431,7 +443,7 @@ handle_simple_irq(unsigned int irq, struct irq_desc *desc) desc->istate &= ~(IRQS_REPLAY | IRQS_WAITING); kstat_incr_irqs_this_cpu(irq, desc); - if (unlikely(!desc->action || (desc->status & IRQ_DISABLED))) + if (unlikely(!desc->action || (desc->istate & IRQS_DISABLED))) goto out_unlock; handle_irq_event(desc); @@ -467,12 +479,12 @@ handle_level_irq(unsigned int irq, struct irq_desc *desc) * If its disabled or no action available * keep it masked and get out 
of here */ - if (unlikely(!desc->action || (desc->status & IRQ_DISABLED))) + if (unlikely(!desc->action || (desc->istate & IRQS_DISABLED))) goto out_unlock; handle_irq_event(desc); - if (!(desc->status & IRQ_DISABLED) && !(desc->istate & IRQS_ONESHOT)) + if (!(desc->istate & (IRQS_DISABLED | IRQS_ONESHOT))) unmask_irq(desc); out_unlock: raw_spin_unlock(&desc->lock); @@ -505,7 +517,7 @@ handle_fasteoi_irq(unsigned int irq, struct irq_desc *desc) * If its disabled or no action available * then mask it and get out of here: */ - if (unlikely(!desc->action || (desc->status & IRQ_DISABLED))) { + if (unlikely(!desc->action || (desc->istate & IRQS_DISABLED))) { desc->status |= IRQ_PENDING; mask_irq(desc); goto out; @@ -543,8 +555,8 @@ handle_edge_irq(unsigned int irq, struct irq_desc *desc) * we shouldn't process the IRQ. Mark it pending, handle * the necessary masking and go out */ - if (unlikely((desc->istate & (IRQS_INPROGRESS) || - (desc->status & IRQ_DISABLED) || !desc->action))) { + if (unlikely((desc->istate & (IRQS_DISABLED | IRQS_INPROGRESS) || + !desc->action))) { if (!irq_check_poll(desc)) { desc->status |= IRQ_PENDING; mask_ack_irq(desc); @@ -567,15 +579,16 @@ handle_edge_irq(unsigned int irq, struct irq_desc *desc) * one, we could have masked the irq. * Renable it, if it was not disabled in meantime. */ - if (unlikely((desc->status & - (IRQ_PENDING | IRQ_MASKED | IRQ_DISABLED)) == - (IRQ_PENDING | IRQ_MASKED))) { - unmask_irq(desc); + if (unlikely(desc->status & IRQ_PENDING)) { + if (!(desc->istate & IRQS_DISABLED) && + (desc->status & IRQ_MASKED)) + unmask_irq(desc); } handle_irq_event(desc); - } while ((desc->status & (IRQ_PENDING | IRQ_DISABLED)) == IRQ_PENDING); + } while ((desc->status & IRQ_PENDING) && + !(desc->istate & IRQS_DISABLED)); out_unlock: raw_spin_unlock(&desc->lock); @@ -639,7 +652,8 @@ __set_irq_handler(unsigned int irq, irq_flow_handler_t handle, int is_chained, if (handle == handle_bad_irq) { if (desc->irq_data.chip != &no_irq_chip) mask_ack_irq(desc); - desc->status |= IRQ_DISABLED; + irq_compat_set_disabled(desc); + desc->istate |= IRQS_DISABLED; desc->depth = 1; } desc->handle_irq = handle; diff --git a/kernel/irq/compat.h b/kernel/irq/compat.h index aac6e400e608..bc0c2a501e82 100644 --- a/kernel/irq/compat.h +++ b/kernel/irq/compat.h @@ -11,7 +11,19 @@ static inline void irq_compat_clr_progress(struct irq_desc *desc) { desc->status &= ~IRQ_INPROGRESS; } +static inline void irq_compat_set_disabled(struct irq_desc *desc) +{ + desc->status |= IRQ_DISABLED; +} + +static inline void irq_compat_clr_disabled(struct irq_desc *desc) +{ + desc->status &= ~IRQ_DISABLED; +} #else static inline void irq_compat_set_progress(struct irq_desc *desc) { } static inline void irq_compat_clr_progress(struct irq_desc *desc) { } +static inline void irq_compat_set_disabled(struct irq_desc *desc) { } +static inline void irq_compat_clr_disabled(struct irq_desc *desc) { } #endif + diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h index 54037533af7a..919d2dd0bb33 100644 --- a/kernel/irq/internals.h +++ b/kernel/irq/internals.h @@ -45,6 +45,7 @@ enum { * IRQS_ONESHOT - irq is not unmasked in primary handler * IRQS_REPLAY - irq is replayed * IRQS_WAITING - irq is waiting + * IRQS_DISABLED - irq is disabled */ enum { IRQS_AUTODETECT = 0x00000001, @@ -54,6 +55,7 @@ enum { IRQS_ONESHOT = 0x00000020, IRQS_REPLAY = 0x00000040, IRQS_WAITING = 0x00000080, + IRQS_DISABLED = 0x00000100, }; #define irq_data_to_desc(data) container_of(data, struct irq_desc, irq_data) @@ -137,7 +139,6 @@ 
static inline void print_irq_desc(unsigned int irq, struct irq_desc *desc) print_symbol("%s\n", (unsigned long)desc->action->handler); } - P(IRQ_DISABLED); P(IRQ_PENDING); P(IRQ_LEVEL); P(IRQ_MASKED); @@ -152,6 +153,7 @@ static inline void print_irq_desc(unsigned int irq, struct irq_desc *desc) PS(IRQS_INPROGRESS); PS(IRQS_REPLAY); PS(IRQS_WAITING); + PS(IRQS_DISABLED); } #undef P diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c index 8b87f2ce0203..78866d050bc9 100644 --- a/kernel/irq/irqdesc.c +++ b/kernel/irq/irqdesc.c @@ -80,6 +80,7 @@ static void desc_set_defaults(unsigned int irq, struct irq_desc *desc, int node) desc->irq_data.handler_data = NULL; desc->irq_data.msi_desc = NULL; desc->status = _IRQ_DEFAULT_INIT_FLAGS; + desc->istate = IRQS_DISABLED; desc->handle_irq = handle_bad_irq; desc->depth = 1; desc->irq_count = 0; @@ -247,6 +248,7 @@ int __init early_irq_init(void) struct irq_desc irq_desc[NR_IRQS] __cacheline_aligned_in_smp = { [0 ... NR_IRQS-1] = { .status = _IRQ_DEFAULT_INIT_FLAGS, + .istate = IRQS_DISABLED, .handle_irq = handle_bad_irq, .depth = 1, .lock = __RAW_SPIN_LOCK_UNLOCKED(irq_desc->lock), diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 7971df53d6a9..77ff275b54cf 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -646,7 +646,7 @@ again: goto again; } - if (!(desc->status & IRQ_DISABLED) && (desc->status & IRQ_MASKED)) { + if (!(desc->istate & IRQS_DISABLED) && (desc->status & IRQ_MASKED)) { desc->status &= ~IRQ_MASKED; desc->irq_data.chip->irq_unmask(&desc->irq_data); } @@ -709,7 +709,7 @@ static int irq_thread(void *data) atomic_inc(&desc->threads_active); raw_spin_lock_irq(&desc->lock); - if (unlikely(desc->status & IRQ_DISABLED)) { + if (unlikely(desc->istate & IRQS_DISABLED)) { /* * CHECKME: We might need a dedicated * IRQ_THREAD_PENDING flag here, which diff --git a/kernel/irq/migration.c b/kernel/irq/migration.c index 441fd629ff04..8c68cb8555a7 100644 --- a/kernel/irq/migration.c +++ b/kernel/irq/migration.c @@ -61,7 +61,7 @@ void move_native_irq(int irq) if (likely(!(desc->status & IRQ_MOVE_PENDING))) return; - if (unlikely(desc->status & IRQ_DISABLED)) + if (unlikely(desc->istate & IRQS_DISABLED)) return; /* diff --git a/kernel/irq/settings.h b/kernel/irq/settings.h index 2e7d08fff0ce..5e3411c7c62b 100644 --- a/kernel/irq/settings.h +++ b/kernel/irq/settings.h @@ -12,3 +12,5 @@ enum { #define IRQ_REPLAY GOT_YOU_MORON #undef IRQ_WAITING #define IRQ_WAITING GOT_YOU_MORON +#undef IRQ_DISABLED +#define IRQ_DISABLED GOT_YOU_MORON diff --git a/kernel/irq/spurious.c b/kernel/irq/spurious.c index 51504837d8cc..367614f858ff 100644 --- a/kernel/irq/spurious.c +++ b/kernel/irq/spurious.c @@ -50,7 +50,7 @@ bool irq_wait_for_poll(struct irq_desc *desc) raw_spin_lock(&desc->lock); } while (desc->istate & IRQS_INPROGRESS); /* Might have been disabled in meantime */ - return !(desc->status & IRQ_DISABLED) && desc->action; + return !(desc->istate & IRQS_DISABLED) && desc->action; #else return false; #endif @@ -75,7 +75,7 @@ static int try_one_irq(int irq, struct irq_desc *desc, bool force) * Do not poll disabled interrupts unless the spurious * disabled poller asks explicitely. */ - if ((desc->status & IRQ_DISABLED) && !force) + if ((desc->istate & IRQS_DISABLED) && !force) goto out; /* From 2a0d6fb335d4428285dab2d254911748e6040807 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 8 Feb 2011 12:17:57 +0100 Subject: [PATCH 364/706] genirq: Move IRQ_PENDING flag to core Keep status in sync until all users are fixed. 
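The most delicate user is the replay loop in handle_edge_irq(); after the conversion it is driven entirely by the core bits (condensed from the hunk below):

        do {
                /* unmask if required, then run the handlers */
                handle_irq_event(desc);
        } while ((desc->istate & IRQS_PENDING) &&
                 !(desc->istate & IRQS_DISABLED));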
Signed-off-by: Thomas Gleixner --- include/linux/irq.h | 2 +- kernel/irq/autoprobe.c | 6 ++++-- kernel/irq/chip.c | 10 ++++++---- kernel/irq/compat.h | 12 +++++++++++- kernel/irq/handle.c | 3 ++- kernel/irq/internals.h | 4 +++- kernel/irq/manage.c | 5 +++-- kernel/irq/pm.c | 3 ++- kernel/irq/resend.c | 5 +++-- kernel/irq/settings.h | 2 ++ kernel/irq/spurious.c | 5 +++-- 11 files changed, 40 insertions(+), 17 deletions(-) diff --git a/include/linux/irq.h b/include/linux/irq.h index 32efca71ce88..7ca55c9deba4 100644 --- a/include/linux/irq.h +++ b/include/linux/irq.h @@ -56,9 +56,9 @@ typedef void (*irq_flow_handler_t)(unsigned int irq, #define IRQ_REPLAY 0x00000200 /* DEPRECATED */ #define IRQ_WAITING 0x00000400 /* DEPRECATED */ #define IRQ_DISABLED 0x00000800 /* DEPRECATED */ +#define IRQ_PENDING 0x00001000 /* DEPRECATED */ #endif -#define IRQ_PENDING 0x00001000 /* IRQ pending - replay on enable */ #define IRQ_LEVEL 0x00004000 /* IRQ level triggered */ #define IRQ_MASKED 0x00008000 /* IRQ masked - shouldn't be seen again */ diff --git a/kernel/irq/autoprobe.c b/kernel/irq/autoprobe.c index 9ea8bb99f7c1..aab64c262726 100644 --- a/kernel/irq/autoprobe.c +++ b/kernel/irq/autoprobe.c @@ -76,8 +76,10 @@ unsigned long probe_irq_on(void) raw_spin_lock_irq(&desc->lock); if (!desc->action && !(desc->status & IRQ_NOPROBE)) { desc->istate |= IRQS_AUTODETECT | IRQS_WAITING; - if (irq_startup(desc)) - desc->status |= IRQ_PENDING; + if (irq_startup(desc)) { + irq_compat_set_pending(desc); + desc->istate |= IRQS_PENDING; + } } raw_spin_unlock_irq(&desc->lock); } diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index 527df7ab1b05..17c87865bfb1 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -518,7 +518,8 @@ handle_fasteoi_irq(unsigned int irq, struct irq_desc *desc) * then mask it and get out of here: */ if (unlikely(!desc->action || (desc->istate & IRQS_DISABLED))) { - desc->status |= IRQ_PENDING; + irq_compat_set_pending(desc); + desc->istate |= IRQS_PENDING; mask_irq(desc); goto out; } @@ -558,7 +559,8 @@ handle_edge_irq(unsigned int irq, struct irq_desc *desc) if (unlikely((desc->istate & (IRQS_DISABLED | IRQS_INPROGRESS) || !desc->action))) { if (!irq_check_poll(desc)) { - desc->status |= IRQ_PENDING; + irq_compat_set_pending(desc); + desc->istate |= IRQS_PENDING; mask_ack_irq(desc); goto out_unlock; } @@ -579,7 +581,7 @@ handle_edge_irq(unsigned int irq, struct irq_desc *desc) * one, we could have masked the irq. * Renable it, if it was not disabled in meantime. 
*/ - if (unlikely(desc->status & IRQ_PENDING)) { + if (unlikely(desc->istate & IRQS_PENDING)) { if (!(desc->istate & IRQS_DISABLED) && (desc->status & IRQ_MASKED)) unmask_irq(desc); @@ -587,7 +589,7 @@ handle_edge_irq(unsigned int irq, struct irq_desc *desc) handle_irq_event(desc); - } while ((desc->status & IRQ_PENDING) && + } while ((desc->istate & IRQS_PENDING) && !(desc->istate & IRQS_DISABLED)); out_unlock: diff --git a/kernel/irq/compat.h b/kernel/irq/compat.h index bc0c2a501e82..0067a69781f4 100644 --- a/kernel/irq/compat.h +++ b/kernel/irq/compat.h @@ -15,15 +15,25 @@ static inline void irq_compat_set_disabled(struct irq_desc *desc) { desc->status |= IRQ_DISABLED; } - static inline void irq_compat_clr_disabled(struct irq_desc *desc) { desc->status &= ~IRQ_DISABLED; } +static inline void irq_compat_set_pending(struct irq_desc *desc) +{ + desc->status |= IRQ_PENDING; +} + +static inline void irq_compat_clr_pending(struct irq_desc *desc) +{ + desc->status &= ~IRQ_PENDING; +} #else static inline void irq_compat_set_progress(struct irq_desc *desc) { } static inline void irq_compat_clr_progress(struct irq_desc *desc) { } static inline void irq_compat_set_disabled(struct irq_desc *desc) { } static inline void irq_compat_clr_disabled(struct irq_desc *desc) { } +static inline void irq_compat_set_pending(struct irq_desc *desc) { } +static inline void irq_compat_clr_pending(struct irq_desc *desc) { } #endif diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c index d4ae0b1ccc00..6e34bdbeb26a 100644 --- a/kernel/irq/handle.c +++ b/kernel/irq/handle.c @@ -122,7 +122,8 @@ irqreturn_t handle_irq_event(struct irq_desc *desc) struct irqaction *action = desc->action; irqreturn_t ret; - desc->status &= ~IRQ_PENDING; + irq_compat_clr_pending(desc); + desc->istate &= ~IRQS_PENDING; irq_compat_set_progress(desc); desc->istate |= IRQS_INPROGRESS; raw_spin_unlock(&desc->lock); diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h index 919d2dd0bb33..fdf2524437eb 100644 --- a/kernel/irq/internals.h +++ b/kernel/irq/internals.h @@ -46,6 +46,7 @@ enum { * IRQS_REPLAY - irq is replayed * IRQS_WAITING - irq is waiting * IRQS_DISABLED - irq is disabled + * IRQS_PENDING - irq is pending and replayed later */ enum { IRQS_AUTODETECT = 0x00000001, @@ -56,6 +57,7 @@ enum { IRQS_REPLAY = 0x00000040, IRQS_WAITING = 0x00000080, IRQS_DISABLED = 0x00000100, + IRQS_PENDING = 0x00000200, }; #define irq_data_to_desc(data) container_of(data, struct irq_desc, irq_data) @@ -139,7 +141,6 @@ static inline void print_irq_desc(unsigned int irq, struct irq_desc *desc) print_symbol("%s\n", (unsigned long)desc->action->handler); } - P(IRQ_PENDING); P(IRQ_LEVEL); P(IRQ_MASKED); #ifdef CONFIG_IRQ_PER_CPU @@ -154,6 +155,7 @@ static inline void print_irq_desc(unsigned int irq, struct irq_desc *desc) PS(IRQS_REPLAY); PS(IRQS_WAITING); PS(IRQS_DISABLED); + PS(IRQS_PENDING); } #undef P diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 77ff275b54cf..ac060814a787 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -714,10 +714,11 @@ static int irq_thread(void *data) * CHECKME: We might need a dedicated * IRQ_THREAD_PENDING flag here, which * retriggers the thread in check_irq_resend() - * but AFAICT IRQ_PENDING should be fine as it + * but AFAICT IRQS_PENDING should be fine as it * retriggers the interrupt itself --- tglx */ - desc->status |= IRQ_PENDING; + irq_compat_set_pending(desc); + desc->istate |= IRQS_PENDING; raw_spin_unlock_irq(&desc->lock); } else { raw_spin_unlock_irq(&desc->lock); diff --git 
a/kernel/irq/pm.c b/kernel/irq/pm.c index d6bfb89cce91..d7389418e91a 100644 --- a/kernel/irq/pm.c +++ b/kernel/irq/pm.c @@ -69,7 +69,8 @@ int check_wakeup_irqs(void) int irq; for_each_irq_desc(irq, desc) - if ((desc->status & IRQ_WAKEUP) && (desc->status & IRQ_PENDING)) + if ((desc->status & IRQ_WAKEUP) && + (desc->istate & IRQS_PENDING)) return -EBUSY; return 0; diff --git a/kernel/irq/resend.c b/kernel/irq/resend.c index f83387cd11f3..ff1fea060014 100644 --- a/kernel/irq/resend.c +++ b/kernel/irq/resend.c @@ -64,8 +64,9 @@ void check_irq_resend(struct irq_desc *desc, unsigned int irq) return; if (desc->istate & IRQS_REPLAY) return; - if (desc->status & IRQ_PENDING) { - desc->status &= ~IRQ_PENDING; + if (desc->istate & IRQS_PENDING) { + irq_compat_clr_pending(desc); + desc->istate &= ~IRQS_PENDING; desc->istate |= IRQS_REPLAY; if (!desc->irq_data.chip->irq_retrigger || diff --git a/kernel/irq/settings.h b/kernel/irq/settings.h index 5e3411c7c62b..623fcf83e7de 100644 --- a/kernel/irq/settings.h +++ b/kernel/irq/settings.h @@ -14,3 +14,5 @@ enum { #define IRQ_WAITING GOT_YOU_MORON #undef IRQ_DISABLED #define IRQ_DISABLED GOT_YOU_MORON +#undef IRQ_PENDING +#define IRQ_PENDING GOT_YOU_MORON diff --git a/kernel/irq/spurious.c b/kernel/irq/spurious.c index 367614f858ff..692ce2bae302 100644 --- a/kernel/irq/spurious.c +++ b/kernel/irq/spurious.c @@ -93,7 +93,8 @@ static int try_one_irq(int irq, struct irq_desc *desc, bool force) * Already running: If it is shared get the other * CPU to go looking for our mystery interrupt too */ - desc->status |= IRQ_PENDING; + irq_compat_set_pending(desc); + desc->istate |= IRQS_PENDING; goto out; } @@ -103,7 +104,7 @@ static int try_one_irq(int irq, struct irq_desc *desc, bool force) if (handle_irq_event(desc) == IRQ_HANDLED) ret = IRQ_HANDLED; action = desc->action; - } while ((desc->status & IRQ_PENDING) && action); + } while ((desc->istate & IRQS_PENDING) && action); desc->istate &= ~IRQS_POLL_INPROGRESS; out: raw_spin_unlock(&desc->lock); From 6e40262ea43c4b0e3f435b3a083e4461ef921c17 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 8 Feb 2011 12:36:06 +0100 Subject: [PATCH 365/706] genirq: Move IRQ_MASKED to core Keep status in sync until all users are fixed. 
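All mask state changes funnel through one helper pair so the compat bit cannot go stale; mask_irq() from the diff below shows the pattern:

        static inline void mask_irq(struct irq_desc *desc)
        {
                if (desc->irq_data.chip->irq_mask) {
                        desc->irq_data.chip->irq_mask(&desc->irq_data);
                        irq_state_set_masked(desc); /* IRQS_MASKED + compat bit */
                }
        }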
Signed-off-by: Thomas Gleixner --- include/linux/irq.h | 2 +- kernel/irq/chip.c | 28 ++++++++++++++++++++-------- kernel/irq/compat.h | 11 +++++++++++ kernel/irq/internals.h | 4 +++- kernel/irq/manage.c | 5 +++-- kernel/irq/migration.c | 2 +- kernel/irq/settings.h | 2 ++ 7 files changed, 41 insertions(+), 13 deletions(-) diff --git a/include/linux/irq.h b/include/linux/irq.h index 7ca55c9deba4..9800bac4c398 100644 --- a/include/linux/irq.h +++ b/include/linux/irq.h @@ -57,11 +57,11 @@ typedef void (*irq_flow_handler_t)(unsigned int irq, #define IRQ_WAITING 0x00000400 /* DEPRECATED */ #define IRQ_DISABLED 0x00000800 /* DEPRECATED */ #define IRQ_PENDING 0x00001000 /* DEPRECATED */ +#define IRQ_MASKED 0x00002000 /* DEPRECATED */ #endif #define IRQ_LEVEL 0x00004000 /* IRQ level triggered */ -#define IRQ_MASKED 0x00008000 /* IRQ masked - shouldn't be seen again */ #define IRQ_PER_CPU 0x00010000 /* IRQ is per CPU */ #define IRQ_NOPROBE 0x00020000 /* IRQ is not valid for probing */ #define IRQ_NOREQUEST 0x00040000 /* IRQ cannot be requested */ diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index 17c87865bfb1..73b2e7e00934 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -176,6 +176,18 @@ static void irq_state_set_disabled(struct irq_desc *desc) irq_compat_set_disabled(desc); } +static void irq_state_clr_masked(struct irq_desc *desc) +{ + desc->istate &= ~IRQS_MASKED; + irq_compat_clr_masked(desc); +} + +static void irq_state_set_masked(struct irq_desc *desc) +{ + desc->istate |= IRQS_MASKED; + irq_compat_set_masked(desc); +} + int irq_startup(struct irq_desc *desc) { irq_state_clr_disabled(desc); @@ -183,7 +195,7 @@ int irq_startup(struct irq_desc *desc) if (desc->irq_data.chip->irq_startup) { int ret = desc->irq_data.chip->irq_startup(&desc->irq_data); - desc->status &= ~IRQ_MASKED; + irq_state_clr_masked(desc); return ret; } @@ -201,7 +213,7 @@ void irq_shutdown(struct irq_desc *desc) desc->irq_data.chip->irq_disable(&desc->irq_data); else desc->irq_data.chip->irq_mask(&desc->irq_data); - desc->status |= IRQ_MASKED; + irq_state_set_masked(desc); } void irq_enable(struct irq_desc *desc) @@ -211,7 +223,7 @@ void irq_enable(struct irq_desc *desc) desc->irq_data.chip->irq_enable(&desc->irq_data); else desc->irq_data.chip->irq_unmask(&desc->irq_data); - desc->status &= ~IRQ_MASKED; + irq_state_clr_masked(desc); } void irq_disable(struct irq_desc *desc) @@ -219,8 +231,8 @@ void irq_disable(struct irq_desc *desc) irq_state_set_disabled(desc); if (desc->irq_data.chip->irq_disable) { desc->irq_data.chip->irq_disable(&desc->irq_data); - desc->status |= IRQ_MASKED; } + irq_state_set_masked(desc); } #ifndef CONFIG_GENERIC_HARDIRQS_NO_DEPRECATED @@ -352,14 +364,14 @@ static inline void mask_ack_irq(struct irq_desc *desc) if (desc->irq_data.chip->irq_ack) desc->irq_data.chip->irq_ack(&desc->irq_data); } - desc->status |= IRQ_MASKED; + irq_state_set_masked(desc); } static inline void mask_irq(struct irq_desc *desc) { if (desc->irq_data.chip->irq_mask) { desc->irq_data.chip->irq_mask(&desc->irq_data); - desc->status |= IRQ_MASKED; + irq_state_set_masked(desc); } } @@ -367,7 +379,7 @@ static inline void unmask_irq(struct irq_desc *desc) { if (desc->irq_data.chip->irq_unmask) { desc->irq_data.chip->irq_unmask(&desc->irq_data); - desc->status &= ~IRQ_MASKED; + irq_state_clr_masked(desc); } } @@ -583,7 +595,7 @@ handle_edge_irq(unsigned int irq, struct irq_desc *desc) */ if (unlikely(desc->istate & IRQS_PENDING)) { if (!(desc->istate & IRQS_DISABLED) && - (desc->status & IRQ_MASKED)) + (desc->istate & 
IRQS_MASKED)) unmask_irq(desc); } diff --git a/kernel/irq/compat.h b/kernel/irq/compat.h index 0067a69781f4..593abecbcc44 100644 --- a/kernel/irq/compat.h +++ b/kernel/irq/compat.h @@ -28,6 +28,15 @@ static inline void irq_compat_clr_pending(struct irq_desc *desc) { desc->status &= ~IRQ_PENDING; } +static inline void irq_compat_set_masked(struct irq_desc *desc) +{ + desc->status |= IRQ_MASKED; +} + +static inline void irq_compat_clr_masked(struct irq_desc *desc) +{ + desc->status &= ~IRQ_MASKED; +} #else static inline void irq_compat_set_progress(struct irq_desc *desc) { } static inline void irq_compat_clr_progress(struct irq_desc *desc) { } @@ -35,5 +44,7 @@ static inline void irq_compat_set_disabled(struct irq_desc *desc) { } static inline void irq_compat_clr_disabled(struct irq_desc *desc) { } static inline void irq_compat_set_pending(struct irq_desc *desc) { } static inline void irq_compat_clr_pending(struct irq_desc *desc) { } +static inline void irq_compat_set_masked(struct irq_desc *desc) { } +static inline void irq_compat_clr_masked(struct irq_desc *desc) { } #endif diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h index fdf2524437eb..3f2fcc194dcc 100644 --- a/kernel/irq/internals.h +++ b/kernel/irq/internals.h @@ -47,6 +47,7 @@ enum { * IRQS_WAITING - irq is waiting * IRQS_DISABLED - irq is disabled * IRQS_PENDING - irq is pending and replayed later + * IRQS_MASKED - irq is masked */ enum { IRQS_AUTODETECT = 0x00000001, @@ -58,6 +59,7 @@ enum { IRQS_WAITING = 0x00000080, IRQS_DISABLED = 0x00000100, IRQS_PENDING = 0x00000200, + IRQS_MASKED = 0x00000400, }; #define irq_data_to_desc(data) container_of(data, struct irq_desc, irq_data) @@ -142,7 +144,6 @@ static inline void print_irq_desc(unsigned int irq, struct irq_desc *desc) } P(IRQ_LEVEL); - P(IRQ_MASKED); #ifdef CONFIG_IRQ_PER_CPU P(IRQ_PER_CPU); #endif @@ -156,6 +157,7 @@ static inline void print_irq_desc(unsigned int irq, struct irq_desc *desc) PS(IRQS_WAITING); PS(IRQS_DISABLED); PS(IRQS_PENDING); + PS(IRQS_MASKED); } #undef P diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index ac060814a787..83fd20194e5b 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -646,8 +646,9 @@ again: goto again; } - if (!(desc->istate & IRQS_DISABLED) && (desc->status & IRQ_MASKED)) { - desc->status &= ~IRQ_MASKED; + if (!(desc->istate & IRQS_DISABLED) && (desc->istate & IRQS_MASKED)) { + irq_compat_clr_masked(desc); + desc->istate &= ~IRQS_MASKED; desc->irq_data.chip->irq_unmask(&desc->irq_data); } raw_spin_unlock_irq(&desc->lock); diff --git a/kernel/irq/migration.c b/kernel/irq/migration.c index 8c68cb8555a7..6f2f98480354 100644 --- a/kernel/irq/migration.c +++ b/kernel/irq/migration.c @@ -69,7 +69,7 @@ void move_native_irq(int irq) * threaded interrupt with ONESHOT set, we can end up with an * interrupt storm. */ - masked = desc->status & IRQ_MASKED; + masked = desc->istate & IRQS_MASKED; if (!masked) desc->irq_data.chip->irq_mask(&desc->irq_data); move_masked_irq(irq); diff --git a/kernel/irq/settings.h b/kernel/irq/settings.h index 623fcf83e7de..2cd45fd5ec8a 100644 --- a/kernel/irq/settings.h +++ b/kernel/irq/settings.h @@ -16,3 +16,5 @@ enum { #define IRQ_DISABLED GOT_YOU_MORON #undef IRQ_PENDING #define IRQ_PENDING GOT_YOU_MORON +#undef IRQ_MASKED +#define IRQ_MASKED GOT_YOU_MORON From c531e8361f1968d664e6e97fbd3bfa4cf0e62e42 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 8 Feb 2011 12:44:58 +0100 Subject: [PATCH 366/706] genirq: Move IRQ_SUSPENDED to core No users outside of core. 
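For reference, the suspend side check after the conversion, condensed from the __disable_irq() hunk below; actions flagged IRQF_NO_SUSPEND (timers among them) are left alone:

        if (suspend) {
                if (!desc->action || (desc->action->flags & IRQF_NO_SUSPEND))
                        return;
                desc->istate |= IRQS_SUSPENDED;
        }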
Signed-off-by: Thomas Gleixner --- include/linux/irq.h | 2 -- kernel/irq/internals.h | 2 ++ kernel/irq/manage.c | 8 ++++---- kernel/irq/pm.c | 6 +++--- 4 files changed, 9 insertions(+), 9 deletions(-) diff --git a/include/linux/irq.h b/include/linux/irq.h index 9800bac4c398..3ce45c257edb 100644 --- a/include/linux/irq.h +++ b/include/linux/irq.h @@ -60,7 +60,6 @@ typedef void (*irq_flow_handler_t)(unsigned int irq, #define IRQ_MASKED 0x00002000 /* DEPRECATED */ #endif - #define IRQ_LEVEL 0x00004000 /* IRQ level triggered */ #define IRQ_PER_CPU 0x00010000 /* IRQ is per CPU */ #define IRQ_NOPROBE 0x00020000 /* IRQ is not valid for probing */ @@ -71,7 +70,6 @@ typedef void (*irq_flow_handler_t)(unsigned int irq, #define IRQ_NO_BALANCING 0x00400000 /* IRQ is excluded from balancing */ #define IRQ_MOVE_PCNTXT 0x01000000 /* IRQ migration from process context */ #define IRQ_AFFINITY_SET 0x02000000 /* IRQ affinity was set from userspace*/ -#define IRQ_SUSPENDED 0x04000000 /* IRQ has gone through suspend sequence */ #define IRQ_NESTED_THREAD 0x10000000 /* IRQ is nested into another, no own handler thread */ #define IRQF_MODIFY_MASK \ diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h index 3f2fcc194dcc..46889119e6a6 100644 --- a/kernel/irq/internals.h +++ b/kernel/irq/internals.h @@ -48,6 +48,7 @@ enum { * IRQS_DISABLED - irq is disabled * IRQS_PENDING - irq is pending and replayed later * IRQS_MASKED - irq is masked + * IRQS_SUSPENDED - irq is suspended */ enum { IRQS_AUTODETECT = 0x00000001, @@ -60,6 +61,7 @@ enum { IRQS_DISABLED = 0x00000100, IRQS_PENDING = 0x00000200, IRQS_MASKED = 0x00000400, + IRQS_SUSPENDED = 0x00000800, }; #define irq_data_to_desc(data) container_of(data, struct irq_desc, irq_data) diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 83fd20194e5b..b912de4ff4de 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -326,7 +326,7 @@ void __disable_irq(struct irq_desc *desc, unsigned int irq, bool suspend) if (suspend) { if (!desc->action || (desc->action->flags & IRQF_NO_SUSPEND)) return; - desc->status |= IRQ_SUSPENDED; + desc->istate |= IRQS_SUSPENDED; } if (!desc->depth++) @@ -388,7 +388,7 @@ EXPORT_SYMBOL(disable_irq); void __enable_irq(struct irq_desc *desc, unsigned int irq, bool resume) { if (resume) { - if (!(desc->status & IRQ_SUSPENDED)) { + if (!(desc->istate & IRQS_SUSPENDED)) { if (!desc->action) return; if (!(desc->action->flags & IRQF_FORCE_RESUME)) @@ -396,7 +396,7 @@ void __enable_irq(struct irq_desc *desc, unsigned int irq, bool resume) /* Pretend that it got disabled ! */ desc->depth++; } - desc->status &= ~IRQ_SUSPENDED; + desc->istate &= ~IRQS_SUSPENDED; } switch (desc->depth) { @@ -405,7 +405,7 @@ void __enable_irq(struct irq_desc *desc, unsigned int irq, bool resume) WARN(1, KERN_WARNING "Unbalanced enable for IRQ %d\n", irq); break; case 1: { - if (desc->status & IRQ_SUSPENDED) + if (desc->istate & IRQS_SUSPENDED) goto err_out; /* Prevent probing on this irq: */ desc->status |= IRQ_NOPROBE; diff --git a/kernel/irq/pm.c b/kernel/irq/pm.c index d7389418e91a..d81337fc1cff 100644 --- a/kernel/irq/pm.c +++ b/kernel/irq/pm.c @@ -18,7 +18,7 @@ * During system-wide suspend or hibernation device drivers need to be prevented * from receiving interrupts and this function is provided for this purpose. * It marks all interrupt lines in use, except for the timer ones, as disabled - * and sets the IRQ_SUSPENDED flag for each of them. + * and sets the IRQS_SUSPENDED flag for each of them. 
*/ void suspend_device_irqs(void) { @@ -34,7 +34,7 @@ void suspend_device_irqs(void) } for_each_irq_desc(irq, desc) - if (desc->status & IRQ_SUSPENDED) + if (desc->istate & IRQS_SUSPENDED) synchronize_irq(irq); } EXPORT_SYMBOL_GPL(suspend_device_irqs); @@ -43,7 +43,7 @@ EXPORT_SYMBOL_GPL(suspend_device_irqs); * resume_device_irqs - enable interrupt lines disabled by suspend_device_irqs() * * Enable all interrupt lines previously disabled by suspend_device_irqs() that - * have the IRQ_SUSPENDED flag set. + * have the IRQS_SUSPENDED flag set. */ void resume_device_irqs(void) { From 6d2cd17fde1fc3e93302815f049f255bb2b3123e Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 8 Feb 2011 14:34:18 +0100 Subject: [PATCH 367/706] genirq: Move IRQ_WAKEUP to core No users outside of core. Signed-off-by: Thomas Gleixner --- kernel/irq/internals.h | 2 ++ kernel/irq/manage.c | 4 ++-- kernel/irq/pm.c | 2 +- kernel/irq/settings.h | 2 ++ 4 files changed, 7 insertions(+), 3 deletions(-) diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h index 46889119e6a6..cef0849dcfa5 100644 --- a/kernel/irq/internals.h +++ b/kernel/irq/internals.h @@ -49,6 +49,7 @@ enum { * IRQS_PENDING - irq is pending and replayed later * IRQS_MASKED - irq is masked * IRQS_SUSPENDED - irq is suspended + * IRQS_WAKEUP - irq triggers system wakeup from suspend */ enum { IRQS_AUTODETECT = 0x00000001, @@ -62,6 +63,7 @@ enum { IRQS_PENDING = 0x00000200, IRQS_MASKED = 0x00000400, IRQS_SUSPENDED = 0x00000800, + IRQS_WAKEUP = 0x00001000, }; #define irq_data_to_desc(data) container_of(data, struct irq_desc, irq_data) diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index b912de4ff4de..ccc9389909ff 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -489,7 +489,7 @@ int irq_set_irq_wake(unsigned int irq, unsigned int on) if (ret) desc->wake_depth = 0; else - desc->status |= IRQ_WAKEUP; + desc->istate |= IRQS_WAKEUP; } } else { if (desc->wake_depth == 0) { @@ -499,7 +499,7 @@ int irq_set_irq_wake(unsigned int irq, unsigned int on) if (ret) desc->wake_depth = 1; else - desc->status &= ~IRQ_WAKEUP; + desc->istate &= ~IRQS_WAKEUP; } } diff --git a/kernel/irq/pm.c b/kernel/irq/pm.c index d81337fc1cff..f39383d8672d 100644 --- a/kernel/irq/pm.c +++ b/kernel/irq/pm.c @@ -69,7 +69,7 @@ int check_wakeup_irqs(void) int irq; for_each_irq_desc(irq, desc) - if ((desc->status & IRQ_WAKEUP) && + if ((desc->istate & IRQS_WAKEUP) && (desc->istate & IRQS_PENDING)) return -EBUSY; diff --git a/kernel/irq/settings.h b/kernel/irq/settings.h index 2cd45fd5ec8a..ef09824e4b32 100644 --- a/kernel/irq/settings.h +++ b/kernel/irq/settings.h @@ -18,3 +18,5 @@ enum { #define IRQ_PENDING GOT_YOU_MORON #undef IRQ_MASKED #define IRQ_MASKED GOT_YOU_MORON +#undef IRQ_WAKEUP +#define IRQ_WAKEUP GOT_YOU_MORON From 91c499178139d6597e68db19638e4135510a34b8 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 3 Feb 2011 20:48:29 +0100 Subject: [PATCH 368/706] genirq: Add state field to irq_data Some chip implementations need to access certain status flags. With sparse irqs that requires a lookup of the irq descriptor. Add a state field which contains such flags. Name it in a way which will make coders happy to access it with the proper accessor functions. And it's easy to grep for. 
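The follow-up patches put bits and accessors on top of this field; the intended shape, taken from the next patch in the series:

        /* never poke d->state_use_accessors directly */
        static inline bool irqd_is_setaffinity_pending(struct irq_data *d)
        {
                return d->state_use_accessors & IRQD_SETAFFINITY_PENDING;
        }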
Signed-off-by: Thomas Gleixner --- include/linux/irq.h | 3 +++ include/linux/irqdesc.h | 1 + 2 files changed, 4 insertions(+) diff --git a/include/linux/irq.h b/include/linux/irq.h index 3ce45c257edb..62bb08e4af13 100644 --- a/include/linux/irq.h +++ b/include/linux/irq.h @@ -102,6 +102,8 @@ struct msi_desc; * struct irq_data - per irq and irq chip data passed down to chip functions * @irq: interrupt number * @node: node index useful for balancing + * @state_use_accessor: status information for irq chip functions. + * Use accessor functions to deal with it * @chip: low level interrupt hardware access * @handler_data: per-IRQ data for the irq_chip methods * @chip_data: platform-specific per-chip private data for the chip @@ -116,6 +118,7 @@ struct msi_desc; struct irq_data { unsigned int irq; unsigned int node; + unsigned int state_use_accessors; struct irq_chip *chip; void *handler_data; void *chip_data; diff --git a/include/linux/irqdesc.h b/include/linux/irqdesc.h index 782bf9851a9f..581d9665fd38 100644 --- a/include/linux/irqdesc.h +++ b/include/linux/irqdesc.h @@ -48,6 +48,7 @@ struct irq_desc { struct { unsigned int irq; unsigned int node; + unsigned int pad_do_not_even_think_about_it; struct irq_chip *chip; void *handler_data; void *chip_data; From f230b6d5c48f8d12f4dfa1f8b5ab0b0320076d21 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sat, 5 Feb 2011 15:20:04 +0100 Subject: [PATCH 369/706] genirq: Add IRQ_MOVE_PENDING to irq_data.state chip implementations need to know about it. Keep status in sync until all users are fixed. Accessor function: irqd_is_setaffinity_pending(irqdata) Coders who access them directly will be tracked down and slapped with stinking trouts. Signed-off-by: Thomas Gleixner --- include/linux/irq.h | 20 ++++++++++++++++++-- kernel/irq/compat.h | 11 +++++++++++ kernel/irq/internals.h | 15 +++++++++++++++ kernel/irq/manage.c | 4 ++-- kernel/irq/migration.c | 6 +++--- kernel/irq/proc.c | 2 +- kernel/irq/settings.h | 2 ++ 7 files changed, 52 insertions(+), 8 deletions(-) diff --git a/include/linux/irq.h b/include/linux/irq.h index 62bb08e4af13..2899905bfac7 100644 --- a/include/linux/irq.h +++ b/include/linux/irq.h @@ -58,15 +58,16 @@ typedef void (*irq_flow_handler_t)(unsigned int irq, #define IRQ_DISABLED 0x00000800 /* DEPRECATED */ #define IRQ_PENDING 0x00001000 /* DEPRECATED */ #define IRQ_MASKED 0x00002000 /* DEPRECATED */ +/* DEPRECATED use irq_setaffinity_pending() instead*/ +#define IRQ_MOVE_PENDING 0x00004000 #endif -#define IRQ_LEVEL 0x00004000 /* IRQ level triggered */ +#define IRQ_LEVEL 0x00008000 /* IRQ level triggered */ #define IRQ_PER_CPU 0x00010000 /* IRQ is per CPU */ #define IRQ_NOPROBE 0x00020000 /* IRQ is not valid for probing */ #define IRQ_NOREQUEST 0x00040000 /* IRQ cannot be requested */ #define IRQ_NOAUTOEN 0x00080000 /* IRQ will not be enabled on request irq */ #define IRQ_WAKEUP 0x00100000 /* IRQ triggers system wakeup */ -#define IRQ_MOVE_PENDING 0x00200000 /* need to re-target IRQ destination */ #define IRQ_NO_BALANCING 0x00400000 /* IRQ is excluded from balancing */ #define IRQ_MOVE_PCNTXT 0x01000000 /* IRQ migration from process context */ #define IRQ_AFFINITY_SET 0x02000000 /* IRQ affinity was set from userspace*/ @@ -128,6 +129,21 @@ struct irq_data { #endif }; +/* + * Bit masks for irq_data.state + * + * IRQD_SETAFFINITY_PENDING - Affinity setting is pending + */ +enum { + /* Bit 0 - 7 reserved for TYPE will use later */ + IRQD_SETAFFINITY_PENDING = (1 << 8), +}; + +static inline bool irqd_is_setaffinity_pending(struct 
irq_data *d) +{ + return d->state_use_accessors & IRQD_SETAFFINITY_PENDING; +} + /** * struct irq_chip - hardware interrupt chip descriptor * diff --git a/kernel/irq/compat.h b/kernel/irq/compat.h index 593abecbcc44..5e33aadadacc 100644 --- a/kernel/irq/compat.h +++ b/kernel/irq/compat.h @@ -37,6 +37,15 @@ static inline void irq_compat_clr_masked(struct irq_desc *desc) { desc->status &= ~IRQ_MASKED; } +static inline void irq_compat_set_move_pending(struct irq_desc *desc) +{ + desc->status |= IRQ_MOVE_PENDING; +} + +static inline void irq_compat_clr_move_pending(struct irq_desc *desc) +{ + desc->status &= ~IRQ_MOVE_PENDING; +} #else static inline void irq_compat_set_progress(struct irq_desc *desc) { } static inline void irq_compat_clr_progress(struct irq_desc *desc) { } @@ -46,5 +55,7 @@ static inline void irq_compat_set_pending(struct irq_desc *desc) { } static inline void irq_compat_clr_pending(struct irq_desc *desc) { } static inline void irq_compat_set_masked(struct irq_desc *desc) { } static inline void irq_compat_clr_masked(struct irq_desc *desc) { } +static inline void irq_compat_set_move_pending(struct irq_desc *desc) { } +static inline void irq_compat_clr_move_pending(struct irq_desc *desc) { } #endif diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h index cef0849dcfa5..e93e6090cd47 100644 --- a/kernel/irq/internals.h +++ b/kernel/irq/internals.h @@ -124,6 +124,21 @@ static inline void chip_bus_sync_unlock(struct irq_desc *desc) desc->irq_data.chip->irq_bus_sync_unlock(&desc->irq_data); } +/* + * Manipulation functions for irq_data.state + */ +static inline void irqd_set_move_pending(struct irq_data *d) +{ + d->state_use_accessors |= IRQD_SETAFFINITY_PENDING; + irq_compat_set_move_pending(irq_data_to_desc(d)); +} + +static inline void irqd_clr_move_pending(struct irq_data *d) +{ + d->state_use_accessors &= ~IRQD_SETAFFINITY_PENDING; + irq_compat_clr_move_pending(irq_data_to_desc(d)); +} + /* * Debugging printout: */ diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index ccc9389909ff..9a99c471d470 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -107,7 +107,7 @@ static inline bool irq_can_move_pcntxt(struct irq_desc *desc) } static inline bool irq_move_pending(struct irq_desc *desc) { - return desc->status & IRQ_MOVE_PENDING; + return irqd_is_setaffinity_pending(&desc->irq_data); } static inline void irq_copy_pending(struct irq_desc *desc, const struct cpumask *mask) @@ -156,7 +156,7 @@ int irq_set_affinity(unsigned int irq, const struct cpumask *mask) ret = 0; } } else { - desc->status |= IRQ_MOVE_PENDING; + irqd_set_move_pending(&desc->irq_data); irq_copy_pending(desc, mask); } diff --git a/kernel/irq/migration.c b/kernel/irq/migration.c index 6f2f98480354..9485ae081dcd 100644 --- a/kernel/irq/migration.c +++ b/kernel/irq/migration.c @@ -9,7 +9,7 @@ void move_masked_irq(int irq) struct irq_desc *desc = irq_to_desc(irq); struct irq_chip *chip = desc->irq_data.chip; - if (likely(!(desc->status & IRQ_MOVE_PENDING))) + if (likely(!irqd_is_setaffinity_pending(&desc->irq_data))) return; /* @@ -20,7 +20,7 @@ void move_masked_irq(int irq) return; } - desc->status &= ~IRQ_MOVE_PENDING; + irqd_clr_move_pending(&desc->irq_data); if (unlikely(cpumask_empty(desc->pending_mask))) return; @@ -58,7 +58,7 @@ void move_native_irq(int irq) struct irq_desc *desc = irq_to_desc(irq); bool masked; - if (likely(!(desc->status & IRQ_MOVE_PENDING))) + if (likely(!irqd_is_setaffinity_pending(&desc->irq_data))) return; if (unlikely(desc->istate & IRQS_DISABLED)) diff --git 
a/kernel/irq/proc.c b/kernel/irq/proc.c index 26449239bb46..afe4e6803148 100644 --- a/kernel/irq/proc.c +++ b/kernel/irq/proc.c @@ -25,7 +25,7 @@ static int irq_affinity_proc_show(struct seq_file *m, void *v) const struct cpumask *mask = desc->irq_data.affinity; #ifdef CONFIG_GENERIC_PENDING_IRQ - if (desc->status & IRQ_MOVE_PENDING) + if (irqd_is_setaffinity_pending(&desc->irq_data)) mask = desc->pending_mask; #endif seq_cpumask(m, mask); diff --git a/kernel/irq/settings.h b/kernel/irq/settings.h index ef09824e4b32..bb104a2dce73 100644 --- a/kernel/irq/settings.h +++ b/kernel/irq/settings.h @@ -20,3 +20,5 @@ enum { #define IRQ_MASKED GOT_YOU_MORON #undef IRQ_WAKEUP #define IRQ_WAKEUP GOT_YOU_MORON +#undef IRQ_MOVE_PENDING +#define IRQ_MOVE_PENDING GOT_YOU_MORON From 6a58fb3bad099076f36f0f30f44507bc3275cdb6 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 8 Feb 2011 15:40:05 +0100 Subject: [PATCH 370/706] genirq: Remove CONFIG_IRQ_PER_CPU The saving of this switch is minimal versus the ifdef mess it creates. Simply enable PER_CPU unconditionally and remove the config switch. Signed-off-by: Thomas Gleixner --- include/linux/irq.h | 5 ----- kernel/irq/Kconfig | 3 --- kernel/irq/internals.h | 2 -- kernel/irq/manage.c | 9 +++------ 4 files changed, 3 insertions(+), 16 deletions(-) diff --git a/include/linux/irq.h b/include/linux/irq.h index 2899905bfac7..ab708f27a33b 100644 --- a/include/linux/irq.h +++ b/include/linux/irq.h @@ -78,13 +78,8 @@ typedef void (*irq_flow_handler_t)(unsigned int irq, IRQ_NOAUTOEN | IRQ_MOVE_PCNTXT | IRQ_LEVEL | IRQ_NO_BALANCING | \ IRQ_PER_CPU | IRQ_NESTED_THREAD) -#ifdef CONFIG_IRQ_PER_CPU # define CHECK_IRQ_PER_CPU(var) ((var) & IRQ_PER_CPU) # define IRQ_NO_BALANCING_MASK (IRQ_PER_CPU | IRQ_NO_BALANCING) -#else -# define CHECK_IRQ_PER_CPU(var) 0 -# define IRQ_NO_BALANCING_MASK IRQ_NO_BALANCING -#endif /* * Return value for chip->irq_set_affinity() diff --git a/kernel/irq/Kconfig b/kernel/irq/Kconfig index 9e2256de1d1a..48ad25f5fa59 100644 --- a/kernel/irq/Kconfig +++ b/kernel/irq/Kconfig @@ -32,9 +32,6 @@ config GENERIC_PENDING_IRQ config AUTO_IRQ_AFFINITY def_bool n -config IRQ_PER_CPU - def_bool n - config HARDIRQS_SW_RESEND def_bool n diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h index e93e6090cd47..9e32b3d35d35 100644 --- a/kernel/irq/internals.h +++ b/kernel/irq/internals.h @@ -163,9 +163,7 @@ static inline void print_irq_desc(unsigned int irq, struct irq_desc *desc) } P(IRQ_LEVEL); -#ifdef CONFIG_IRQ_PER_CPU P(IRQ_PER_CPU); -#endif P(IRQ_NOPROBE); P(IRQ_NOREQUEST); P(IRQ_NOAUTOEN); diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 9a99c471d470..056aa49698b4 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -865,12 +865,10 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) goto mismatch; } -#if defined(CONFIG_IRQ_PER_CPU) /* All handlers must agree on per-cpuness */ if ((old->flags & IRQF_PERCPU) != (new->flags & IRQF_PERCPU)) goto mismatch; -#endif /* add new interrupt at end of irq queue */ do { @@ -894,15 +892,14 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) goto out_mask; } else compat_irq_chip_set_default_handler(desc); -#if defined(CONFIG_IRQ_PER_CPU) - if (new->flags & IRQF_PERCPU) - desc->status |= IRQ_PER_CPU; -#endif desc->istate &= ~(IRQS_AUTODETECT | IRQS_SPURIOUS_DISABLED | \ IRQS_INPROGRESS | IRQS_ONESHOT | \ IRQS_WAITING); + if (new->flags & IRQF_PERCPU) + desc->status |= IRQ_PER_CPU; + if (new->flags & IRQF_ONESHOT) desc->istate |=
IRQS_ONESHOT; From 8f53f92404bead2ab2154d45c8f508880bb5d95d Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 8 Feb 2011 16:50:00 +0100 Subject: [PATCH 371/706] genirq: Make CHECK_IRQ_PER_CPU an inline and deprecate it It's too ugly and needs to go. The only users are core code and parisc. Core code does not need it and parisc gets a new check once IRQ_PER_CPU is reflected in irq_data.state. Signed-off-by: Thomas Gleixner --- include/linux/irq.h | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/include/linux/irq.h b/include/linux/irq.h index ab708f27a33b..3f607ad94220 100644 --- a/include/linux/irq.h +++ b/include/linux/irq.h @@ -78,8 +78,12 @@ typedef void (*irq_flow_handler_t)(unsigned int irq, IRQ_NOAUTOEN | IRQ_MOVE_PCNTXT | IRQ_LEVEL | IRQ_NO_BALANCING | \ IRQ_PER_CPU | IRQ_NESTED_THREAD) -# define CHECK_IRQ_PER_CPU(var) ((var) & IRQ_PER_CPU) -# define IRQ_NO_BALANCING_MASK (IRQ_PER_CPU | IRQ_NO_BALANCING) +#define IRQ_NO_BALANCING_MASK (IRQ_PER_CPU | IRQ_NO_BALANCING) + +static inline __deprecated bool CHECK_IRQ_PER_CPU(unsigned int status) +{ + return status & IRQ_PER_CPU; +} /* * Return value for chip->irq_set_affinity() From fae581e588e64a0690f3fc995e404fcacaebe772 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 8 Feb 2011 16:53:24 +0100 Subject: [PATCH 372/706] genirq: Remove CHECK_IRQ_PER_CPU from core code Signed-off-by: Thomas Gleixner --- kernel/irq/manage.c | 4 ++-- kernel/irq/migration.c | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 056aa49698b4..f1cfa271ba70 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -73,8 +73,8 @@ int irq_can_set_affinity(unsigned int irq) { struct irq_desc *desc = irq_to_desc(irq); - if (CHECK_IRQ_PER_CPU(desc->status) || !desc->irq_data.chip || - !desc->irq_data.chip->irq_set_affinity) + if ((desc->status & (IRQ_PER_CPU | IRQ_NO_BALANCING)) || + !desc->irq_data.chip || !desc->irq_data.chip->irq_set_affinity) return 0; return 1; diff --git a/kernel/irq/migration.c b/kernel/irq/migration.c index 9485ae081dcd..24f53caddf47 100644 --- a/kernel/irq/migration.c +++ b/kernel/irq/migration.c @@ -15,7 +15,7 @@ void move_masked_irq(int irq) /* * Paranoia: cpu-local interrupts shouldn't be calling in here anyway. */ - if (CHECK_IRQ_PER_CPU(desc->status)) { + if (desc->status & (IRQ_PER_CPU | IRQ_NO_BALANCING)) { WARN_ON(1); return; } From 1ce6068dac1924f7095be5850481e790cbf1b3c1 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 9 Feb 2011 20:44:21 +0100 Subject: [PATCH 373/706] genirq: Move debug code to separate header It'll break once I undefine the constants.
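The header only carries the P()/PS() printout helpers; as a separate include it can be pulled in between compat.h and settings.h, i.e. before settings.h poisons the deprecated IRQ_* constants with GOT_YOU_MORON:

        #define P(f)  if (desc->status & f) printk("%14s set\n", #f)
        #define PS(f) if (desc->istate & f) printk("%14s set\n", #f)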
Signed-off-by: Thomas Gleixner --- kernel/irq/debug.h | 40 +++++++++++++++++++++++++++++++++++ kernel/irq/internals.h | 48 ++++-------------------------------------- 2 files changed, 44 insertions(+), 44 deletions(-) create mode 100644 kernel/irq/debug.h diff --git a/kernel/irq/debug.h b/kernel/irq/debug.h new file mode 100644 index 000000000000..d1a33b7fa61d --- /dev/null +++ b/kernel/irq/debug.h @@ -0,0 +1,40 @@ +/* + * Debugging printout: + */ + +#include <linux/kallsyms.h> + +#define P(f) if (desc->status & f) printk("%14s set\n", #f) +#define PS(f) if (desc->istate & f) printk("%14s set\n", #f) + +static inline void print_irq_desc(unsigned int irq, struct irq_desc *desc) +{ + printk("irq %d, desc: %p, depth: %d, count: %d, unhandled: %d\n", + irq, desc, desc->depth, desc->irq_count, desc->irqs_unhandled); + printk("->handle_irq(): %p, ", desc->handle_irq); + print_symbol("%s\n", (unsigned long)desc->handle_irq); + printk("->irq_data.chip(): %p, ", desc->irq_data.chip); + print_symbol("%s\n", (unsigned long)desc->irq_data.chip); + printk("->action(): %p\n", desc->action); + if (desc->action) { + printk("->action->handler(): %p, ", desc->action->handler); + print_symbol("%s\n", (unsigned long)desc->action->handler); + } + + P(IRQ_LEVEL); + P(IRQ_PER_CPU); + P(IRQ_NOPROBE); + P(IRQ_NOREQUEST); + P(IRQ_NOAUTOEN); + + PS(IRQS_AUTODETECT); + PS(IRQS_INPROGRESS); + PS(IRQS_REPLAY); + PS(IRQS_WAITING); + PS(IRQS_DISABLED); + PS(IRQS_PENDING); + PS(IRQS_MASKED); +} + +#undef P +#undef PS diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h index 9e32b3d35d35..b2ba59e73f21 100644 --- a/kernel/irq/internals.h +++ b/kernel/irq/internals.h @@ -13,9 +13,6 @@ # define IRQ_BITMAP_BITS NR_IRQS #endif -#include "compat.h" -#include "settings.h" - #define istate core_internal_state__do_not_mess_with_it extern int noirqdebug; @@ -66,6 +63,10 @@ enum { IRQS_WAKEUP = 0x00001000, }; +#include "compat.h" +#include "debug.h" +#include "settings.h" + #define irq_data_to_desc(data) container_of(data, struct irq_desc, irq_data) /* Set default functions for irq_chip structures: */ @@ -138,44 +139,3 @@ static inline void irqd_clr_move_pending(struct irq_data *d) d->state_use_accessors &= ~IRQD_SETAFFINITY_PENDING; irq_compat_clr_move_pending(irq_data_to_desc(d)); } - -/* - * Debugging printout: - */ - -#include <linux/kallsyms.h> - -#define P(f) if (desc->status & f) printk("%14s set\n", #f) -#define PS(f) if (desc->istate & f) printk("%14s set\n", #f) - -static inline void print_irq_desc(unsigned int irq, struct irq_desc *desc) -{ - printk("irq %d, desc: %p, depth: %d, count: %d, unhandled: %d\n", - irq, desc, desc->depth, desc->irq_count, desc->irqs_unhandled); - printk("->handle_irq(): %p, ", desc->handle_irq); - print_symbol("%s\n", (unsigned long)desc->handle_irq); - printk("->irq_data.chip(): %p, ", desc->irq_data.chip); - print_symbol("%s\n", (unsigned long)desc->irq_data.chip); - printk("->action(): %p\n", desc->action); - if (desc->action) { - printk("->action->handler(): %p, ", desc->action->handler); - print_symbol("%s\n", (unsigned long)desc->action->handler); - } - - P(IRQ_LEVEL); - P(IRQ_PER_CPU); - P(IRQ_NOPROBE); - P(IRQ_NOREQUEST); - P(IRQ_NOAUTOEN); - - PS(IRQS_AUTODETECT); - PS(IRQS_INPROGRESS); - PS(IRQS_REPLAY); - PS(IRQS_WAITING); - PS(IRQS_DISABLED); - PS(IRQS_PENDING); - PS(IRQS_MASKED); -} - -#undef P -#undef PS From a005677b3dd05decdd8880cf3044ae709856f58f Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 8 Feb 2011 17:11:03 +0100 Subject: [PATCH 374/706] genirq: Mirror IRQ_PER_CPU and IRQ_NO_BALANCING in
irq_data.state That's the right data structure to look at for arch code. Accessor functions are provided. irqd_is_per_cpu(irqdata); irqd_can_balance(irqdata); Coders who access them directly will be tracked down and slapped with stinking trouts. Signed-off-by: Thomas Gleixner --- include/linux/irq.h | 16 +++++++++++++++- kernel/irq/chip.c | 15 +++++++++------ kernel/irq/internals.h | 11 +++++++++++ kernel/irq/manage.c | 16 ++++++++++------ kernel/irq/migration.c | 2 +- kernel/irq/settings.h | 36 ++++++++++++++++++++++++++++++++++++ kernel/irq/spurious.c | 3 ++- 7 files changed, 84 insertions(+), 15 deletions(-) diff --git a/include/linux/irq.h b/include/linux/irq.h index 3f607ad94220..d5312e6fe1aa 100644 --- a/include/linux/irq.h +++ b/include/linux/irq.h @@ -132,10 +132,14 @@ struct irq_data { * Bit masks for irq_data.state * * IRQD_SETAFFINITY_PENDING - Affinity setting is pending + * IRQD_NO_BALANCING - Balancing disabled for this IRQ + * IRQD_PER_CPU - Interrupt is per cpu */ enum { /* Bit 0 - 7 reserved for TYPE will use later */ - IRQD_SETAFFINITY_PENDING = (1 << 8), + IRQD_SETAFFINITY_PENDING = (1 << 8), + IRQD_NO_BALANCING = (1 << 10), + IRQD_PER_CPU = (1 << 11), }; static inline bool irqd_is_setaffinity_pending(struct irq_data *d) @@ -143,6 +147,16 @@ static inline bool irqd_is_setaffinity_pending(struct irq_data *d) return d->state_use_accessors & IRQD_SETAFFINITY_PENDING; } +static inline bool irqd_is_per_cpu(struct irq_data *d) +{ + return d->state_use_accessors & IRQD_PER_CPU; +} + +static inline bool irqd_can_balance(struct irq_data *d) +{ + return !(d->state_use_accessors & (IRQD_PER_CPU | IRQD_NO_BALANCING)); +} + /** * struct irq_chip - hardware interrupt chip descriptor * diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index 73b2e7e00934..b8aa3dfe8301 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -706,12 +706,15 @@ void irq_modify_status(unsigned int irq, unsigned long clr, unsigned long set) if (!desc) return; - /* Sanitize flags */ - set &= IRQF_MODIFY_MASK; - clr &= IRQF_MODIFY_MASK; - raw_spin_lock_irqsave(&desc->lock, flags); - desc->status &= ~clr; - desc->status |= set; + + irq_settings_clr_and_set(desc, clr, set); + + irqd_clear(&desc->irq_data, IRQD_NO_BALANCING | IRQD_PER_CPU); + if (irq_settings_has_no_balance_set(desc)) + irqd_set(&desc->irq_data, IRQD_NO_BALANCING); + if (irq_settings_is_per_cpu(desc)) + irqd_set(&desc->irq_data, IRQD_PER_CPU); + raw_spin_unlock_irqrestore(&desc->lock, flags); } diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h index b2ba59e73f21..a80b44d2735e 100644 --- a/kernel/irq/internals.h +++ b/kernel/irq/internals.h @@ -139,3 +139,14 @@ static inline void irqd_clr_move_pending(struct irq_data *d) d->state_use_accessors &= ~IRQD_SETAFFINITY_PENDING; irq_compat_clr_move_pending(irq_data_to_desc(d)); } + +static inline void irqd_clear(struct irq_data *d, unsigned int mask) +{ + d->state_use_accessors &= ~mask; +} + +static inline void irqd_set(struct irq_data *d, unsigned int mask) +{ + d->state_use_accessors |= mask; +} + diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index f1cfa271ba70..84a0a9c22226 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -73,8 +73,8 @@ int irq_can_set_affinity(unsigned int irq) { struct irq_desc *desc = irq_to_desc(irq); - if ((desc->status & (IRQ_PER_CPU | IRQ_NO_BALANCING)) || - !desc->irq_data.chip || !desc->irq_data.chip->irq_set_affinity) + if (!irqd_can_balance(&desc->irq_data) || !desc->irq_data.chip || + !desc->irq_data.chip->irq_set_affinity) return 0; 
return 1; @@ -897,8 +897,10 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) IRQS_INPROGRESS | IRQS_ONESHOT | \ IRQS_WAITING); - if (new->flags & IRQF_PERCPU) - desc->status |= IRQ_PER_CPU; + if (new->flags & IRQF_PERCPU) { + irqd_set(&desc->irq_data, IRQD_PER_CPU); + irq_settings_set_per_cpu(desc); + } if (new->flags & IRQF_ONESHOT) desc->istate |= IRQS_ONESHOT; @@ -910,8 +912,10 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) desc->depth = 1; /* Exclude IRQ from balancing if requested */ - if (new->flags & IRQF_NOBALANCING) - desc->status |= IRQ_NO_BALANCING; + if (new->flags & IRQF_NOBALANCING) { + irq_settings_set_no_balancing(desc); + irqd_set(&desc->irq_data, IRQD_NO_BALANCING); + } /* Set default affinity mask once everything is setup */ setup_affinity(irq, desc, mask); diff --git a/kernel/irq/migration.c b/kernel/irq/migration.c index 24f53caddf47..7a93c6b88b25 100644 --- a/kernel/irq/migration.c +++ b/kernel/irq/migration.c @@ -15,7 +15,7 @@ void move_masked_irq(int irq) /* * Paranoia: cpu-local interrupts shouldn't be calling in here anyway. */ - if (desc->status & (IRQ_PER_CPU | IRQ_NO_BALANCING)) { + if (!irqd_can_balance(&desc->irq_data)) { WARN_ON(1); return; } diff --git a/kernel/irq/settings.h b/kernel/irq/settings.h index bb104a2dce73..ba0fffe410ad 100644 --- a/kernel/irq/settings.h +++ b/kernel/irq/settings.h @@ -4,6 +4,9 @@ */ enum { _IRQ_DEFAULT_INIT_FLAGS = IRQ_DEFAULT_INIT_FLAGS, + _IRQ_PER_CPU = IRQ_PER_CPU, + _IRQ_NO_BALANCING = IRQ_NO_BALANCING, + _IRQF_MODIFY_MASK = IRQF_MODIFY_MASK, }; #undef IRQ_INPROGRESS @@ -22,3 +25,36 @@ enum { #define IRQ_WAKEUP GOT_YOU_MORON #undef IRQ_MOVE_PENDING #define IRQ_MOVE_PENDING GOT_YOU_MORON +#undef IRQ_PER_CPU +#define IRQ_PER_CPU GOT_YOU_MORON +#undef IRQ_NO_BALANCING +#define IRQ_NO_BALANCING GOT_YOU_MORON +#undef IRQF_MODIFY_MASK +#define IRQF_MODIFY_MASK GOT_YOU_MORON + +static inline void +irq_settings_clr_and_set(struct irq_desc *desc, u32 clr, u32 set) +{ + desc->status &= ~(clr & _IRQF_MODIFY_MASK); + desc->status |= (set & _IRQF_MODIFY_MASK); +} + +static inline bool irq_settings_is_per_cpu(struct irq_desc *desc) +{ + return desc->status & _IRQ_PER_CPU; +} + +static inline void irq_settings_set_per_cpu(struct irq_desc *desc) +{ + desc->status |= _IRQ_PER_CPU; +} + +static inline void irq_settings_set_no_balancing(struct irq_desc *desc) +{ + desc->status |= _IRQ_NO_BALANCING; +} + +static inline bool irq_settings_has_no_balance_set(struct irq_desc *desc) +{ + return desc->status & _IRQ_NO_BALANCING; +} diff --git a/kernel/irq/spurious.c b/kernel/irq/spurious.c index 692ce2bae302..226ed7d26a84 100644 --- a/kernel/irq/spurious.c +++ b/kernel/irq/spurious.c @@ -68,7 +68,8 @@ static int try_one_irq(int irq, struct irq_desc *desc, bool force) raw_spin_lock(&desc->lock); /* PER_CPU and nested thread interrupts are never polled */ - if (desc->status & (IRQ_PER_CPU | IRQ_NESTED_THREAD)) + if (irq_settings_is_per_cpu(desc) || + (desc->status & IRQ_NESTED_THREAD)) goto out; /* From bce43032ad79fae0ce5b6174ce1321e643ceb54b Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 10 Feb 2011 22:37:41 +0100 Subject: [PATCH 375/706] genirq: Reuse existing can set affinity check Add a !desc check while at it.
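For illustration only (foo_store_affinity() is a made-up caller, modeled on irq_affinity_proc_write() below): with the existing check reused, a caller needs a single guard that covers the bogus-irq, per-cpu and no-chip-support cases at once.

	static int foo_store_affinity(unsigned int irq)
	{
		if (!irq_can_set_affinity(irq) || no_irq_affinity)
			return -EIO;
		/* ... parse and apply the new affinity mask ... */
		return 0;
	}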
Signed-off-by: Thomas Gleixner --- kernel/irq/manage.c | 4 ++-- kernel/irq/proc.c | 3 +-- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 84a0a9c22226..550ae97a0040 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -73,8 +73,8 @@ int irq_can_set_affinity(unsigned int irq) { struct irq_desc *desc = irq_to_desc(irq); - if (!irqd_can_balance(&desc->irq_data) || !desc->irq_data.chip || - !desc->irq_data.chip->irq_set_affinity) + if (!desc || !irqd_can_balance(&desc->irq_data) || + !desc->irq_data.chip || !desc->irq_data.chip->irq_set_affinity) return 0; return 1; diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c index afe4e6803148..4cc2e5ed0bec 100644 --- a/kernel/irq/proc.c +++ b/kernel/irq/proc.c @@ -66,8 +66,7 @@ static ssize_t irq_affinity_proc_write(struct file *file, cpumask_var_t new_value; int err; - if (!irq_to_desc(irq)->irq_data.chip->irq_set_affinity || no_irq_affinity || - irq_balancing_disabled(irq)) + if (!irq_can_set_affinity(irq) || no_irq_affinity) return -EIO; if (!alloc_cpumask_var(&new_value, GFP_KERNEL)) From 2bdd10558c8d93009cb6c32ce9e30800fbb08add Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 8 Feb 2011 17:22:00 +0100 Subject: [PATCH 376/706] genirq: Move IRQ_AFFINITY_SET to core Keep status in sync until last abuser is gone. Signed-off-by: Thomas Gleixner --- include/linux/irq.h | 9 ++++++++- kernel/irq/compat.h | 11 +++++++++++ kernel/irq/internals.h | 4 ++++ kernel/irq/manage.c | 11 +++++++---- kernel/irq/settings.h | 2 ++ 5 files changed, 32 insertions(+), 5 deletions(-) diff --git a/include/linux/irq.h b/include/linux/irq.h index d5312e6fe1aa..8da1782ecfca 100644 --- a/include/linux/irq.h +++ b/include/linux/irq.h @@ -60,6 +60,7 @@ typedef void (*irq_flow_handler_t)(unsigned int irq, #define IRQ_MASKED 0x00002000 /* DEPRECATED */ /* DEPRECATED use irq_setaffinity_pending() instead*/ #define IRQ_MOVE_PENDING 0x00004000 +#define IRQ_AFFINITY_SET 0x02000000 /* DEPRECATED */ #endif #define IRQ_LEVEL 0x00008000 /* IRQ level triggered */ @@ -70,7 +71,6 @@ typedef void (*irq_flow_handler_t)(unsigned int irq, #define IRQ_WAKEUP 0x00100000 /* IRQ triggers system wakeup */ #define IRQ_NO_BALANCING 0x00400000 /* IRQ is excluded from balancing */ #define IRQ_MOVE_PCNTXT 0x01000000 /* IRQ migration from process context */ -#define IRQ_AFFINITY_SET 0x02000000 /* IRQ affinity was set from userspace*/ #define IRQ_NESTED_THREAD 0x10000000 /* IRQ is nested into another, no own handler thread */ #define IRQF_MODIFY_MASK \ @@ -134,12 +134,14 @@ struct irq_data { * IRQD_SETAFFINITY_PENDING - Affinity setting is pending * IRQD_NO_BALANCING - Balancing disabled for this IRQ * IRQD_PER_CPU - Interrupt is per cpu + * IRQD_AFFINITY_SET - Interrupt affinity was set */ enum { /* Bit 0 - 7 reserved for TYPE will use later */ IRQD_SETAFFINITY_PENDING = (1 << 8), IRQD_NO_BALANCING = (1 << 10), IRQD_PER_CPU = (1 << 11), + IRQD_AFFINITY_SET = (1 << 12), }; static inline bool irqd_is_setaffinity_pending(struct irq_data *d) @@ -157,6 +159,11 @@ static inline bool irqd_can_balance(struct irq_data *d) return !(d->state_use_accessors & (IRQD_PER_CPU | IRQD_NO_BALANCING)); } +static inline bool irqd_affinity_was_set(struct irq_data *d) +{ + return d->state_use_accessors & IRQD_AFFINITY_SET; +} + /** * struct irq_chip - hardware interrupt chip descriptor * diff --git a/kernel/irq/compat.h b/kernel/irq/compat.h index 5e33aadadacc..6bbaf66aca85 100644 --- a/kernel/irq/compat.h +++ b/kernel/irq/compat.h @@ -46,6 +46,15 @@ 
static inline void irq_compat_clr_move_pending(struct irq_desc *desc) { desc->status &= ~IRQ_MOVE_PENDING; } +static inline void irq_compat_set_affinity(struct irq_desc *desc) +{ + desc->status |= IRQ_AFFINITY_SET; +} + +static inline void irq_compat_clr_affinity(struct irq_desc *desc) +{ + desc->status &= ~IRQ_AFFINITY_SET; +} #else static inline void irq_compat_set_progress(struct irq_desc *desc) { } static inline void irq_compat_clr_progress(struct irq_desc *desc) { } @@ -57,5 +66,7 @@ static inline void irq_compat_set_masked(struct irq_desc *desc) { } static inline void irq_compat_clr_masked(struct irq_desc *desc) { } static inline void irq_compat_set_move_pending(struct irq_desc *desc) { } static inline void irq_compat_clr_move_pending(struct irq_desc *desc) { } +static inline void irq_compat_set_affinity(struct irq_desc *desc) { } +static inline void irq_compat_clr_affinity(struct irq_desc *desc) { } #endif diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h index a80b44d2735e..6776453c454c 100644 --- a/kernel/irq/internals.h +++ b/kernel/irq/internals.h @@ -150,3 +150,7 @@ static inline void irqd_set(struct irq_data *d, unsigned int mask) d->state_use_accessors |= mask; } +static inline bool irqd_has_set(struct irq_data *d, unsigned int mask) +{ + return d->state_use_accessors & mask; +} diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 550ae97a0040..8246afc81956 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -164,7 +164,8 @@ int irq_set_affinity(unsigned int irq, const struct cpumask *mask) kref_get(&desc->affinity_notify->kref); schedule_work(&desc->affinity_notify->work); } - desc->status |= IRQ_AFFINITY_SET; + irq_compat_set_affinity(desc); + irqd_set(&desc->irq_data, IRQD_AFFINITY_SET); raw_spin_unlock_irqrestore(&desc->lock, flags); return ret; } @@ -272,12 +273,14 @@ setup_affinity(unsigned int irq, struct irq_desc *desc, struct cpumask *mask) * Preserve an userspace affinity setup, but make sure that * one of the targets is online. */ - if (desc->status & (IRQ_AFFINITY_SET)) { + if (irqd_has_set(&desc->irq_data, IRQD_AFFINITY_SET)) { if (cpumask_intersects(desc->irq_data.affinity, cpu_online_mask)) set = desc->irq_data.affinity; - else - desc->status &= ~IRQ_AFFINITY_SET; + else { + irq_compat_clr_affinity(desc); + irqd_clear(&desc->irq_data, IRQD_AFFINITY_SET); + } } cpumask_and(mask, cpu_online_mask, set); diff --git a/kernel/irq/settings.h b/kernel/irq/settings.h index ba0fffe410ad..da5acb446b1c 100644 --- a/kernel/irq/settings.h +++ b/kernel/irq/settings.h @@ -29,6 +29,8 @@ enum { #define IRQ_PER_CPU GOT_YOU_MORON #undef IRQ_NO_BALANCING #define IRQ_NO_BALANCING GOT_YOU_MORON +#undef IRQ_AFFINITY_SET +#define IRQ_AFFINITY_SET GOT_YOU_MORON #undef IRQF_MODIFY_MASK #define IRQF_MODIFY_MASK GOT_YOU_MORON From 876dbd4cc1b35c1a4cb96a2be1d43ea0eabce3b4 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 8 Feb 2011 17:28:12 +0100 Subject: [PATCH 377/706] genirq: Mirror irq trigger type bits in irq_data.state That's the data structure chip functions get provided. Also allow them to signal the core code that they updated the flags in irq_data.state by returning IRQ_SET_MASK_OK_NOCOPY. The default is unchanged. The type bits should be accessed via: val = irqd_get_trigger_type(irqdata); and irqd_set_trigger_type(irqdata, val); Coders who access them directly will be tracked down and slapped with stinking trouts. 
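A sketch of what a chip implementation can look like under this contract (all foo_* names are hypothetical):

	static int foo_set_type(struct irq_data *d, unsigned int type)
	{
		if (type & ~IRQ_TYPE_EDGE_BOTH)
			return -EINVAL;		/* hardware is edge-only */

		foo_hw_program_edge(d, type);	/* hypothetical hw poke */

		/* Record the effective type in irq_data.state ourselves
		 * and tell the core not to copy the requested bits over
		 * our update. */
		irqd_set_trigger_type(d, type);
		return IRQ_SET_MASK_OK_NOCOPY;
	}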
Signed-off-by: Thomas Gleixner --- include/linux/irq.h | 26 ++++++++++++++++++++++++- kernel/irq/chip.c | 5 ++++- kernel/irq/manage.c | 44 ++++++++++++++++++++++++++----------------- kernel/irq/resend.c | 2 +- kernel/irq/settings.h | 30 +++++++++++++++++++++++++++++ 5 files changed, 87 insertions(+), 20 deletions(-) diff --git a/include/linux/irq.h b/include/linux/irq.h index 8da1782ecfca..be73c0a3c19d 100644 --- a/include/linux/irq.h +++ b/include/linux/irq.h @@ -46,7 +46,9 @@ typedef void (*irq_flow_handler_t)(unsigned int irq, #define IRQ_TYPE_EDGE_BOTH (IRQ_TYPE_EDGE_FALLING | IRQ_TYPE_EDGE_RISING) #define IRQ_TYPE_LEVEL_HIGH 0x00000004 /* Level high type */ #define IRQ_TYPE_LEVEL_LOW 0x00000008 /* Level low type */ +#define IRQ_TYPE_LEVEL_MASK (IRQ_TYPE_LEVEL_LOW | IRQ_TYPE_LEVEL_HIGH) #define IRQ_TYPE_SENSE_MASK 0x0000000f /* Mask of the above */ + #define IRQ_TYPE_PROBE 0x00000010 /* Probing in progress */ /* Internal flags */ @@ -131,17 +133,20 @@ struct irq_data { /* * Bit masks for irq_data.state * + * IRQD_TRIGGER_MASK - Mask for the trigger type bits * IRQD_SETAFFINITY_PENDING - Affinity setting is pending * IRQD_NO_BALANCING - Balancing disabled for this IRQ * IRQD_PER_CPU - Interrupt is per cpu * IRQD_AFFINITY_SET - Interrupt affinity was set + * IRQD_LEVEL - Interrupt is level triggered */ enum { - /* Bit 0 - 7 reserved for TYPE will use later */ + IRQD_TRIGGER_MASK = 0xf, IRQD_SETAFFINITY_PENDING = (1 << 8), IRQD_NO_BALANCING = (1 << 10), IRQD_PER_CPU = (1 << 11), IRQD_AFFINITY_SET = (1 << 12), + IRQD_LEVEL = (1 << 13), }; static inline bool irqd_is_setaffinity_pending(struct irq_data *d) @@ -164,6 +169,25 @@ static inline bool irqd_affinity_was_set(struct irq_data *d) return d->state_use_accessors & IRQD_AFFINITY_SET; } +static inline u32 irqd_get_trigger_type(struct irq_data *d) +{ + return d->state_use_accessors & IRQD_TRIGGER_MASK; +} + +/* + * Must only be called inside irq_chip.irq_set_type() functions. 
+ */ +static inline void irqd_set_trigger_type(struct irq_data *d, u32 type) +{ + d->state_use_accessors &= ~IRQD_TRIGGER_MASK; + d->state_use_accessors |= type & IRQD_TRIGGER_MASK; +} + +static inline bool irqd_is_level_type(struct irq_data *d) +{ + return d->state_use_accessors & IRQD_LEVEL; +} + /** * struct irq_chip - hardware interrupt chip descriptor * diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index b8aa3dfe8301..9c9b573a718e 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -710,11 +710,14 @@ void irq_modify_status(unsigned int irq, unsigned long clr, unsigned long set) irq_settings_clr_and_set(desc, clr, set); - irqd_clear(&desc->irq_data, IRQD_NO_BALANCING | IRQD_PER_CPU); + irqd_clear(&desc->irq_data, IRQD_NO_BALANCING | IRQD_PER_CPU | + IRQD_TRIGGER_MASK | IRQD_LEVEL); if (irq_settings_has_no_balance_set(desc)) irqd_set(&desc->irq_data, IRQD_NO_BALANCING); if (irq_settings_is_per_cpu(desc)) irqd_set(&desc->irq_data, IRQD_PER_CPU); + irqd_set(&desc->irq_data, irq_settings_get_trigger_mask(desc)); + raw_spin_unlock_irqrestore(&desc->lock, flags); } diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 8246afc81956..9ae758ed8e66 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -567,23 +567,32 @@ int __irq_set_trigger(struct irq_desc *desc, unsigned int irq, return 0; } + flags &= IRQ_TYPE_SENSE_MASK; /* caller masked out all except trigger mode flags */ ret = chip->irq_set_type(&desc->irq_data, flags); - if (ret) - pr_err("setting trigger mode %lu for irq %u failed (%pF)\n", - flags, irq, chip->irq_set_type); - else { - if (flags & (IRQ_TYPE_LEVEL_LOW | IRQ_TYPE_LEVEL_HIGH)) - flags |= IRQ_LEVEL; - /* note that IRQF_TRIGGER_MASK == IRQ_TYPE_SENSE_MASK */ - desc->status &= ~(IRQ_LEVEL | IRQ_TYPE_SENSE_MASK); - desc->status |= flags; + switch (ret) { + case IRQ_SET_MASK_OK: + irqd_clear(&desc->irq_data, IRQD_TRIGGER_MASK); + irqd_set(&desc->irq_data, flags); + + case IRQ_SET_MASK_OK_NOCOPY: + flags = irqd_get_trigger_type(&desc->irq_data); + irq_settings_set_trigger_mask(desc, flags); + irqd_clear(&desc->irq_data, IRQD_LEVEL); + irq_settings_clr_level(desc); + if (flags & IRQ_TYPE_LEVEL_MASK) { + irq_settings_set_level(desc); + irqd_set(&desc->irq_data, IRQD_LEVEL); + } if (chip != desc->irq_data.chip) irq_chip_set_defaults(desc->irq_data.chip); + return 0; + default: + pr_err("setting trigger mode %lu for irq %u failed (%pF)\n", + flags, irq, chip->irq_set_type); } - return ret; } @@ -923,13 +932,14 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) /* Set default affinity mask once everything is setup */ setup_affinity(irq, desc, mask); - } else if ((new->flags & IRQF_TRIGGER_MASK) - && (new->flags & IRQF_TRIGGER_MASK) - != (desc->status & IRQ_TYPE_SENSE_MASK)) { - /* hope the handler works with the actual trigger mode... 
*/ - pr_warning("IRQ %d uses trigger mode %d; requested %d\n", - irq, (int)(desc->status & IRQ_TYPE_SENSE_MASK), - (int)(new->flags & IRQF_TRIGGER_MASK)); + } else if (new->flags & IRQF_TRIGGER_MASK) { + unsigned int nmsk = new->flags & IRQF_TRIGGER_MASK; + unsigned int omsk = irq_settings_get_trigger_mask(desc); + + if (nmsk != omsk) + /* hope the handler works with current trigger mode */ + pr_warning("IRQ %d uses trigger mode %u; requested %u\n", + irq, nmsk, omsk); } new->irq = irq; diff --git a/kernel/irq/resend.c b/kernel/irq/resend.c index ff1fea060014..ad683a99b1ec 100644 --- a/kernel/irq/resend.c +++ b/kernel/irq/resend.c @@ -60,7 +60,7 @@ void check_irq_resend(struct irq_desc *desc, unsigned int irq) * interrupts are resent by hardware when they are still * active. */ - if (desc->status & IRQ_LEVEL) + if (irq_settings_is_level(desc)) return; if (desc->istate & IRQS_REPLAY) return; diff --git a/kernel/irq/settings.h b/kernel/irq/settings.h index da5acb446b1c..2201f2aaa9a0 100644 --- a/kernel/irq/settings.h +++ b/kernel/irq/settings.h @@ -5,6 +5,7 @@ enum { _IRQ_DEFAULT_INIT_FLAGS = IRQ_DEFAULT_INIT_FLAGS, _IRQ_PER_CPU = IRQ_PER_CPU, + _IRQ_LEVEL = IRQ_LEVEL, _IRQ_NO_BALANCING = IRQ_NO_BALANCING, _IRQF_MODIFY_MASK = IRQF_MODIFY_MASK, }; @@ -31,6 +32,8 @@ enum { #define IRQ_NO_BALANCING GOT_YOU_MORON #undef IRQ_AFFINITY_SET #define IRQ_AFFINITY_SET GOT_YOU_MORON +#undef IRQ_LEVEL +#define IRQ_LEVEL GOT_YOU_MORON #undef IRQF_MODIFY_MASK #define IRQF_MODIFY_MASK GOT_YOU_MORON @@ -60,3 +63,30 @@ static inline bool irq_settings_has_no_balance_set(struct irq_desc *desc) { return desc->status & _IRQ_NO_BALANCING; } + +static inline u32 irq_settings_get_trigger_mask(struct irq_desc *desc) +{ + return desc->status & IRQ_TYPE_SENSE_MASK; +} + +static inline void +irq_settings_set_trigger_mask(struct irq_desc *desc, u32 mask) +{ + desc->status &= ~IRQ_TYPE_SENSE_MASK; + desc->status |= mask & IRQ_TYPE_SENSE_MASK; +} + +static inline bool irq_settings_is_level(struct irq_desc *desc) +{ + return desc->status & _IRQ_LEVEL; +} + +static inline void irq_settings_clr_level(struct irq_desc *desc) +{ + desc->status &= ~_IRQ_LEVEL; +} + +static inline void irq_settings_set_level(struct irq_desc *desc) +{ + desc->status |= _IRQ_LEVEL; +} From 1ccb4e612f68ceefb888c2c6c1def6294ea8666d Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 9 Feb 2011 14:44:17 +0100 Subject: [PATCH 378/706] genirq: Wrap the remaining IRQ_* flags Use wrappers to keep them away from the core code.
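The recipe for each wrapped flag is mechanical; a sketch for a made-up flag (illustrative only):

	enum {
		_IRQ_EXAMPLE	= IRQ_EXAMPLE,	/* private copy for the wrappers */
	};

	#undef IRQ_EXAMPLE
	#define IRQ_EXAMPLE GOT_YOU_MORON	/* poison direct use */

	static inline bool irq_settings_is_example(struct irq_desc *desc)
	{
		return desc->status & _IRQ_EXAMPLE;
	}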
Signed-off-by: Thomas Gleixner --- kernel/irq/autoprobe.c | 4 +-- kernel/irq/chip.c | 3 ++- kernel/irq/manage.c | 14 +++++----- kernel/irq/settings.h | 58 ++++++++++++++++++++++++++++++++++++++++++ kernel/irq/spurious.c | 3 +-- 5 files changed, 70 insertions(+), 12 deletions(-) diff --git a/kernel/irq/autoprobe.c b/kernel/irq/autoprobe.c index aab64c262726..c8bbc4fabaab 100644 --- a/kernel/irq/autoprobe.c +++ b/kernel/irq/autoprobe.c @@ -45,7 +45,7 @@ unsigned long probe_irq_on(void) */ for_each_irq_desc_reverse(i, desc) { raw_spin_lock_irq(&desc->lock); - if (!desc->action && !(desc->status & IRQ_NOPROBE)) { + if (!desc->action && irq_settings_can_probe(desc)) { /* * An old-style architecture might still have * the handle_bad_irq handler there: @@ -74,7 +74,7 @@ unsigned long probe_irq_on(void) */ for_each_irq_desc_reverse(i, desc) { raw_spin_lock_irq(&desc->lock); - if (!desc->action && !(desc->status & IRQ_NOPROBE)) { + if (!desc->action && irq_settings_can_probe(desc)) { desc->istate |= IRQS_AUTODETECT | IRQS_WAITING; if (irq_startup(desc)) { irq_compat_set_pending(desc); diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index 9c9b573a718e..9e9220da4deb 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -674,7 +674,8 @@ __set_irq_handler(unsigned int irq, irq_flow_handler_t handle, int is_chained, desc->name = name; if (handle != handle_bad_irq && is_chained) { - desc->status |= IRQ_NOREQUEST | IRQ_NOPROBE; + irq_settings_set_noprobe(desc); + irq_settings_set_norequest(desc); irq_startup(desc); } raw_spin_unlock_irqrestore(&desc->lock, flags); diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 9ae758ed8e66..b5de828e58d9 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -103,7 +103,7 @@ void irq_set_thread_affinity(struct irq_desc *desc) #ifdef CONFIG_GENERIC_PENDING_IRQ static inline bool irq_can_move_pcntxt(struct irq_desc *desc) { - return desc->status & IRQ_MOVE_PCNTXT; + return irq_settings_can_move_pcntxt(desc); } static inline bool irq_move_pending(struct irq_desc *desc) { @@ -411,7 +411,7 @@ void __enable_irq(struct irq_desc *desc, unsigned int irq, bool resume) if (desc->istate & IRQS_SUSPENDED) goto err_out; /* Prevent probing on this irq: */ - desc->status |= IRQ_NOPROBE; + irq_settings_set_noprobe(desc); irq_enable(desc); check_irq_resend(desc, irq); /* fall-through */ @@ -526,7 +526,7 @@ int can_request_irq(unsigned int irq, unsigned long irqflags) if (!desc) return 0; - if (desc->status & IRQ_NOREQUEST) + if (!irq_settings_can_request(desc)) return 0; raw_spin_lock_irqsave(&desc->lock, flags); @@ -820,7 +820,7 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) * Check whether the interrupt nests into another interrupt * thread. 
*/ - nested = desc->status & IRQ_NESTED_THREAD; + nested = irq_settings_is_nested_thread(desc); if (nested) { if (!new->thread_fn) return -EINVAL; @@ -917,7 +917,7 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) if (new->flags & IRQF_ONESHOT) desc->istate |= IRQS_ONESHOT; - if (!(desc->status & IRQ_NOAUTOEN)) + if (irq_settings_can_autoenable(desc)) irq_startup(desc); else /* Undo nested disables: */ @@ -1217,7 +1217,7 @@ int request_threaded_irq(unsigned int irq, irq_handler_t handler, if (!desc) return -EINVAL; - if (desc->status & IRQ_NOREQUEST) + if (!irq_settings_can_request(desc)) return -EINVAL; if (!handler) { @@ -1292,7 +1292,7 @@ int request_any_context_irq(unsigned int irq, irq_handler_t handler, if (!desc) return -EINVAL; - if (desc->status & IRQ_NESTED_THREAD) { + if (irq_settings_is_nested_thread(desc)) { ret = request_threaded_irq(irq, NULL, handler, flags, name, dev_id); return !ret ? IRQC_IS_NESTED : ret; diff --git a/kernel/irq/settings.h b/kernel/irq/settings.h index 2201f2aaa9a0..216b6f200e7c 100644 --- a/kernel/irq/settings.h +++ b/kernel/irq/settings.h @@ -6,7 +6,12 @@ enum { _IRQ_DEFAULT_INIT_FLAGS = IRQ_DEFAULT_INIT_FLAGS, _IRQ_PER_CPU = IRQ_PER_CPU, _IRQ_LEVEL = IRQ_LEVEL, + _IRQ_NOPROBE = IRQ_NOPROBE, + _IRQ_NOREQUEST = IRQ_NOREQUEST, + _IRQ_NOAUTOEN = IRQ_NOAUTOEN, + _IRQ_MOVE_PCNTXT = IRQ_MOVE_PCNTXT, _IRQ_NO_BALANCING = IRQ_NO_BALANCING, + _IRQ_NESTED_THREAD = IRQ_NESTED_THREAD, _IRQF_MODIFY_MASK = IRQF_MODIFY_MASK, }; @@ -34,6 +39,14 @@ enum { #define IRQ_AFFINITY_SET GOT_YOU_MORON #undef IRQ_LEVEL #define IRQ_LEVEL GOT_YOU_MORON +#undef IRQ_NOPROBE +#define IRQ_NOPROBE GOT_YOU_MORON +#undef IRQ_NOREQUEST +#define IRQ_NOREQUEST GOT_YOU_MORON +#undef IRQ_NOAUTOEN +#define IRQ_NOAUTOEN GOT_YOU_MORON +#undef IRQ_NESTED_THREAD +#define IRQ_NESTED_THREAD GOT_YOU_MORON #undef IRQF_MODIFY_MASK #define IRQF_MODIFY_MASK GOT_YOU_MORON @@ -90,3 +103,48 @@ static inline void irq_settings_set_level(struct irq_desc *desc) { desc->status |= _IRQ_LEVEL; } + +static inline bool irq_settings_can_request(struct irq_desc *desc) +{ + return !(desc->status & _IRQ_NOREQUEST); +} + +static inline void irq_settings_clr_norequest(struct irq_desc *desc) +{ + desc->status &= ~_IRQ_NOREQUEST; +} + +static inline void irq_settings_set_norequest(struct irq_desc *desc) +{ + desc->status |= _IRQ_NOREQUEST; +} + +static inline bool irq_settings_can_probe(struct irq_desc *desc) +{ + return !(desc->status & _IRQ_NOPROBE); +} + +static inline void irq_settings_clr_noprobe(struct irq_desc *desc) +{ + desc->status &= ~_IRQ_NOPROBE; +} + +static inline void irq_settings_set_noprobe(struct irq_desc *desc) +{ + desc->status |= _IRQ_NOPROBE; +} + +static inline bool irq_settings_can_move_pcntxt(struct irq_desc *desc) +{ + return desc->status & _IRQ_MOVE_PCNTXT; +} + +static inline bool irq_settings_can_autoenable(struct irq_desc *desc) +{ + return !(desc->status & _IRQ_NOAUTOEN); +} + +static inline bool irq_settings_is_nested_thread(struct irq_desc *desc) +{ + return desc->status & _IRQ_NESTED_THREAD; +} diff --git a/kernel/irq/spurious.c b/kernel/irq/spurious.c index 226ed7d26a84..dd586ebf9c8c 100644 --- a/kernel/irq/spurious.c +++ b/kernel/irq/spurious.c @@ -68,8 +68,7 @@ static int try_one_irq(int irq, struct irq_desc *desc, bool force) raw_spin_lock(&desc->lock); /* PER_CPU and nested thread interrupts are never polled */ - if (irq_settings_is_per_cpu(desc) || - (desc->status & IRQ_NESTED_THREAD)) + if (irq_settings_is_per_cpu(desc) || 
irq_settings_is_nested_thread(desc)) goto out; /* From f9e4989eb8183a1f33581fa1b99274287b0639d2 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 9 Feb 2011 14:54:49 +0100 Subject: [PATCH 379/706] genirq: Force wrapped access to desc->status in core code Force the usage of wrappers by another nasty CPP substitution. Signed-off-by: Thomas Gleixner --- kernel/irq/handle.c | 6 +++--- kernel/irq/irqdesc.c | 4 ++-- kernel/irq/settings.h | 3 +++ 3 files changed, 8 insertions(+), 5 deletions(-) diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c index 6e34bdbeb26a..cb62e2d0df4e 100644 --- a/kernel/irq/handle.c +++ b/kernel/irq/handle.c @@ -55,7 +55,7 @@ irqreturn_t handle_irq_event_percpu(struct irq_desc *desc, struct irqaction *action) { irqreturn_t ret, retval = IRQ_NONE; - unsigned int status = 0, irq = desc->irq_data.irq; + unsigned int random = 0, irq = desc->irq_data.irq; do { trace_irq_handler_entry(irq, action); @@ -98,7 +98,7 @@ handle_irq_event_percpu(struct irq_desc *desc, struct irqaction *action) /* Fall through to add to randomness */ case IRQ_HANDLED: - status |= action->flags; + random |= action->flags; break; default: @@ -109,7 +109,7 @@ handle_irq_event_percpu(struct irq_desc *desc, struct irqaction *action) action = action->next; } while (action); - if (status & IRQF_SAMPLE_RANDOM) + if (random & IRQF_SAMPLE_RANDOM) add_interrupt_randomness(irq); if (!noirqdebug) diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c index 78866d050bc9..3387fbd7f2f4 100644 --- a/kernel/irq/irqdesc.c +++ b/kernel/irq/irqdesc.c @@ -79,7 +79,7 @@ static void desc_set_defaults(unsigned int irq, struct irq_desc *desc, int node) desc->irq_data.chip_data = NULL; desc->irq_data.handler_data = NULL; desc->irq_data.msi_desc = NULL; - desc->status = _IRQ_DEFAULT_INIT_FLAGS; + irq_settings_clr_and_set(desc, ~0, _IRQ_DEFAULT_INIT_FLAGS); desc->istate = IRQS_DISABLED; desc->handle_irq = handle_bad_irq; desc->depth = 1; @@ -247,7 +247,6 @@ int __init early_irq_init(void) struct irq_desc irq_desc[NR_IRQS] __cacheline_aligned_in_smp = { [0 ... NR_IRQS-1] = { - .status = _IRQ_DEFAULT_INIT_FLAGS, .istate = IRQS_DISABLED, .handle_irq = handle_bad_irq, .depth = 1, @@ -271,6 +270,7 @@ int __init early_irq_init(void) desc[i].irq_data.irq = i; desc[i].irq_data.chip = &no_irq_chip; desc[i].kstat_irqs = alloc_percpu(unsigned int); + irq_settings_clr_and_set(desc, ~0, _IRQ_DEFAULT_INIT_FLAGS); alloc_masks(desc + i, GFP_KERNEL, node); desc_smp_init(desc + i, node); lockdep_set_class(&desc[i].lock, &irq_desc_lock_class); diff --git a/kernel/irq/settings.h b/kernel/irq/settings.h index 216b6f200e7c..47bcd3b9f399 100644 --- a/kernel/irq/settings.h +++ b/kernel/irq/settings.h @@ -148,3 +148,6 @@ static inline bool irq_settings_is_nested_thread(struct irq_desc *desc) { return desc->status & _IRQ_NESTED_THREAD; } + +/* Nothing should touch desc->status from now on */ +#define status USE_THE_PROPER_WRAPPERS_YOU_MORON From 5d4d8fc9ac3e9a90bbdf90bae6864cb2c01f2208 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 8 Feb 2011 17:27:18 +0100 Subject: [PATCH 380/706] genirq: Cleanup irq.h Put the constants into an enum and document them. 
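One practical payoff of the enum form, sketched with made-up names: enumerators are real symbols which end up in the debug info, so gdb can print them symbolically, while object-like macros are gone after preprocessing (unless the kernel is built with -g3).

	enum {
		FOO_LEVEL	= (1 << 8),	/* visible to the debugger */
		FOO_PER_CPU	= (1 << 9),
	};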
Signed-off-by: Thomas Gleixner --- include/linux/irq.h | 91 ++++++++++++++++++++++++++++--------------- kernel/irq/settings.h | 16 -------- 2 files changed, 60 insertions(+), 47 deletions(-) diff --git a/include/linux/irq.h b/include/linux/irq.h index be73c0a3c19d..2e3d1e5f0408 100644 --- a/include/linux/irq.h +++ b/include/linux/irq.h @@ -36,44 +36,73 @@ typedef void (*irq_flow_handler_t)(unsigned int irq, /* * IRQ line status. * - * Bits 0-7 are reserved for the IRQF_* bits in linux/interrupt.h + * Bits 0-7 are the same as the IRQF_* bits in linux/interrupt.h + * + * IRQ_TYPE_NONE - default, unspecified type + * IRQ_TYPE_EDGE_RISING - rising edge triggered + * IRQ_TYPE_EDGE_FALLING - falling edge triggered + * IRQ_TYPE_EDGE_BOTH - rising and falling edge triggered + * IRQ_TYPE_LEVEL_HIGH - high level triggered + * IRQ_TYPE_LEVEL_LOW - low level triggered + * IRQ_TYPE_LEVEL_MASK - Mask to filter out the level bits + * IRQ_TYPE_SENSE_MASK - Mask for all the above bits + * IRQ_TYPE_PROBE - Special flag for probing in progress + * + * Bits which can be modified via irq_set/clear/modify_status_flags() + * IRQ_LEVEL - Interrupt is level type. Will be also + * updated in the code when the above trigger + * bits are modified via set_irq_type() + * IRQ_PER_CPU - Mark an interrupt PER_CPU. Will protect + * it from affinity setting + * IRQ_NOPROBE - Interrupt cannot be probed by autoprobing + * IRQ_NOREQUEST - Interrupt cannot be requested via + * request_irq() + * IRQ_NOAUTOEN - Interrupt is not automatically enabled in + * request/setup_irq() + * IRQ_NO_BALANCING - Interrupt cannot be balanced (affinity set) + * IRQ_MOVE_PCNTXT - Interrupt can be migrated from process context + * IRQ_NESTED_THREAD - Interrupt nests into another thread + * + * Deprecated bits. They are kept updated as long as + * CONFIG_GENERIC_HARDIRQS_NO_COMPAT is not set. Will go away soon. These bits + * are internal state of the core code and if you really need to access + * them then talk to the genirq maintainer instead of hacking + * something weird.
* - * IRQ types */ -#define IRQ_TYPE_NONE 0x00000000 /* Default, unspecified type */ -#define IRQ_TYPE_EDGE_RISING 0x00000001 /* Edge rising type */ -#define IRQ_TYPE_EDGE_FALLING 0x00000002 /* Edge falling type */ -#define IRQ_TYPE_EDGE_BOTH (IRQ_TYPE_EDGE_FALLING | IRQ_TYPE_EDGE_RISING) -#define IRQ_TYPE_LEVEL_HIGH 0x00000004 /* Level high type */ -#define IRQ_TYPE_LEVEL_LOW 0x00000008 /* Level low type */ -#define IRQ_TYPE_LEVEL_MASK (IRQ_TYPE_LEVEL_LOW | IRQ_TYPE_LEVEL_HIGH) -#define IRQ_TYPE_SENSE_MASK 0x0000000f /* Mask of the above */ +enum { + IRQ_TYPE_NONE = 0x00000000, + IRQ_TYPE_EDGE_RISING = 0x00000001, + IRQ_TYPE_EDGE_FALLING = 0x00000002, + IRQ_TYPE_EDGE_BOTH = (IRQ_TYPE_EDGE_FALLING | IRQ_TYPE_EDGE_RISING), + IRQ_TYPE_LEVEL_HIGH = 0x00000004, + IRQ_TYPE_LEVEL_LOW = 0x00000008, + IRQ_TYPE_LEVEL_MASK = (IRQ_TYPE_LEVEL_LOW | IRQ_TYPE_LEVEL_HIGH), + IRQ_TYPE_SENSE_MASK = 0x0000000f, -#define IRQ_TYPE_PROBE 0x00000010 /* Probing in progress */ + IRQ_TYPE_PROBE = 0x00000010, -/* Internal flags */ + IRQ_LEVEL = (1 << 8), + IRQ_PER_CPU = (1 << 9), + IRQ_NOPROBE = (1 << 10), + IRQ_NOREQUEST = (1 << 11), + IRQ_NOAUTOEN = (1 << 12), + IRQ_NO_BALANCING = (1 << 13), + IRQ_MOVE_PCNTXT = (1 << 14), + IRQ_NESTED_THREAD = (1 << 15), #ifndef CONFIG_GENERIC_HARDIRQS_NO_COMPAT -#define IRQ_INPROGRESS 0x00000100 /* DEPRECATED */ -#define IRQ_REPLAY 0x00000200 /* DEPRECATED */ -#define IRQ_WAITING 0x00000400 /* DEPRECATED */ -#define IRQ_DISABLED 0x00000800 /* DEPRECATED */ -#define IRQ_PENDING 0x00001000 /* DEPRECATED */ -#define IRQ_MASKED 0x00002000 /* DEPRECATED */ -/* DEPRECATED use irq_setaffinity_pending() instead*/ -#define IRQ_MOVE_PENDING 0x00004000 -#define IRQ_AFFINITY_SET 0x02000000 /* DEPRECATED */ + IRQ_INPROGRESS = (1 << 16), + IRQ_REPLAY = (1 << 17), + IRQ_WAITING = (1 << 18), + IRQ_DISABLED = (1 << 19), + IRQ_PENDING = (1 << 20), + IRQ_MASKED = (1 << 21), + IRQ_MOVE_PENDING = (1 << 22), + IRQ_AFFINITY_SET = (1 << 23), + IRQ_WAKEUP = (1 << 24), #endif - -#define IRQ_LEVEL 0x00008000 /* IRQ level triggered */ -#define IRQ_PER_CPU 0x00010000 /* IRQ is per CPU */ -#define IRQ_NOPROBE 0x00020000 /* IRQ is not valid for probing */ -#define IRQ_NOREQUEST 0x00040000 /* IRQ cannot be requested */ -#define IRQ_NOAUTOEN 0x00080000 /* IRQ will not be enabled on request irq */ -#define IRQ_WAKEUP 0x00100000 /* IRQ triggers system wakeup */ -#define IRQ_NO_BALANCING 0x00400000 /* IRQ is excluded from balancing */ -#define IRQ_MOVE_PCNTXT 0x01000000 /* IRQ migration from process context */ -#define IRQ_NESTED_THREAD 0x10000000 /* IRQ is nested into another, no own handler thread */ +}; #define IRQF_MODIFY_MASK \ (IRQ_TYPE_SENSE_MASK | IRQ_NOPROBE | IRQ_NOREQUEST | \ diff --git a/kernel/irq/settings.h b/kernel/irq/settings.h index 47bcd3b9f399..55ebe1e09da4 100644 --- a/kernel/irq/settings.h +++ b/kernel/irq/settings.h @@ -15,37 +15,21 @@ enum { _IRQF_MODIFY_MASK = IRQF_MODIFY_MASK, }; -#undef IRQ_INPROGRESS #define IRQ_INPROGRESS GOT_YOU_MORON -#undef IRQ_REPLAY #define IRQ_REPLAY GOT_YOU_MORON -#undef IRQ_WAITING #define IRQ_WAITING GOT_YOU_MORON -#undef IRQ_DISABLED #define IRQ_DISABLED GOT_YOU_MORON -#undef IRQ_PENDING #define IRQ_PENDING GOT_YOU_MORON -#undef IRQ_MASKED #define IRQ_MASKED GOT_YOU_MORON -#undef IRQ_WAKEUP #define IRQ_WAKEUP GOT_YOU_MORON -#undef IRQ_MOVE_PENDING #define IRQ_MOVE_PENDING GOT_YOU_MORON -#undef IRQ_PER_CPU #define IRQ_PER_CPU GOT_YOU_MORON -#undef IRQ_NO_BALANCING #define IRQ_NO_BALANCING GOT_YOU_MORON -#undef IRQ_AFFINITY_SET #define IRQ_AFFINITY_SET 
GOT_YOU_MORON -#undef IRQ_LEVEL #define IRQ_LEVEL GOT_YOU_MORON -#undef IRQ_NOPROBE #define IRQ_NOPROBE GOT_YOU_MORON -#undef IRQ_NOREQUEST #define IRQ_NOREQUEST GOT_YOU_MORON -#undef IRQ_NOAUTOEN #define IRQ_NOAUTOEN GOT_YOU_MORON -#undef IRQ_NESTED_THREAD #define IRQ_NESTED_THREAD GOT_YOU_MORON #undef IRQF_MODIFY_MASK #define IRQF_MODIFY_MASK GOT_YOU_MORON From 2bff17ad2107c66fc8ca96501a7128dd7fa7a390 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 10 Feb 2011 13:08:38 +0100 Subject: [PATCH 381/706] genirq: Add flags to irq_chip Looking through irq_chip implementations I noticed that some of them have special requirements, like setting the trigger type only while masked, and therefore fiddle with irq_desc->status. Add a flag field, so the core code can handle it. Signed-off-by: Thomas Gleixner --- include/linux/irq.h | 3 +++ 1 file changed, 3 insertions(+) diff --git a/include/linux/irq.h b/include/linux/irq.h index 2e3d1e5f0408..aefb30bbcf0e 100644 --- a/include/linux/irq.h +++ b/include/linux/irq.h @@ -253,6 +253,7 @@ static inline bool irqd_is_level_type(struct irq_data *d) * @irq_set_wake: enable/disable power-management wake-on of an IRQ * @irq_bus_lock: function to lock access to slow bus (i2c) chips * @irq_bus_sync_unlock:function to sync and unlock slow bus (i2c) chips + * @flags: chip specific flags * * @release: release function solely used by UML */ @@ -299,6 +300,8 @@ struct irq_chip { void (*irq_bus_lock)(struct irq_data *data); void (*irq_bus_sync_unlock)(struct irq_data *data); + unsigned long flags; + /* Currently used only by UML, might disappear one day.*/ #ifdef CONFIG_IRQ_RELEASE_METHOD void (*release)(unsigned int irq, void *dev_id); From d4d5e08960844a062da8387ee5f16ca7a33200d0 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 10 Feb 2011 13:16:14 +0100 Subject: [PATCH 382/706] genirq: Add IRQCHIP_SET_TYPE_MASKED flag irq_chips which need to be masked before the trigger type is changed should set this flag. The core then takes care of it, and the requirement for looking into desc->status in the chip goes away.
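A chip then simply declares the requirement (the foo_* callbacks are hypothetical):

	static struct irq_chip foo_chip = {
		.name		= "foo",
		.irq_mask	= foo_mask,
		.irq_unmask	= foo_unmask,
		.irq_set_type	= foo_set_type,
		/* Core masks the line around irq_set_type(), so the
		 * chip code no longer peeks at desc->status. */
		.flags		= IRQCHIP_SET_TYPE_MASKED,
	};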
Signed-off-by: Thomas Gleixner Cc: Linus Walleij Cc: Lars-Peter Clausen --- include/linux/irq.h | 9 +++++++++ kernel/irq/chip.c | 4 ++-- kernel/irq/internals.h | 2 ++ kernel/irq/manage.c | 16 +++++++++++++--- 4 files changed, 26 insertions(+), 5 deletions(-) diff --git a/include/linux/irq.h b/include/linux/irq.h index aefb30bbcf0e..ef6b66dc9d03 100644 --- a/include/linux/irq.h +++ b/include/linux/irq.h @@ -308,6 +308,15 @@ struct irq_chip { #endif }; +/* + * irq_chip specific flags + * + * IRQCHIP_SET_TYPE_MASKED: Mask before calling chip.irq_set_type() + */ +enum { + IRQCHIP_SET_TYPE_MASKED = (1 << 0), +}; + /* This include will go away once we isolated irq_desc usage to core code */ #include <linux/irqdesc.h> diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index 9e9220da4deb..4687457fe7f0 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -367,7 +367,7 @@ static inline void mask_ack_irq(struct irq_desc *desc) irq_state_set_masked(desc); } -static inline void mask_irq(struct irq_desc *desc) +void mask_irq(struct irq_desc *desc) { if (desc->irq_data.chip->irq_mask) { desc->irq_data.chip->irq_mask(&desc->irq_data); @@ -375,7 +375,7 @@ static inline void mask_irq(struct irq_desc *desc) } } -static inline void unmask_irq(struct irq_desc *desc) +void unmask_irq(struct irq_desc *desc) { if (desc->irq_data.chip->irq_unmask) { desc->irq_data.chip->irq_unmask(&desc->irq_data); diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h index 6776453c454c..1d500fbde0d4 100644 --- a/kernel/irq/internals.h +++ b/kernel/irq/internals.h @@ -84,6 +84,8 @@ extern int irq_startup(struct irq_desc *desc); extern void irq_shutdown(struct irq_desc *desc); extern void irq_enable(struct irq_desc *desc); extern void irq_disable(struct irq_desc *desc); +extern void mask_irq(struct irq_desc *desc); +extern void unmask_irq(struct irq_desc *desc); extern void init_kstat_irqs(struct irq_desc *desc, int node, int nr); diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index b5de828e58d9..50809c79c7ad 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -554,8 +554,8 @@ void compat_irq_chip_set_default_handler(struct irq_desc *desc) int __irq_set_trigger(struct irq_desc *desc, unsigned int irq, unsigned long flags) { - int ret; struct irq_chip *chip = desc->irq_data.chip; + int ret, unmask = 0; if (!chip || !chip->irq_set_type) { /* @@ -568,6 +568,14 @@ int __irq_set_trigger(struct irq_desc *desc, unsigned int irq, } flags &= IRQ_TYPE_SENSE_MASK; + + if (chip->flags & IRQCHIP_SET_TYPE_MASKED) { + if (!(desc->istate & IRQS_MASKED)) + mask_irq(desc); + if (!(desc->istate & IRQS_DISABLED)) + unmask = 1; + } + /* caller masked out all except trigger mode flags */ ret = chip->irq_set_type(&desc->irq_data, flags); @@ -588,11 +596,13 @@ int __irq_set_trigger(struct irq_desc *desc, unsigned int irq, if (chip != desc->irq_data.chip) irq_chip_set_defaults(desc->irq_data.chip); - return 0; + ret = 0; default: pr_err("setting trigger mode %lu for irq %u failed (%pF)\n", flags, irq, chip->irq_set_type); } + if (unmask) + unmask_irq(desc); return ret; } @@ -669,7 +679,7 @@ again: #ifdef CONFIG_SMP /* - * Check whether we need to change the affinity of the interrupt thread. + * Check whether we need to chasnge the affinity of the interrupt thread.
*/ static void irq_thread_check_affinity(struct irq_desc *desc, struct irqaction *action) From 7f94226f03299f1ca32f118f02f2a0295e0e5e93 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 10 Feb 2011 19:46:26 +0100 Subject: [PATCH 383/706] genirq: Move wakeup state to irq_data Some irq_chips need to know the state of wakeup mode for setting the trigger type etc. Reflect it in irq_data state. Signed-off-by: Thomas Gleixner --- include/linux/irq.h | 8 ++++++++ kernel/irq/internals.h | 2 -- kernel/irq/manage.c | 4 ++-- kernel/irq/pm.c | 2 +- 4 files changed, 11 insertions(+), 5 deletions(-) diff --git a/include/linux/irq.h b/include/linux/irq.h index ef6b66dc9d03..94c8f5bb548f 100644 --- a/include/linux/irq.h +++ b/include/linux/irq.h @@ -168,6 +168,8 @@ struct irq_data { * IRQD_PER_CPU - Interrupt is per cpu * IRQD_AFFINITY_SET - Interrupt affinity was set * IRQD_LEVEL - Interrupt is level triggered + * IRQD_WAKEUP_STATE - Interrupt is configured for wakeup + * from suspend */ enum { IRQD_TRIGGER_MASK = 0xf, @@ -176,6 +178,7 @@ enum { IRQD_PER_CPU = (1 << 11), IRQD_AFFINITY_SET = (1 << 12), IRQD_LEVEL = (1 << 13), + IRQD_WAKEUP_STATE = (1 << 14), }; static inline bool irqd_is_setaffinity_pending(struct irq_data *d) @@ -217,6 +220,11 @@ static inline bool irqd_is_level_type(struct irq_data *d) return d->state_use_accessors & IRQD_LEVEL; } +static inline bool irqd_is_wakeup_set(struct irq_data *d) +{ + return d->state_use_accessors & IRQD_WAKEUP_STATE; +} + /** * struct irq_chip - hardware interrupt chip descriptor * diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h index 1d500fbde0d4..5e2366da9f38 100644 --- a/kernel/irq/internals.h +++ b/kernel/irq/internals.h @@ -46,7 +46,6 @@ enum { * IRQS_PENDING - irq is pending and replayed later * IRQS_MASKED - irq is masked * IRQS_SUSPENDED - irq is suspended - * IRQS_WAKEUP - irq triggers system wakeup from suspend */ enum { IRQS_AUTODETECT = 0x00000001, @@ -60,7 +59,6 @@ enum { IRQS_PENDING = 0x00000200, IRQS_MASKED = 0x00000400, IRQS_SUSPENDED = 0x00000800, - IRQS_WAKEUP = 0x00001000, }; #include "compat.h" diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 50809c79c7ad..ea6add6036b1 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -492,7 +492,7 @@ int irq_set_irq_wake(unsigned int irq, unsigned int on) if (ret) desc->wake_depth = 0; else - desc->istate |= IRQS_WAKEUP; + irqd_set(&desc->irq_data, IRQD_WAKEUP_STATE); } } else { if (desc->wake_depth == 0) { @@ -502,7 +502,7 @@ int irq_set_irq_wake(unsigned int irq, unsigned int on) if (ret) desc->wake_depth = 1; else - desc->istate &= ~IRQS_WAKEUP; + irqd_clear(&desc->irq_data, IRQD_WAKEUP_STATE); } } diff --git a/kernel/irq/pm.c b/kernel/irq/pm.c index f39383d8672d..1329f0eff49e 100644 --- a/kernel/irq/pm.c +++ b/kernel/irq/pm.c @@ -69,7 +69,7 @@ int check_wakeup_irqs(void) int irq; for_each_irq_desc(irq, desc) - if ((desc->istate & IRQS_WAKEUP) && + if (irqd_is_wakeup_set(&desc->irq_data) && (desc->istate & IRQS_PENDING)) return -EBUSY; From e1ef824146131709d7466e37f889f2dab24ca98e Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 10 Feb 2011 22:25:31 +0100 Subject: [PATCH 384/706] genirq: Reflect IRQ_MOVE_PCNTXT in irq_data state Required by x86. 
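A simplified model of the decision this enables (foo_move_irq() is a hypothetical helper and assumes CONFIG_GENERIC_PENDING_IRQ; compare the affinity-setting paths in kernel/irq/manage.c):

	static int foo_move_irq(struct irq_desc *desc, const struct cpumask *mask)
	{
		struct irq_data *d = &desc->irq_data;

		/* Safe to retarget right here? */
		if (irqd_can_move_in_process_context(d))
			return d->chip->irq_set_affinity(d, mask, false);

		/* Otherwise defer: the move happens from the next
		 * interrupt, where the hardware state is sane. */
		irqd_set_move_pending(d);
		cpumask_copy(desc->pending_mask, mask);
		return 0;
	}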
Signed-off-by: Thomas Gleixner --- include/linux/irq.h | 8 ++++++++ kernel/irq/chip.c | 4 +++- 2 files changed, 11 insertions(+), 1 deletion(-) diff --git a/include/linux/irq.h b/include/linux/irq.h index 94c8f5bb548f..c101ad4b821f 100644 --- a/include/linux/irq.h +++ b/include/linux/irq.h @@ -170,6 +170,8 @@ struct irq_data { * IRQD_LEVEL - Interrupt is level triggered * IRQD_WAKEUP_STATE - Interrupt is configured for wakeup * from suspend + * IRQD_MOVE_PCNTXT - Interrupt can be moved in process + * context */ enum { IRQD_TRIGGER_MASK = 0xf, @@ -179,6 +181,7 @@ enum { IRQD_AFFINITY_SET = (1 << 12), IRQD_LEVEL = (1 << 13), IRQD_WAKEUP_STATE = (1 << 14), + IRQD_MOVE_PCNTXT = (1 << 15), }; static inline bool irqd_is_setaffinity_pending(struct irq_data *d) @@ -225,6 +228,11 @@ static inline bool irqd_is_wakeup_set(struct irq_data *d) return d->state_use_accessors & IRQD_WAKEUP_STATE; } +static inline bool irqd_can_move_in_process_context(struct irq_data *d) +{ + return d->state_use_accessors & IRQD_MOVE_PCNTXT; +} + /** * struct irq_chip - hardware interrupt chip descriptor * diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index 4687457fe7f0..2b0f9192a830 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -712,11 +712,13 @@ void irq_modify_status(unsigned int irq, unsigned long clr, unsigned long set) irq_settings_clr_and_set(desc, clr, set); irqd_clear(&desc->irq_data, IRQD_NO_BALANCING | IRQD_PER_CPU | - IRQD_TRIGGER_MASK | IRQD_LEVEL); + IRQD_TRIGGER_MASK | IRQD_LEVEL | IRQD_MOVE_PCNTXT); if (irq_settings_has_no_balance_set(desc)) irqd_set(&desc->irq_data, IRQD_NO_BALANCING); if (irq_settings_is_per_cpu(desc)) irqd_set(&desc->irq_data, IRQD_PER_CPU); + if (irq_settings_can_move_pcntxt(desc)) + irqd_set(&desc->irq_data, IRQD_MOVE_PCNTXT); irqd_set(&desc->irq_data, irq_settings_get_trigger_mask(desc)); From a6967caf00ebbb2d4acdebcb72a25f2e9ba43fd2 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 10 Feb 2011 22:01:25 +0100 Subject: [PATCH 385/706] genirq: Remove desc->status when GENERIC_HARDIRQS_NO_COMPAT=y If everything uses the right accessors, then enabling GENERIC_HARDIRQS_NO_COMPAT should just work. If not it will tell you. Don't be lazy and use the trick which I use in the core code! git grep status_use_accessors will unearth it in a split second. Offenders are tracked down and not slapped with stinking trouts. This time we use frozen shark for a better educational value.
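A minimal model of the trick referred to above (all names made up except the two field names): rename the field, alias the old name while the wrapper headers are compiled, then poison the alias once the wrappers are done.

	struct foo_desc {
		unsigned int status_use_accessors;	/* renamed field */
	};

	#define status status_use_accessors		/* core-only alias */

	static inline bool foo_is_level(struct foo_desc *d)
	{
		return d->status & 0x1;			/* expands to the new name */
	}

	#undef status					/* wrappers are done */
	#define status USE_THE_PROPER_WRAPPERS_YOU_MORON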
Signed-off-by: Thomas Gleixner --- include/linux/irqdesc.h | 6 ++++++ kernel/irq/internals.h | 4 ++++ kernel/irq/settings.h | 1 + 3 files changed, 11 insertions(+) diff --git a/include/linux/irqdesc.h b/include/linux/irqdesc.h index 581d9665fd38..36c95f08023d 100644 --- a/include/linux/irqdesc.h +++ b/include/linux/irqdesc.h @@ -64,7 +64,11 @@ struct irq_desc { unsigned int __percpu *kstat_irqs; irq_flow_handler_t handle_irq; struct irqaction *action; /* IRQ action list */ +#ifdef CONFIG_GENERIC_HARDIRQS_NO_COMPAT + unsigned int status_use_accessors; +#else unsigned int status; /* IRQ status */ +#endif unsigned int core_internal_state__do_not_mess_with_it; unsigned int depth; /* nested irq disables */ unsigned int wake_depth; /* nested wake enables */ @@ -164,6 +168,7 @@ static inline int irq_has_action(unsigned int irq) return desc->action != NULL; } +#ifndef CONFIG_GENERIC_HARDIRQS_NO_COMPAT static inline int irq_balancing_disabled(unsigned int irq) { struct irq_desc *desc; @@ -171,6 +176,7 @@ static inline int irq_balancing_disabled(unsigned int irq) desc = irq_to_desc(irq); return desc->status & IRQ_NO_BALANCING_MASK; } +#endif /* caller has locked the irq_desc and both params are valid */ static inline void __set_irq_handler_unlocked(int irq, diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h index 5e2366da9f38..fd5777ab2d34 100644 --- a/kernel/irq/internals.h +++ b/kernel/irq/internals.h @@ -15,6 +15,10 @@ #define istate core_internal_state__do_not_mess_with_it +#ifdef CONFIG_GENERIC_HARDIRQS_NO_COMPAT +# define status status_use_accessors +#endif + extern int noirqdebug; /* diff --git a/kernel/irq/settings.h b/kernel/irq/settings.h index 55ebe1e09da4..0227ad358272 100644 --- a/kernel/irq/settings.h +++ b/kernel/irq/settings.h @@ -134,4 +134,5 @@ static inline bool irq_settings_is_nested_thread(struct irq_desc *desc) } /* Nothing should touch desc->status from now on */ +#undef status #define status USE_THE_PROPER_WRAPPERS_YOU_MORON From 091738a266fc74329ae186f22ff2b3f01319112d Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 14 Feb 2011 20:16:43 +0100 Subject: [PATCH 386/706] genirq: Remove real old transition functions These transition helpers have been stale for years. Remove them. Signed-off-by: Thomas Gleixner --- kernel/irq/autoprobe.c | 6 ------ kernel/irq/chip.c | 16 ++++------------ kernel/irq/internals.h | 3 --- kernel/irq/manage.c | 14 +------------- 4 files changed, 5 insertions(+), 34 deletions(-) diff --git a/kernel/irq/autoprobe.c b/kernel/irq/autoprobe.c index c8bbc4fabaab..394784c57060 100644 --- a/kernel/irq/autoprobe.c +++ b/kernel/irq/autoprobe.c @@ -46,12 +46,6 @@ unsigned long probe_irq_on(void) for_each_irq_desc_reverse(i, desc) { raw_spin_lock_irq(&desc->lock); if (!desc->action && irq_settings_can_probe(desc)) { - /* - * An old-style architecture might still have - * the handle_bad_irq handler there: - */ - compat_irq_chip_set_default_handler(desc); - /* * Some chips need to know about probing in * progress: diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index 2b0f9192a830..c19c0b562c80 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -644,19 +644,11 @@ __set_irq_handler(unsigned int irq, irq_flow_handler_t handle, int is_chained, return; } - if (!handle) + if (!handle) { handle = handle_bad_irq; - else if (desc->irq_data.chip == &no_irq_chip) { printk(KERN_WARNING "Trying to install %sinterrupt handler " "for IRQ%d\n", is_chained ?
"chained " : "", irq); - /* - * Some ARM implementations install a handler for really dumb - * interrupt hardware without setting an irq_chip. This worked - * with the ARM no_irq_chip but the check in setup_irq would - * prevent us to setup the interrupt at all. Switch it to - * dummy_irq_chip for easy transition. - */ - desc->irq_data.chip = &dummy_irq_chip; + } else { + if (WARN_ON(desc->irq_data.chip == &no_irq_chip)) + return; } chip_bus_lock(desc); diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h index fd5777ab2d34..f80a77471617 100644 --- a/kernel/irq/internals.h +++ b/kernel/irq/internals.h @@ -74,9 +74,6 @@ enum { /* Set default functions for irq_chip structures: */ extern void irq_chip_set_defaults(struct irq_chip *chip); -/* Set default handler: */ -extern void compat_irq_chip_set_default_handler(struct irq_desc *desc); - extern int __irq_set_trigger(struct irq_desc *desc, unsigned int irq, unsigned long flags); extern void __disable_irq(struct irq_desc *desc, unsigned int irq, bool susp); diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index ea6add6036b1..99395a24f432 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -540,17 +540,6 @@ int can_request_irq(unsigned int irq, unsigned long irqflags) return !action; } -void compat_irq_chip_set_default_handler(struct irq_desc *desc) -{ - /* - * If the architecture still has not overriden - * the flow handler then zap the default. This - * should catch incorrect flow-type setting. - */ - if (desc->handle_irq == &handle_bad_irq) - desc->handle_irq = NULL; -} - int __irq_set_trigger(struct irq_desc *desc, unsigned int irq, unsigned long flags) { @@ -912,8 +901,7 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) if (ret) goto out_mask; - } else - compat_irq_chip_set_default_handler(desc); + } desc->istate &= ~(IRQS_AUTODETECT | IRQS_SPURIOUS_DISABLED | \ IRQS_INPROGRESS | IRQS_ONESHOT | \ From d5eb4ad2dfb2dfae43fd51bc8630b4fc3ef00e92 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sat, 12 Feb 2011 12:16:16 +0100 Subject: [PATCH 387/706] genirq: Implement irq_get/put_desc_[bus]locked/unlock() Most of the managing functions get the irq descriptor and lock it - either with or without buslock. Instead of open coding this over and over provide a common function to do that. 
Signed-off-by: Thomas Gleixner --- kernel/irq/internals.h | 28 ++++++++++++++++++++++++++++ kernel/irq/irqdesc.c | 20 ++++++++++++++++++++ 2 files changed, 48 insertions(+) diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h index f80a77471617..935bec4bfa87 100644 --- a/kernel/irq/internals.h +++ b/kernel/irq/internals.h @@ -126,6 +126,34 @@ static inline void chip_bus_sync_unlock(struct irq_desc *desc) desc->irq_data.chip->irq_bus_sync_unlock(&desc->irq_data); } +struct irq_desc * +__irq_get_desc_lock(unsigned int irq, unsigned long *flags, bool bus); +void __irq_put_desc_unlock(struct irq_desc *desc, unsigned long flags, bool bus); + +static inline struct irq_desc * +irq_get_desc_buslock(unsigned int irq, unsigned long *flags) +{ + return __irq_get_desc_lock(irq, flags, true); +} + +static inline void +irq_put_desc_busunlock(struct irq_desc *desc, unsigned long flags) +{ + __irq_put_desc_unlock(desc, flags, true); +} + +static inline struct irq_desc * +irq_get_desc_lock(unsigned int irq, unsigned long *flags) +{ + return __irq_get_desc_lock(irq, flags, false); +} + +static inline void +irq_put_desc_unlock(struct irq_desc *desc, unsigned long flags) +{ + __irq_put_desc_unlock(desc, flags, false); +} + /* * Manipulation functions for irq_data.state */ diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c index 3387fbd7f2f4..394ab6a6c62c 100644 --- a/kernel/irq/irqdesc.c +++ b/kernel/irq/irqdesc.c @@ -402,6 +402,26 @@ unsigned int irq_get_next_irq(unsigned int offset) return find_next_bit(allocated_irqs, nr_irqs, offset); } +struct irq_desc * +__irq_get_desc_lock(unsigned int irq, unsigned long *flags, bool bus) +{ + struct irq_desc *desc = irq_to_desc(irq); + + if (desc) { + if (bus) + chip_bus_lock(desc); + raw_spin_lock_irqsave(&desc->lock, *flags); + } + return desc; +} + +void __irq_put_desc_unlock(struct irq_desc *desc, unsigned long flags, bool bus) +{ + raw_spin_unlock_irqrestore(&desc->lock, flags); + if (bus) + chip_bus_sync_unlock(desc); +} + /** * dynamic_irq_cleanup - cleanup a dynamically allocated irq * @irq: irq number to initialize From 02725e7471b8dd58fa96f6604bdb5dde45405a2e Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sat, 12 Feb 2011 10:37:36 +0100 Subject: [PATCH 388/706] genirq: Use irq_get/put functions Convert the management functions to use the common irq_get/put function. 
Signed-off-by: Thomas Gleixner --- kernel/irq/chip.c | 93 +++++++++++++-------------------------------- kernel/irq/manage.c | 81 +++++++++++++++------------------------ 2 files changed, 57 insertions(+), 117 deletions(-) diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index c19c0b562c80..3ea6aecd99c0 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -25,22 +25,18 @@ */ int irq_set_chip(unsigned int irq, struct irq_chip *chip) { - struct irq_desc *desc = irq_to_desc(irq); unsigned long flags; + struct irq_desc *desc = irq_get_desc_lock(irq, &flags); - if (!desc) { - WARN(1, KERN_ERR "Trying to install chip for IRQ%d\n", irq); + if (!desc) return -EINVAL; - } if (!chip) chip = &no_irq_chip; - raw_spin_lock_irqsave(&desc->lock, flags); irq_chip_set_defaults(chip); desc->irq_data.chip = chip; - raw_spin_unlock_irqrestore(&desc->lock, flags); - + irq_put_desc_unlock(desc, flags); return 0; } EXPORT_SYMBOL(irq_set_chip); @@ -52,24 +48,17 @@ EXPORT_SYMBOL(irq_set_chip); */ int irq_set_irq_type(unsigned int irq, unsigned int type) { - struct irq_desc *desc = irq_to_desc(irq); unsigned long flags; - int ret = -ENXIO; + struct irq_desc *desc = irq_get_desc_buslock(irq, &flags); + int ret = 0; - if (!desc) { - printk(KERN_ERR "Trying to set irq type for IRQ%d\n", irq); - return -ENODEV; - } + if (!desc) + return -EINVAL; type &= IRQ_TYPE_SENSE_MASK; - if (type == IRQ_TYPE_NONE) - return 0; - - chip_bus_lock(desc); - raw_spin_lock_irqsave(&desc->lock, flags); - ret = __irq_set_trigger(desc, irq, type); - raw_spin_unlock_irqrestore(&desc->lock, flags); - chip_bus_sync_unlock(desc); + if (type != IRQ_TYPE_NONE) + ret = __irq_set_trigger(desc, irq, type); + irq_put_desc_busunlock(desc, flags); return ret; } EXPORT_SYMBOL(irq_set_irq_type); @@ -83,18 +72,13 @@ EXPORT_SYMBOL(irq_set_irq_type); */ int irq_set_handler_data(unsigned int irq, void *data) { - struct irq_desc *desc = irq_to_desc(irq); unsigned long flags; + struct irq_desc *desc = irq_get_desc_lock(irq, &flags); - if (!desc) { - printk(KERN_ERR - "Trying to install controller data for IRQ%d\n", irq); + if (!desc) return -EINVAL; - } - - raw_spin_lock_irqsave(&desc->lock, flags); desc->irq_data.handler_data = data; - raw_spin_unlock_irqrestore(&desc->lock, flags); + irq_put_desc_unlock(desc, flags); return 0; } EXPORT_SYMBOL(irq_set_handler_data); @@ -108,20 +92,15 @@ EXPORT_SYMBOL(irq_set_handler_data); */ int irq_set_msi_desc(unsigned int irq, struct msi_desc *entry) { - struct irq_desc *desc = irq_to_desc(irq); unsigned long flags; + struct irq_desc *desc = irq_get_desc_lock(irq, &flags); - if (!desc) { - printk(KERN_ERR - "Trying to install msi data for IRQ%d\n", irq); + if (!desc) return -EINVAL; - } - - raw_spin_lock_irqsave(&desc->lock, flags); desc->irq_data.msi_desc = entry; if (entry) entry->irq = irq; - raw_spin_unlock_irqrestore(&desc->lock, flags); + irq_put_desc_unlock(desc, flags); return 0; } @@ -134,24 +113,13 @@ int irq_set_msi_desc(unsigned int irq, struct msi_desc *entry) */ int irq_set_chip_data(unsigned int irq, void *data) { - struct irq_desc *desc = irq_to_desc(irq); unsigned long flags; + struct irq_desc *desc = irq_get_desc_lock(irq, &flags); - if (!desc) { - printk(KERN_ERR - "Trying to install chip data for IRQ%d\n", irq); + if (!desc) return -EINVAL; - } - - if (!desc->irq_data.chip) { - printk(KERN_ERR "BUG: bad set_irq_chip_data(IRQ#%d)\n", irq); - return -EINVAL; - } - - raw_spin_lock_irqsave(&desc->lock, flags); desc->irq_data.chip_data = data; - raw_spin_unlock_irqrestore(&desc->lock, flags); - + 
irq_put_desc_unlock(desc, flags); return 0; } EXPORT_SYMBOL(irq_set_chip_data); @@ -635,25 +603,19 @@ void __set_irq_handler(unsigned int irq, irq_flow_handler_t handle, int is_chained, const char *name) { - struct irq_desc *desc = irq_to_desc(irq); unsigned long flags; + struct irq_desc *desc = irq_get_desc_buslock(irq, &flags); - if (!desc) { - printk(KERN_ERR - "Trying to install type control for IRQ%d\n", irq); + if (!desc) return; - } if (!handle) { handle = handle_bad_irq; } else { if (WARN_ON(desc->irq_data.chip == &no_irq_chip)) - return; + goto out; } - chip_bus_lock(desc); - raw_spin_lock_irqsave(&desc->lock, flags); - /* Uninstall? */ if (handle == handle_bad_irq) { if (desc->irq_data.chip != &no_irq_chip) @@ -670,8 +632,8 @@ __set_irq_handler(unsigned int irq, irq_flow_handler_t handle, int is_chained, irq_settings_set_norequest(desc); irq_startup(desc); } - raw_spin_unlock_irqrestore(&desc->lock, flags); - chip_bus_sync_unlock(desc); +out: + irq_put_desc_busunlock(desc, flags); } EXPORT_SYMBOL_GPL(__set_irq_handler); @@ -693,14 +655,11 @@ set_irq_chip_and_handler_name(unsigned int irq, struct irq_chip *chip, void irq_modify_status(unsigned int irq, unsigned long clr, unsigned long set) { - struct irq_desc *desc = irq_to_desc(irq); unsigned long flags; + struct irq_desc *desc = irq_get_desc_lock(irq, &flags); if (!desc) return; - - raw_spin_lock_irqsave(&desc->lock, flags); - irq_settings_clr_and_set(desc, clr, set); irqd_clear(&desc->irq_data, IRQD_NO_BALANCING | IRQD_PER_CPU | @@ -714,5 +673,5 @@ void irq_modify_status(unsigned int irq, unsigned long clr, unsigned long set) irqd_set(&desc->irq_data, irq_settings_get_trigger_mask(desc)); - raw_spin_unlock_irqrestore(&desc->lock, flags); + irq_put_desc_unlock(desc, flags); } diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 99395a24f432..6cca1956c503 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -172,16 +172,13 @@ int irq_set_affinity(unsigned int irq, const struct cpumask *mask) int irq_set_affinity_hint(unsigned int irq, const struct cpumask *m) { - struct irq_desc *desc = irq_to_desc(irq); unsigned long flags; + struct irq_desc *desc = irq_get_desc_lock(irq, &flags); if (!desc) return -EINVAL; - - raw_spin_lock_irqsave(&desc->lock, flags); desc->affinity_hint = m; - raw_spin_unlock_irqrestore(&desc->lock, flags); - + irq_put_desc_unlock(desc, flags); return 0; } EXPORT_SYMBOL_GPL(irq_set_affinity_hint); @@ -336,6 +333,18 @@ void __disable_irq(struct irq_desc *desc, unsigned int irq, bool suspend) irq_disable(desc); } +static int __disable_irq_nosync(unsigned int irq) +{ + unsigned long flags; + struct irq_desc *desc = irq_get_desc_buslock(irq, &flags); + + if (!desc) + return -EINVAL; + __disable_irq(desc, irq, false); + irq_put_desc_busunlock(desc, flags); + return 0; +} + /** * disable_irq_nosync - disable an irq without waiting * @irq: Interrupt to disable @@ -349,17 +358,7 @@ void __disable_irq(struct irq_desc *desc, unsigned int irq, bool suspend) */ void disable_irq_nosync(unsigned int irq) { - struct irq_desc *desc = irq_to_desc(irq); - unsigned long flags; - - if (!desc) - return; - - chip_bus_lock(desc); - raw_spin_lock_irqsave(&desc->lock, flags); - __disable_irq(desc, irq, false); - raw_spin_unlock_irqrestore(&desc->lock, flags); - chip_bus_sync_unlock(desc); + __disable_irq_nosync(irq); } EXPORT_SYMBOL(disable_irq_nosync); @@ -377,13 +376,7 @@ EXPORT_SYMBOL(disable_irq_nosync); */ void disable_irq(unsigned int irq) { - struct irq_desc *desc = irq_to_desc(irq); - - if (!desc) - return; - 
- disable_irq_nosync(irq); - if (desc->action) + if (!__disable_irq_nosync(irq)) synchronize_irq(irq); } EXPORT_SYMBOL(disable_irq); @@ -434,21 +427,18 @@ void __enable_irq(struct irq_desc *desc, unsigned int irq, bool resume) */ void enable_irq(unsigned int irq) { - struct irq_desc *desc = irq_to_desc(irq); unsigned long flags; + struct irq_desc *desc = irq_get_desc_buslock(irq, &flags); if (!desc) return; - if (WARN(!desc->irq_data.chip, KERN_ERR "enable_irq before setup/request_irq: irq %u\n", irq)) - return; + goto out; - chip_bus_lock(desc); - raw_spin_lock_irqsave(&desc->lock, flags); __enable_irq(desc, irq, false); - raw_spin_unlock_irqrestore(&desc->lock, flags); - chip_bus_sync_unlock(desc); +out: + irq_put_desc_busunlock(desc, flags); } EXPORT_SYMBOL(enable_irq); @@ -477,15 +467,13 @@ static int set_irq_wake_real(unsigned int irq, unsigned int on) */ int irq_set_irq_wake(unsigned int irq, unsigned int on) { - struct irq_desc *desc = irq_to_desc(irq); unsigned long flags; + struct irq_desc *desc = irq_get_desc_buslock(irq, &flags); int ret = 0; /* wakeup-capable irqs can be shared between drivers that * don't need to have the same sleep mode behaviors. */ - chip_bus_lock(desc); - raw_spin_lock_irqsave(&desc->lock, flags); if (on) { if (desc->wake_depth++ == 0) { ret = set_irq_wake_real(irq, on); @@ -505,9 +493,7 @@ int irq_set_irq_wake(unsigned int irq, unsigned int on) irqd_clear(&desc->irq_data, IRQD_WAKEUP_STATE); } } - - raw_spin_unlock_irqrestore(&desc->lock, flags); - chip_bus_sync_unlock(desc); + irq_put_desc_busunlock(desc, flags); return ret; } EXPORT_SYMBOL(irq_set_irq_wake); @@ -519,25 +505,20 @@ EXPORT_SYMBOL(irq_set_irq_wake); */ int can_request_irq(unsigned int irq, unsigned long irqflags) { - struct irq_desc *desc = irq_to_desc(irq); - struct irqaction *action; unsigned long flags; + struct irq_desc *desc = irq_get_desc_lock(irq, &flags); + int canrequest = 0; if (!desc) return 0; - if (!irq_settings_can_request(desc)) - return 0; - - raw_spin_lock_irqsave(&desc->lock, flags); - action = desc->action; - if (action) - if (irqflags & action->flags & IRQF_SHARED) - action = NULL; - - raw_spin_unlock_irqrestore(&desc->lock, flags); - - return !action; + if (irq_settings_can_request(desc)) { + if (desc->action) + if (irqflags & desc->action->flags & IRQF_SHARED) + canrequest =1; + } + irq_put_desc_unlock(desc, flags); + return canrequest; } int __irq_set_trigger(struct irq_desc *desc, unsigned int irq, From 3836ca08aad4575c120ccf328652f3873eea9063 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 14 Feb 2011 20:09:19 +0100 Subject: [PATCH 389/706] genirq: Consolidate set_chip_handler functions No need to have separate functions if we have one plus inline wrappers. 
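[ Illustration, assuming a fictional gpio driver: call sites keep working unchanged, and the unnamed variant is now merely an inline wrapper expanding to the named one with name == NULL. The two calls are shown side by side only to contrast them; either one alone would be typical. ]

	static void gpio_setup_irq(unsigned int irq, struct irq_chip *chip)
	{
		/* the named variant is the one remaining out-of-line function */
		irq_set_chip_and_handler_name(irq, chip, handle_edge_irq, "edge");

		/* the unnamed variant now expands inline to the call above */
		irq_set_chip_and_handler(irq, chip, handle_edge_irq);
	}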
Signed-off-by: Thomas Gleixner --- include/linux/irq.h | 51 ++++++++++++++++++++++++++++++++++----------- kernel/irq/chip.c | 16 ++++---------- 2 files changed, 43 insertions(+), 24 deletions(-) diff --git a/include/linux/irq.h b/include/linux/irq.h index c101ad4b821f..3e29e2f42e04 100644 --- a/include/linux/irq.h +++ b/include/linux/irq.h @@ -398,23 +398,23 @@ extern struct irq_chip no_irq_chip; extern struct irq_chip dummy_irq_chip; extern void -set_irq_chip_and_handler(unsigned int irq, struct irq_chip *chip, - irq_flow_handler_t handle); -extern void -set_irq_chip_and_handler_name(unsigned int irq, struct irq_chip *chip, +irq_set_chip_and_handler_name(unsigned int irq, struct irq_chip *chip, irq_flow_handler_t handle, const char *name); +static inline void irq_set_chip_and_handler(unsigned int irq, struct irq_chip *chip, + irq_flow_handler_t handle) +{ + irq_set_chip_and_handler_name(irq, chip, handle, NULL); +} + extern void -__set_irq_handler(unsigned int irq, irq_flow_handler_t handle, int is_chained, +__irq_set_handler(unsigned int irq, irq_flow_handler_t handle, int is_chained, const char *name); -/* - * Set a highlevel flow handler for a given IRQ: - */ static inline void -set_irq_handler(unsigned int irq, irq_flow_handler_t handle) +irq_set_handler(unsigned int irq, irq_flow_handler_t handle) { - __set_irq_handler(irq, handle, 0, NULL); + __irq_set_handler(irq, handle, 0, NULL); } /* @@ -423,9 +423,9 @@ set_irq_handler(unsigned int irq, irq_flow_handler_t handle) * IRQ_NOREQUEST and IRQ_NOPROBE) */ static inline void -set_irq_chained_handler(unsigned int irq, irq_flow_handler_t handle) +irq_set_chained_handler(unsigned int irq, irq_flow_handler_t handle) { - __set_irq_handler(irq, handle, 1, NULL); + __irq_set_handler(irq, handle, 1, NULL); } void irq_modify_status(unsigned int irq, unsigned long clr, unsigned long set); @@ -579,6 +579,33 @@ static inline void set_irq_nested_thread(unsigned int irq, int nest) { irq_set_nested_thread(irq, nest); } +static inline void +set_irq_chip_and_handler_name(unsigned int irq, struct irq_chip *chip, + irq_flow_handler_t handle, const char *name) +{ + irq_set_chip_and_handler_name(irq, chip, handle, name); +} +static inline void +set_irq_chip_and_handler(unsigned int irq, struct irq_chip *chip, + irq_flow_handler_t handle) +{ + irq_set_chip_and_handler(irq, chip, handle); +} +static inline void +__set_irq_handler(unsigned int irq, irq_flow_handler_t handle, int is_chained, + const char *name) +{ + __irq_set_handler(irq, handle, is_chained, name); +} +static inline void set_irq_handler(unsigned int irq, irq_flow_handler_t handle) +{ + irq_set_handler(irq, handle); +} +static inline void +set_irq_chained_handler(unsigned int irq, irq_flow_handler_t handle) +{ + irq_set_chained_handler(irq, handle); +} #endif int irq_alloc_descs(int irq, unsigned int from, unsigned int cnt, int node); diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index 3ea6aecd99c0..2a36038b8f59 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -600,7 +600,7 @@ handle_percpu_irq(unsigned int irq, struct irq_desc *desc) } void -__set_irq_handler(unsigned int irq, irq_flow_handler_t handle, int is_chained, +__irq_set_handler(unsigned int irq, irq_flow_handler_t handle, int is_chained, const char *name) { unsigned long flags; @@ -635,22 +635,14 @@ __set_irq_handler(unsigned int irq, irq_flow_handler_t handle, int is_chained, out: irq_put_desc_busunlock(desc, flags); } -EXPORT_SYMBOL_GPL(__set_irq_handler); +EXPORT_SYMBOL_GPL(__irq_set_handler); void 
-set_irq_chip_and_handler(unsigned int irq, struct irq_chip *chip, - irq_flow_handler_t handle) -{ - irq_set_chip(irq, chip); - __set_irq_handler(irq, handle, 0, NULL); -} - -void -set_irq_chip_and_handler_name(unsigned int irq, struct irq_chip *chip, +irq_set_chip_and_handler_name(unsigned int irq, struct irq_chip *chip, irq_flow_handler_t handle, const char *name) { irq_set_chip(irq, chip); - __set_irq_handler(irq, handle, 0, name); + __irq_set_handler(irq, handle, 0, name); } void irq_modify_status(unsigned int irq, unsigned long clr, unsigned long set) From 781295762defc709a609efc01d8bb065276cd9a2 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 10 Feb 2011 15:14:20 +0100 Subject: [PATCH 390/706] genirq: Add preflow handler support sparc64 needs to call a preflow handler on certain interrupts before calling the action chain. Integrate it into handle_fasteoi_irq. Must be enabled via CONFIG_IRQ_PREFLOW_FASTEOI. No impact when disabled. Signed-off-by: Thomas Gleixner Cc: David S. Miller --- include/linux/irq.h | 3 ++- include/linux/irqdesc.h | 14 ++++++++++++++ kernel/irq/Kconfig | 3 +++ kernel/irq/chip.c | 11 +++++++++++ 4 files changed, 30 insertions(+), 1 deletion(-) diff --git a/include/linux/irq.h b/include/linux/irq.h index 3e29e2f42e04..36390970693c 100644 --- a/include/linux/irq.h +++ b/include/linux/irq.h @@ -29,9 +29,10 @@ #include struct irq_desc; +struct irq_data; typedef void (*irq_flow_handler_t)(unsigned int irq, struct irq_desc *desc); - +typedef void (*irq_preflow_handler_t)(struct irq_data *data); /* * IRQ line status. diff --git a/include/linux/irqdesc.h b/include/linux/irqdesc.h index 36c95f08023d..2f87d6441302 100644 --- a/include/linux/irqdesc.h +++ b/include/linux/irqdesc.h @@ -63,6 +63,9 @@ struct irq_desc { struct timer_rand_state *timer_rand_state; unsigned int __percpu *kstat_irqs; irq_flow_handler_t handle_irq; +#ifdef CONFIG_IRQ_PREFLOW_FASTEOI + irq_preflow_handler_t preflow_handler; +#endif struct irqaction *action; /* IRQ action list */ #ifdef CONFIG_GENERIC_HARDIRQS_NO_COMPAT unsigned int status_use_accessors; @@ -187,6 +190,17 @@ static inline void __set_irq_handler_unlocked(int irq, desc = irq_to_desc(irq); desc->handle_irq = handler; } + +#ifdef CONFIG_IRQ_PREFLOW_FASTEOI +static inline void +__irq_set_preflow_handler(unsigned int irq, irq_preflow_handler_t handler) +{ + struct irq_desc *desc; + + desc = irq_to_desc(irq); + desc->preflow_handler = handler; +} +#endif #endif #endif diff --git a/kernel/irq/Kconfig b/kernel/irq/Kconfig index 48ad25f5fa59..9149be729e45 100644 --- a/kernel/irq/Kconfig +++ b/kernel/irq/Kconfig @@ -35,6 +35,9 @@ config AUTO_IRQ_AFFINITY config HARDIRQS_SW_RESEND def_bool n +config IRQ_PREFLOW_FASTEOI + def_bool n + config SPARSE_IRQ bool "Support sparse irq numbering" depends on HAVE_SPARSE_IRQ diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index 2a36038b8f59..08be5d182be3 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -471,6 +471,16 @@ out_unlock: } EXPORT_SYMBOL_GPL(handle_level_irq); +#ifdef CONFIG_IRQ_PREFLOW_FASTEOI +static inline void preflow_handler(struct irq_desc *desc) +{ + if (desc->preflow_handler) + desc->preflow_handler(&desc->irq_data); +} +#else +static inline void preflow_handler(struct irq_desc *desc) { } +#endif + /** * handle_fasteoi_irq - irq handler for transparent controllers * @irq: the interrupt number @@ -503,6 +513,7 @@ handle_fasteoi_irq(unsigned int irq, struct irq_desc *desc) mask_irq(desc); goto out; } + preflow_handler(desc); handle_irq_event(desc); out: 
desc->irq_data.chip->irq_eoi(&desc->irq_data); From 77694b408abb8f92195ad5ed6ce5492f1d794c77 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 15 Feb 2011 10:33:57 +0100 Subject: [PATCH 391/706] genirq: Add fasteoi irq_chip quirk Some chips want irq_eoi() only called when an interrupt is actually handled. So they have checks for INPROGRESS and DISABLED in their irq_eoi callbacks. Add a chip flag which allows the generic code to handle this. No impact on the fastpath. Signed-off-by: Thomas Gleixner --- include/linux/irq.h | 4 +++- kernel/irq/chip.c | 9 ++++++++- 2 files changed, 11 insertions(+), 2 deletions(-) diff --git a/include/linux/irq.h b/include/linux/irq.h index 36390970693c..ea2970c294aa 100644 --- a/include/linux/irq.h +++ b/include/linux/irq.h @@ -328,10 +328,12 @@ struct irq_chip { /* * irq_chip specific flags * - * IRQCHIP_SET_TYPE_MASKED: Mask before calling chip.irq_set_type() + * IRQCHIP_SET_TYPE_MASKED: Mask before calling chip.irq_set_type() + * IRQCHIP_EOI_IF_HANDLED: Only issue irq_eoi() when irq was handled */ enum { IRQCHIP_SET_TYPE_MASKED = (1 << 0), + IRQCHIP_EOI_IF_HANDLED = (1 << 1), }; /* This include will go away once we isolated irq_desc usage to core code */ diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index 08be5d182be3..1d3e25e68b0c 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -515,9 +515,16 @@ handle_fasteoi_irq(unsigned int irq, struct irq_desc *desc) } preflow_handler(desc); handle_irq_event(desc); -out: + +out_eoi: desc->irq_data.chip->irq_eoi(&desc->irq_data); +out_unlock: raw_spin_unlock(&desc->lock); + return; +out: + if (!(desc->irq_data.chip->flags & IRQCHIP_EOI_IF_HANDLED)) + goto out_eoi; + goto out_unlock; } /** From a439520f8b18917b322f576be04c54aba84bb044 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 4 Feb 2011 18:46:16 +0100 Subject: [PATCH 392/706] genirq: Implement irq_data based move_*_irq() versions No need to look up the irq descriptor when calling from a chip callback function which has irq_data already handy. 
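[ Illustration: an irq_chip callback can now stay in irq_data terms end to end. my_chip_eoi() and my_eoi_reg are invented for this sketch; only irq_move_irq() is from this patch. ]

	static void my_chip_eoi(struct irq_data *data)
	{
		/* was: move_native_irq(data->irq), which redid the desc lookup */
		irq_move_irq(data);
		writel(1U << data->irq, my_eoi_reg);	/* hypothetical MMIO ack */
	}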
Signed-off-by: Thomas Gleixner --- include/linux/irq.h | 4 ++++ kernel/irq/migration.c | 28 +++++++++++++++++++--------- 2 files changed, 23 insertions(+), 9 deletions(-) diff --git a/include/linux/irq.h b/include/linux/irq.h index ea2970c294aa..ff62d0145b8f 100644 --- a/include/linux/irq.h +++ b/include/linux/irq.h @@ -363,9 +363,13 @@ extern void remove_irq(unsigned int irq, struct irqaction *act); #if defined(CONFIG_SMP) && defined(CONFIG_GENERIC_PENDING_IRQ) void move_native_irq(int irq); void move_masked_irq(int irq); +void irq_move_irq(struct irq_data *data); +void irq_move_masked_irq(struct irq_data *data); #else static inline void move_native_irq(int irq) { } static inline void move_masked_irq(int irq) { } +static inline void irq_move_irq(struct irq_data *data) { } +static inline void irq_move_masked_irq(struct irq_data *data) { } #endif extern int no_irq_affinity; diff --git a/kernel/irq/migration.c b/kernel/irq/migration.c index 7a93c6b88b25..ec4806d4778b 100644 --- a/kernel/irq/migration.c +++ b/kernel/irq/migration.c @@ -4,10 +4,10 @@ #include "internals.h" -void move_masked_irq(int irq) +void irq_move_masked_irq(struct irq_data *idata) { - struct irq_desc *desc = irq_to_desc(irq); - struct irq_chip *chip = desc->irq_data.chip; + struct irq_desc *desc = irq_data_to_desc(idata); + struct irq_chip *chip = idata->chip; if (likely(!irqd_is_setaffinity_pending(&desc->irq_data))) return; @@ -53,12 +53,17 @@ void move_masked_irq(int irq) cpumask_clear(desc->pending_mask); } -void move_native_irq(int irq) +void move_masked_irq(int irq) { - struct irq_desc *desc = irq_to_desc(irq); + irq_move_masked_irq(irq_get_irq_data(irq)); +} + +void irq_move_irq(struct irq_data *idata) +{ + struct irq_desc *desc = irq_data_to_desc(idata); bool masked; - if (likely(!irqd_is_setaffinity_pending(&desc->irq_data))) + if (likely(!irqd_is_setaffinity_pending(idata))) return; if (unlikely(desc->istate & IRQS_DISABLED)) @@ -71,8 +76,13 @@ void move_native_irq(int irq) */ masked = desc->istate & IRQS_MASKED; if (!masked) - desc->irq_data.chip->irq_mask(&desc->irq_data); - move_masked_irq(irq); + idata->chip->irq_mask(idata); + irq_move_masked_irq(idata); if (!masked) - desc->irq_data.chip->irq_unmask(&desc->irq_data); + idata->chip->irq_unmask(idata); +} + +void move_native_irq(int irq) +{ + irq_move_irq(irq_get_irq_data(irq)); } From 2b15cd96e5e93f9aa4f7041c91c1e7c344a62cbb Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Fri, 18 Feb 2011 16:47:36 +0100 Subject: [PATCH 393/706] x86, system.h: Drop unused __SAVE/__RESTORE macros Those are unused since at least the beginning of git history. 
Signed-off-by: Borislav Petkov LKML-Reference: <1298044056-31104-1-git-send-email-bp@amd64.org> Signed-off-by: Ingo Molnar --- arch/x86/include/asm/system.h | 2 -- 1 file changed, 2 deletions(-) diff --git a/arch/x86/include/asm/system.h b/arch/x86/include/asm/system.h index 33ecc3ea8782..12569e691ce3 100644 --- a/arch/x86/include/asm/system.h +++ b/arch/x86/include/asm/system.h @@ -98,8 +98,6 @@ do { \ */ #define HAVE_DISABLE_HLT #else -#define __SAVE(reg, offset) "movq %%" #reg ",(14-" #offset ")*8(%%rsp)\n\t" -#define __RESTORE(reg, offset) "movq (14-" #offset ")*8(%%rsp),%%" #reg "\n\t" /* frame pointer must be last for get_wchan */ #define SAVE_CONTEXT "pushf ; pushq %%rbp ; movq %%rsi,%%rbp\n\t" From 1396fa9cd2e34669253b7ca8c75f12103481f71c Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Fri, 18 Feb 2011 12:17:16 +0300 Subject: [PATCH 394/706] x86, microcode, AMD: Fix signedness bug in generic_load_microcode() install_equiv_cpu_table() returns type int. It uses negative error codes so using an unsigned type breaks the error handling. Signed-off-by: Dan Carpenter Acked-by: Borislav Petkov Cc: open list:AMD MICROCODE UPD... Cc: Andreas Herrmann LKML-Reference: <20110218091716.GA4384@bicker> Signed-off-by: Ingo Molnar --- arch/x86/kernel/microcode_amd.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kernel/microcode_amd.c b/arch/x86/kernel/microcode_amd.c index 9fb8405451c5..c5610384ab16 100644 --- a/arch/x86/kernel/microcode_amd.c +++ b/arch/x86/kernel/microcode_amd.c @@ -246,7 +246,7 @@ generic_load_microcode(int cpu, const u8 *data, size_t size) struct ucode_cpu_info *uci = ucode_cpu_info + cpu; struct microcode_header_amd *mc_hdr = NULL; unsigned int mc_size, leftover; - unsigned long offset; + int offset; const u8 *ucode_ptr = data; void *new_mc = NULL; unsigned int new_rev = uci->cpu_sig.rev; From 69efcc6d90d234a3a076afb2c635c1609536faa4 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Mon, 21 Feb 2011 10:58:13 +0100 Subject: [PATCH 395/706] x86-64, NUMA: Do not scan two times for setup_node_bootmem() By the time setup_node_bootmem() is called, all the memblocks are already registered. As node_data is allocated from these memblocks, calling it more than once doesn't make any difference. Drop the loop. tj: Dropped comment referencing to the old behavior as suggested by David and rephrased the description. Signed-off-by: Yinghai Lu Acked-by: David Rientjes Signed-off-by: Tejun Heo --- arch/x86/mm/numa_64.c | 32 ++++++++++++-------------------- 1 file changed, 12 insertions(+), 20 deletions(-) diff --git a/arch/x86/mm/numa_64.c b/arch/x86/mm/numa_64.c index f6d85e380471..6e4ee96d1b11 100644 --- a/arch/x86/mm/numa_64.c +++ b/arch/x86/mm/numa_64.c @@ -480,7 +480,7 @@ static bool __init numa_meminfo_cover_memory(const struct numa_meminfo *mi) static int __init numa_register_memblks(struct numa_meminfo *mi) { - int i, j, nid; + int i, nid; /* Account for nodes with cpus and no memory */ node_possible_map = numa_nodes_parsed; @@ -506,28 +506,20 @@ static int __init numa_register_memblks(struct numa_meminfo *mi) init_memory_mapping_high(); - /* - * Finally register nodes. Do it twice in case setup_node_bootmem - * missed one due to missing bootmem. - */ - for (i = 0; i < 2; i++) { - for_each_node_mask(nid, node_possible_map) { - u64 start = (u64)max_pfn << PAGE_SHIFT; - u64 end = 0; + /* Finally register nodes. 
*/ + for_each_node_mask(nid, node_possible_map) { + u64 start = (u64)max_pfn << PAGE_SHIFT; + u64 end = 0; - if (node_online(nid)) + for (i = 0; i < mi->nr_blks; i++) { + if (nid != mi->blk[i].nid) continue; - - for (j = 0; j < mi->nr_blks; j++) { - if (nid != mi->blk[j].nid) - continue; - start = min(mi->blk[j].start, start); - end = max(mi->blk[j].end, end); - } - - if (start < end) - setup_node_bootmem(nid, start, end); + start = min(mi->blk[i].start, start); + end = max(mi->blk[i].end, end); } + + if (start < end) + setup_node_bootmem(nid, start, end); } return 0; From a61d825808a0ce9935afebc225dcd602d5339e14 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 21 Feb 2011 12:54:34 +0100 Subject: [PATCH 396/706] genirq: Fix misplaced status update in irq_disable() We lazily disable interrupt lines, so only mark the line masked when the chip provides an irq_disable callback. Signed-off-by: Thomas Gleixner --- kernel/irq/chip.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index 1d3e25e68b0c..b5145654855f 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -199,8 +199,8 @@ void irq_disable(struct irq_desc *desc) irq_state_set_disabled(desc); if (desc->irq_data.chip->irq_disable) { desc->irq_data.chip->irq_disable(&desc->irq_data); + irq_state_set_masked(desc); } - irq_state_set_masked(desc); } #ifndef CONFIG_GENERIC_HARDIRQS_NO_DEPRECATED From ed4dea6e0e33a3e58d8b77b775a8f0e433e7a005 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Sat, 19 Feb 2011 11:07:37 -0800 Subject: [PATCH 397/706] genirq: Use IRQ_BITMAP_BITS as search size in irq_alloc_descs() The runtime expansion of nr_irqs does not take into account that bitmap_find_next_zero_area() returns "start" + size in case the search for a matching zero area fails. That results in a start value which can be completely off, is not covered by the following expand_nr_irqs() and is possibly outside of the absolute limit. But we use it without further checking. Use IRQ_BITMAP_BITS as the limit for the bitmap search and expand nr_irqs when the start bit is beyond nr_irqs. So start always points to the correct area in the bitmap. nr_irqs is just the limit for irq enumerations, not the real limit for the irq space. [ tglx: Let irq_expand_nr_irqs() take the new upper end so we do not expand nr_irqs more than necessary. 
Made changelog readable ] Signed-off-by: Yinghai Lu LKML-Reference: <4D6014F9.8040605@kernel.org> Signed-off-by: Thomas Gleixner --- kernel/irq/irqdesc.c | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c index 394ab6a6c62c..dbccc799407f 100644 --- a/kernel/irq/irqdesc.c +++ b/kernel/irq/irqdesc.c @@ -207,11 +207,11 @@ struct irq_desc * __ref irq_to_desc_alloc_node(unsigned int irq, int node) return NULL; } -static int irq_expand_nr_irqs(unsigned int cnt) +static int irq_expand_nr_irqs(unsigned int nr) { - if (nr_irqs + cnt > IRQ_BITMAP_BITS) + if (nr > IRQ_BITMAP_BITS) return -ENOMEM; - nr_irqs += cnt; + nr_irqs = nr; return 0; } @@ -298,7 +298,7 @@ static inline int alloc_descs(unsigned int start, unsigned int cnt, int node) return start; } -static int irq_expand_nr_irqs(unsigned int cnt) +static int irq_expand_nr_irqs(unsigned int nr) { return -ENOMEM; } @@ -346,13 +346,14 @@ irq_alloc_descs(int irq, unsigned int from, unsigned int cnt, int node) mutex_lock(&sparse_irq_lock); - start = bitmap_find_next_zero_area(allocated_irqs, nr_irqs, from, cnt, 0); + start = bitmap_find_next_zero_area(allocated_irqs, IRQ_BITMAP_BITS, + from, cnt, 0); ret = -EEXIST; if (irq >=0 && start != irq) goto err; - if (start >= nr_irqs) { - ret = irq_expand_nr_irqs(cnt); + if (start + cnt > nr_irqs) { + ret = irq_expand_nr_irqs(start + cnt); if (ret) goto err; } From 8fff39e06987492da3d4a0b9ec7cdbd245b6762b Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 21 Feb 2011 14:19:42 +0100 Subject: [PATCH 398/706] genirq: Add missing break in __irq_set_trigger() The switch case in __irq_set_trigger() lacks a break, which emits a pr_err unconditionally on success. Reported-by: Lars-Peter Clausen Signed-off-by: Thomas Gleixner --- kernel/irq/manage.c | 1 + 1 file changed, 1 insertion(+) diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 6cca1956c503..01f8a9519e63 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -567,6 +567,7 @@ int __irq_set_trigger(struct irq_desc *desc, unsigned int irq, if (chip != desc->irq_data.chip) irq_chip_set_defaults(desc->irq_data.chip); ret = 0; + break; default: pr_err("setting trigger mode %lu for irq %u failed (%pF)\n", flags, irq, chip->irq_set_type); From e06383db9ec591696a06654257474b85bac1f8cb Mon Sep 17 00:00:00 2001 From: John Stultz Date: Tue, 14 Dec 2010 19:37:07 -0800 Subject: [PATCH 399/706] hrtimers: extend hrtimer base code to handle more than 2 clockids MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The hrtimer code is written mainly with CLOCK_REALTIME and CLOCK_MONOTONIC in mind. These are clockids 0 and 1 respectively. However, if we are to introduce any new hrtimer bases, using new clockids, we have to skip the cputimers (clockids 2,3) as well as other clockids that may not implement timers. This patch adds a little bit of indirection between the clockid and the base, so that we can extend the base by one when we add a new clockid at number 7 or so. 
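[ Illustration: with the table in place, a later clockid slots in with one enum entry and one table assignment. CLOCK_NEWCLOCK and HRTIMER_BASE_NEWCLOCK are placeholders, not identifiers from this series; the two real statements mirror lines in the diff below. ]

	/* boot time: one table assignment maps the posix clockid to a base */
	hrtimer_clock_to_base_table[CLOCK_NEWCLOCK] = HRTIMER_BASE_NEWCLOCK;

	/* use time: clockids no longer index clock_base[] directly */
	int base = hrtimer_clockid_to_base(clock_id);
	timer->base = &cpu_base->clock_base[base];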
CC: Jamie Lokier CC: Thomas Gleixner CC: Alexander Shishkin CC: Arve Hjønnevåg Signed-off-by: John Stultz --- include/linux/hrtimer.h | 6 +++++- kernel/hrtimer.c | 40 +++++++++++++++++++++++++++------------- 2 files changed, 32 insertions(+), 14 deletions(-) diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h index f376ddc64c4d..20b8e6601a04 100644 --- a/include/linux/hrtimer.h +++ b/include/linux/hrtimer.h @@ -148,7 +148,11 @@ struct hrtimer_clock_base { #endif }; -#define HRTIMER_MAX_CLOCK_BASES 2 +enum hrtimer_base_type { + HRTIMER_BASE_REALTIME, + HRTIMER_BASE_MONOTONIC, + HRTIMER_MAX_CLOCK_BASES, +}; /* * struct hrtimer_cpu_base - the per cpu clock bases diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c index 57c4d33c9a9d..ca99e2454600 100644 --- a/kernel/hrtimer.c +++ b/kernel/hrtimer.c @@ -53,11 +53,10 @@ /* * The timer bases: * - * Note: If we want to add new timer bases, we have to skip the two - * clock ids captured by the cpu-timers. We do this by holding empty - * entries rather than doing math adjustment of the clock ids. - * This ensures that we capture erroneous accesses to these clock ids - * rather than moving them into the range of valid clock id's. + * There are more clockids then hrtimer bases. Thus, we index + * into the timer bases by the hrtimer_base_type enum. When trying + * to reach a base using a clockid, hrtimer_clockid_to_base() + * is used to convert from clockid to the proper hrtimer_base_type. */ DEFINE_PER_CPU(struct hrtimer_cpu_base, hrtimer_bases) = { @@ -77,6 +76,14 @@ DEFINE_PER_CPU(struct hrtimer_cpu_base, hrtimer_bases) = } }; +static int hrtimer_clock_to_base_table[MAX_CLOCKS]; + +static inline int hrtimer_clockid_to_base(clockid_t clock_id) +{ + return hrtimer_clock_to_base_table[clock_id]; +} + + /* * Get the coarse grained time at the softirq based on xtime and * wall_to_monotonic. 
@@ -90,8 +97,8 @@ static void hrtimer_get_softirq_time(struct hrtimer_cpu_base *base) xtim = timespec_to_ktime(xts); tomono = timespec_to_ktime(tom); - base->clock_base[CLOCK_REALTIME].softirq_time = xtim; - base->clock_base[CLOCK_MONOTONIC].softirq_time = + base->clock_base[HRTIMER_BASE_REALTIME].softirq_time = xtim; + base->clock_base[HRTIMER_BASE_MONOTONIC].softirq_time = ktime_add(xtim, tomono); } @@ -179,10 +186,11 @@ switch_hrtimer_base(struct hrtimer *timer, struct hrtimer_clock_base *base, struct hrtimer_cpu_base *new_cpu_base; int this_cpu = smp_processor_id(); int cpu = hrtimer_get_target(this_cpu, pinned); + int basenum = hrtimer_clockid_to_base(base->index); again: new_cpu_base = &per_cpu(hrtimer_bases, cpu); - new_base = &new_cpu_base->clock_base[base->index]; + new_base = &new_cpu_base->clock_base[basenum]; if (base != new_base) { /* @@ -618,7 +626,7 @@ static void retrigger_next_event(void *arg) /* Adjust CLOCK_REALTIME offset */ raw_spin_lock(&base->lock); - base->clock_base[CLOCK_REALTIME].offset = + base->clock_base[HRTIMER_BASE_REALTIME].offset = timespec_to_ktime(realtime_offset); hrtimer_force_reprogram(base, 0); @@ -716,8 +724,8 @@ static int hrtimer_switch_to_hres(void) return 0; } base->hres_active = 1; - base->clock_base[CLOCK_REALTIME].resolution = KTIME_HIGH_RES; - base->clock_base[CLOCK_MONOTONIC].resolution = KTIME_HIGH_RES; + base->clock_base[HRTIMER_BASE_REALTIME].resolution = KTIME_HIGH_RES; + base->clock_base[HRTIMER_BASE_MONOTONIC].resolution = KTIME_HIGH_RES; tick_setup_sched_timer(); @@ -1112,6 +1120,7 @@ static void __hrtimer_init(struct hrtimer *timer, clockid_t clock_id, enum hrtimer_mode mode) { struct hrtimer_cpu_base *cpu_base; + int base; memset(timer, 0, sizeof(struct hrtimer)); @@ -1120,7 +1129,8 @@ static void __hrtimer_init(struct hrtimer *timer, clockid_t clock_id, if (clock_id == CLOCK_REALTIME && mode != HRTIMER_MODE_ABS) clock_id = CLOCK_MONOTONIC; - timer->base = &cpu_base->clock_base[clock_id]; + base = hrtimer_clockid_to_base(clock_id); + timer->base = &cpu_base->clock_base[base]; hrtimer_init_timer_hres(timer); timerqueue_init(&timer->node); @@ -1156,9 +1166,10 @@ EXPORT_SYMBOL_GPL(hrtimer_init); int hrtimer_get_res(const clockid_t which_clock, struct timespec *tp) { struct hrtimer_cpu_base *cpu_base; + int base = hrtimer_clockid_to_base(which_clock); cpu_base = &__raw_get_cpu_var(hrtimer_bases); - *tp = ktime_to_timespec(cpu_base->clock_base[which_clock].resolution); + *tp = ktime_to_timespec(cpu_base->clock_base[base].resolution); return 0; } @@ -1705,6 +1716,9 @@ static struct notifier_block __cpuinitdata hrtimers_nb = { void __init hrtimers_init(void) { + hrtimer_clock_to_base_table[CLOCK_REALTIME] = HRTIMER_BASE_REALTIME; + hrtimer_clock_to_base_table[CLOCK_MONOTONIC] = HRTIMER_BASE_MONOTONIC; + hrtimer_cpu_notify(&hrtimers_nb, (unsigned long)CPU_UP_PREPARE, (void *)(long)smp_processor_id()); register_cpu_notifier(&hrtimers_nb); From abb3a4ea2e0ea7114a4475745da2f32bd9ad5b73 Mon Sep 17 00:00:00 2001 From: John Stultz Date: Mon, 14 Feb 2011 17:52:09 -0800 Subject: [PATCH 400/706] time: Introduce get_monotonic_boottime and ktime_get_boottime MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This adds new functions that return the monotonic time since boot (in other words, CLOCK_MONOTONIC + suspend time). 
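[ Illustration of a kernel-side consumer; my_show_uptime() is invented, the two helpers are the ones added below. ]

	static void my_show_uptime(void)
	{
		struct timespec ts;
		ktime_t kt;

		get_monotonic_boottime(&ts);	/* CLOCK_MONOTONIC + time in suspend */
		kt = ktime_get_boottime();	/* the same quantity as a ktime_t */
		pr_info("up %ld.%09ld s (%lld ns)\n",
			(long)ts.tv_sec, ts.tv_nsec, (long long)ktime_to_ns(kt));
	}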
CC: Jamie Lokier CC: Thomas Gleixner CC: Alexander Shishkin CC: Arve Hjønnevåg Signed-off-by: John Stultz --- include/linux/hrtimer.h | 1 + include/linux/time.h | 1 + kernel/time/timekeeping.c | 51 ++++++++++++++++++++++++++++++++++++++- 3 files changed, 52 insertions(+), 1 deletion(-) diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h index 20b8e6601a04..7a9e7ee0f35c 100644 --- a/include/linux/hrtimer.h +++ b/include/linux/hrtimer.h @@ -312,6 +312,7 @@ static inline int hrtimer_is_hres_active(struct hrtimer *timer) extern ktime_t ktime_get(void); extern ktime_t ktime_get_real(void); +extern ktime_t ktime_get_boottime(void); DECLARE_PER_CPU(struct tick_device, tick_cpu_device); diff --git a/include/linux/time.h b/include/linux/time.h index 379b9037b5b4..fa39150cb816 100644 --- a/include/linux/time.h +++ b/include/linux/time.h @@ -161,6 +161,7 @@ extern void getnstime_raw_and_real(struct timespec *ts_raw, struct timespec *ts_real); extern void getboottime(struct timespec *ts); extern void monotonic_to_bootbased(struct timespec *ts); +extern void get_monotonic_boottime(struct timespec *ts); extern struct timespec timespec_trunc(struct timespec t, unsigned gran); extern int timekeeping_valid_for_hres(void); diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 6262c1d18397..5fbd9aa7df95 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -907,7 +907,7 @@ static void update_wall_time(void) * getboottime - Return the real time of system boot. * @ts: pointer to the timespec to be set * - * Returns the time of day in a timespec. + * Returns the wall-time of boot in a timespec. * * This is based on the wall_to_monotonic offset and the total suspend * time. Calls to settimeofday will affect the value returned (which @@ -925,6 +925,55 @@ void getboottime(struct timespec *ts) } EXPORT_SYMBOL_GPL(getboottime); + +/** + * get_monotonic_boottime - Returns monotonic time since boot + * @ts: pointer to the timespec to be set + * + * Returns the monotonic time since boot in a timespec. + * + * This is similar to CLOCK_MONTONIC/ktime_get_ts, but also + * includes the time spent in suspend. + */ +void get_monotonic_boottime(struct timespec *ts) +{ + struct timespec tomono, sleep; + unsigned int seq; + s64 nsecs; + + WARN_ON(timekeeping_suspended); + + do { + seq = read_seqbegin(&xtime_lock); + *ts = xtime; + tomono = wall_to_monotonic; + sleep = total_sleep_time; + nsecs = timekeeping_get_ns(); + + } while (read_seqretry(&xtime_lock, seq)); + + set_normalized_timespec(ts, ts->tv_sec + tomono.tv_sec + sleep.tv_sec, + ts->tv_nsec + tomono.tv_nsec + sleep.tv_nsec + nsecs); +} +EXPORT_SYMBOL_GPL(get_monotonic_boottime); + +/** + * ktime_get_boottime - Returns monotonic time since boot in a ktime + * + * Returns the monotonic time since boot in a ktime + * + * This is similar to CLOCK_MONTONIC/ktime_get, but also + * includes the time spent in suspend. + */ +ktime_t ktime_get_boottime(void) +{ + struct timespec ts; + + get_monotonic_boottime(&ts); + return timespec_to_ktime(ts); +} +EXPORT_SYMBOL_GPL(ktime_get_boottime); + /** * monotonic_to_bootbased - Convert the monotonic time to boot based. 
* @ts: pointer to the timespec to be converted From 314ac37150011ebb398f522db528d2dbcc611189 Mon Sep 17 00:00:00 2001 From: John Stultz Date: Mon, 14 Feb 2011 18:43:08 -0800 Subject: [PATCH 401/706] time: Extend get_xtime_and_monotonic_offset() to also return sleep MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Extend get_xtime_and_monotonic_offset to get_xtime_and_monotonic_and_sleep_offset(). CC: Jamie Lokier CC: Thomas Gleixner CC: Alexander Shishkin CC: Arve Hjønnevåg Signed-off-by: John Stultz --- include/linux/time.h | 3 ++- kernel/hrtimer.c | 9 +++++---- kernel/time/timekeeping.c | 8 ++++++-- 3 files changed, 13 insertions(+), 7 deletions(-) diff --git a/include/linux/time.h b/include/linux/time.h index fa39150cb816..02d48fb30b41 100644 --- a/include/linux/time.h +++ b/include/linux/time.h @@ -124,7 +124,8 @@ unsigned long get_seconds(void); struct timespec current_kernel_time(void); struct timespec __current_kernel_time(void); /* does not take xtime_lock */ struct timespec get_monotonic_coarse(void); -void get_xtime_and_monotonic_offset(struct timespec *xtim, struct timespec *wtom); +void get_xtime_and_monotonic_and_sleep_offset(struct timespec *xtim, + struct timespec *wtom, struct timespec *sleep); #define CURRENT_TIME (current_kernel_time()) #define CURRENT_TIME_SEC ((struct timespec) { get_seconds(), 0 }) diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c index ca99e2454600..e8bf3ad99063 100644 --- a/kernel/hrtimer.c +++ b/kernel/hrtimer.c @@ -91,9 +91,9 @@ static inline int hrtimer_clockid_to_base(clockid_t clock_id) static void hrtimer_get_softirq_time(struct hrtimer_cpu_base *base) { ktime_t xtim, tomono; - struct timespec xts, tom; + struct timespec xts, tom, slp; - get_xtime_and_monotonic_offset(&xts, &tom); + get_xtime_and_monotonic_and_sleep_offset(&xts, &tom, &slp); xtim = timespec_to_ktime(xts); tomono = timespec_to_ktime(tom); @@ -614,12 +614,13 @@ static int hrtimer_reprogram(struct hrtimer *timer, static void retrigger_next_event(void *arg) { struct hrtimer_cpu_base *base; - struct timespec realtime_offset, wtm; + struct timespec realtime_offset, wtm, sleep; if (!hrtimer_hres_active()) return; - get_xtime_and_monotonic_offset(&realtime_offset, &wtm); + get_xtime_and_monotonic_and_sleep_offset(&realtime_offset, &wtm, + &sleep); set_normalized_timespec(&realtime_offset, -wtm.tv_sec, -wtm.tv_nsec); base = &__get_cpu_var(hrtimer_bases); diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 5fbd9aa7df95..3bd7e3d5c632 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -1040,11 +1040,14 @@ void do_timer(unsigned long ticks) } /** - * get_xtime_and_monotonic_offset() - get xtime and wall_to_monotonic + * get_xtime_and_monotonic_and_sleep_offset() - get xtime, wall_to_monotonic, + * and sleep offsets. 
* @xtim: pointer to timespec to be set with xtime * @wtom: pointer to timespec to be set with wall_to_monotonic + * @sleep: pointer to timespec to be set with time in suspend */ -void get_xtime_and_monotonic_offset(struct timespec *xtim, struct timespec *wtom) +void get_xtime_and_monotonic_and_sleep_offset(struct timespec *xtim, + struct timespec *wtom, struct timespec *sleep) { unsigned long seq; @@ -1052,6 +1055,7 @@ void get_xtime_and_monotonic_offset(struct timespec *xtim, struct timespec *wtom seq = read_seqbegin(&xtime_lock); *xtim = xtime; *wtom = wall_to_monotonic; + *sleep = total_sleep_time; } while (read_seqretry(&xtime_lock, seq)); } From 70a08cca1227dc31c784ec930099a4417a06e7d0 Mon Sep 17 00:00:00 2001 From: John Stultz Date: Tue, 15 Feb 2011 10:45:16 -0800 Subject: [PATCH 402/706] timers: Add CLOCK_BOOTTIME hrtimer base MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit CLOCK_MONOTONIC stops while the system is in suspend. This is because system suspend is invisible to applications. However, there is a growing set of applications that want to be suspend-aware, but do not want to deal with the complications of CLOCK_REALTIME (which might jump around if settimeofday is called). For these applications, I propose a new clockid: CLOCK_BOOTTIME. CLOCK_BOOTTIME is identical to CLOCK_MONOTONIC, except it also includes any time spent in suspend. This patch adds an hrtimer base for CLOCK_BOOTTIME, using get_monotonic_boottime/ktime_get_boottime, so that in-kernel users can set timers against it. CC: Jamie Lokier CC: Thomas Gleixner CC: Alexander Shishkin CC: Arve Hjønnevåg Signed-off-by: John Stultz --- include/linux/hrtimer.h | 1 + include/linux/time.h | 1 + kernel/hrtimer.c | 16 ++++++++++++---- 3 files changed, 14 insertions(+), 4 deletions(-) diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h index 20b8e6601a04..6bc1804bfbfa 100644 --- a/include/linux/hrtimer.h +++ b/include/linux/hrtimer.h @@ -151,6 +151,7 @@ struct hrtimer_clock_base { enum hrtimer_base_type { HRTIMER_BASE_REALTIME, HRTIMER_BASE_MONOTONIC, + HRTIMER_BASE_BOOTTIME, HRTIMER_MAX_CLOCK_BASES, }; diff --git a/include/linux/time.h b/include/linux/time.h index 02d48fb30b41..454a26205787 100644 --- a/include/linux/time.h +++ b/include/linux/time.h @@ -293,6 +293,7 @@ struct itimerval { #define CLOCK_MONOTONIC_RAW 4 #define CLOCK_REALTIME_COARSE 5 #define CLOCK_MONOTONIC_COARSE 6 +#define CLOCK_BOOTTIME 7 /* * The IDs of various hardware clocks: diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c index e8bf3ad99063..4c53af18734b 100644 --- a/kernel/hrtimer.c +++ b/kernel/hrtimer.c @@ -73,6 +73,11 @@ DEFINE_PER_CPU(struct hrtimer_cpu_base, hrtimer_bases) = .get_time = &ktime_get, .resolution = KTIME_LOW_RES, }, + { + .index = CLOCK_BOOTTIME, + .get_time = &ktime_get_boottime, + .resolution = KTIME_LOW_RES, + }, } }; @@ -90,16 +95,17 @@ static inline int hrtimer_clockid_to_base(clockid_t clock_id) */ static void hrtimer_get_softirq_time(struct hrtimer_cpu_base *base) { - ktime_t xtim, tomono; + ktime_t xtim, mono, boot; struct timespec xts, tom, slp; get_xtime_and_monotonic_and_sleep_offset(&xts, &tom, &slp); xtim = timespec_to_ktime(xts); - tomono = timespec_to_ktime(tom); + mono = ktime_add(xtim, timespec_to_ktime(tom)); + boot = ktime_add(mono, timespec_to_ktime(slp)); base->clock_base[HRTIMER_BASE_REALTIME].softirq_time = xtim; - base->clock_base[HRTIMER_BASE_MONOTONIC].softirq_time = - ktime_add(xtim, tomono); + base->clock_base[HRTIMER_BASE_MONOTONIC].softirq_time = mono; 
+ base->clock_base[HRTIMER_BASE_BOOTTIME].softirq_time = boot; } /* @@ -727,6 +733,7 @@ static int hrtimer_switch_to_hres(void) base->hres_active = 1; base->clock_base[HRTIMER_BASE_REALTIME].resolution = KTIME_HIGH_RES; base->clock_base[HRTIMER_BASE_MONOTONIC].resolution = KTIME_HIGH_RES; + base->clock_base[HRTIMER_BASE_BOOTTIME].resolution = KTIME_HIGH_RES; tick_setup_sched_timer(); @@ -1719,6 +1726,7 @@ void __init hrtimers_init(void) { hrtimer_clock_to_base_table[CLOCK_REALTIME] = HRTIMER_BASE_REALTIME; hrtimer_clock_to_base_table[CLOCK_MONOTONIC] = HRTIMER_BASE_MONOTONIC; + hrtimer_clock_to_base_table[CLOCK_BOOTTIME] = HRTIMER_BASE_BOOTTIME; hrtimer_cpu_notify(&hrtimers_nb, (unsigned long)CPU_UP_PREPARE, (void *)(long)smp_processor_id()); From 7fdd7f89006dd5a4c702fa0ce0c272345fa44ae0 Mon Sep 17 00:00:00 2001 From: John Stultz Date: Tue, 15 Feb 2011 10:52:57 -0800 Subject: [PATCH 403/706] timers: Export CLOCK_BOOTTIME via the posix timers interface MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This patch exports CLOCK_BOOTTIME through the posix timers interface. CC: Jamie Lokier CC: Thomas Gleixner CC: Alexander Shishkin CC: Arve Hjønnevåg Signed-off-by: John Stultz --- kernel/posix-timers.c | 21 ++++++++++++++++++++- 1 file changed, 20 insertions(+), 1 deletion(-) diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c index 44fcff131b38..4c0124919f9a 100644 --- a/kernel/posix-timers.c +++ b/kernel/posix-timers.c @@ -187,7 +187,7 @@ static int posix_ktime_get_ts(clockid_t which_clock, struct timespec *tp) } /* - * Get monotonic time for posix timers + * Get monotonic-raw time for posix timers */ static int posix_get_monotonic_raw(clockid_t which_clock, struct timespec *tp) { @@ -214,6 +214,14 @@ static int posix_get_coarse_res(const clockid_t which_clock, struct timespec *tp *tp = ktime_to_timespec(KTIME_LOW_RES); return 0; } + +static int posix_get_boottime(const clockid_t which_clock, struct timespec *tp) +{ + get_monotonic_boottime(tp); + return 0; +} + + /* * Initialize everything, well, just everything in Posix clocks/timers ;) */ @@ -253,12 +261,23 @@ static __init int init_posix_timers(void) .clock_getres = posix_get_coarse_res, .clock_get = posix_get_monotonic_coarse, }; + struct k_clock clock_boottime = { + .clock_getres = hrtimer_get_res, + .clock_get = posix_get_boottime, + .nsleep = common_nsleep, + .nsleep_restart = hrtimer_nanosleep_restart, + .timer_create = common_timer_create, + .timer_set = common_timer_set, + .timer_get = common_timer_get, + .timer_del = common_timer_del, + }; posix_timers_register_clock(CLOCK_REALTIME, &clock_realtime); posix_timers_register_clock(CLOCK_MONOTONIC, &clock_monotonic); posix_timers_register_clock(CLOCK_MONOTONIC_RAW, &clock_monotonic_raw); posix_timers_register_clock(CLOCK_REALTIME_COARSE, &clock_realtime_coarse); posix_timers_register_clock(CLOCK_MONOTONIC_COARSE, &clock_monotonic_coarse); + posix_timers_register_clock(CLOCK_BOOTTIME, &clock_boottime); posix_timers_cache = kmem_cache_create("posix_timers_cache", sizeof (struct k_itimer), 0, SLAB_PANIC, From fbee632d0ca9f4073a3fefb9a843eac8af036b0f Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Mon, 21 Feb 2011 13:23:57 -0300 Subject: [PATCH 404/706] perf probe: Fix error propagation leading to segfault There are two hunks in this patch that stop probe processing as soon as one error is found, breaking out of loops; the other fixes an error propagation that should return a negative error number but instead was returning the 
result of "ret < 0", which is 1 and thus made several error checks fail because they test agains < 0. The problem could be triggered by asking for a variable that was optimized out, fact that should stop the whole probe processing but instead was segfaulting while installing broken probes: [root@emilia ~]# probe perf_mmap:55 user_lock_limit Failed to find the location of user_lock_limit at this address. Perhaps, it has been optimized out. Failed to find 'user_lock_limit' in this function. Add new events: probe:perf_mmap (on perf_mmap:55 with user_lock_limit) probe:perf_mmap_1 (on perf_mmap:55 with user_lock_limit) Segmentation fault (core dumped) [root@emilia ~]# perf probe -l probe:perf_mmap (on perf_mmap:55@git/linux/kernel/perf_event.c with user_lock_limit) probe:perf_mmap_1 (on perf_mmap:55@git/linux/kernel/perf_event.c with user_lock_limit) [root@emilia ~]# After the fix: [root@emilia ~]# probe perf_mmap:55 user_lock_limit Failed to find the location of user_lock_limit at this address. Perhaps, it has been optimized out. Failed to find 'user_lock_limit' in this function. Error: Failed to add events. (-2) [root@emilia ~]# Cc: Frederic Weisbecker Cc: Ingo Molnar Cc: Masami Hiramatsu Cc: Mike Galbraith Cc: Paul Mackerras Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Tom Zanussi LKML-Reference: Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/probe-event.c | 5 ++++- tools/perf/util/probe-finder.c | 4 +++- 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/tools/perf/util/probe-event.c b/tools/perf/util/probe-event.c index 0e3ea1321103..369ddc64bbb6 100644 --- a/tools/perf/util/probe-event.c +++ b/tools/perf/util/probe-event.c @@ -1832,9 +1832,12 @@ int add_perf_probe_events(struct perf_probe_event *pevs, int npevs, } /* Loop 2: add all events */ - for (i = 0; i < npevs && ret >= 0; i++) + for (i = 0; i < npevs && ret >= 0; i++) { ret = __add_probe_trace_events(pkgs[i].pev, pkgs[i].tevs, pkgs[i].ntevs, force_add); + if (ret < 0) + break; + } end: /* Loop 3: cleanup and free trace events */ for (i = 0; i < npevs; i++) { diff --git a/tools/perf/util/probe-finder.c b/tools/perf/util/probe-finder.c index fe461f6559f1..eecbdca5c0d5 100644 --- a/tools/perf/util/probe-finder.c +++ b/tools/perf/util/probe-finder.c @@ -1262,7 +1262,7 @@ static int probe_point_line_walker(const char *fname, int lineno, ret = call_probe_finder(NULL, pf); /* Continue if no error, because the line will be in inline function */ - return ret < 0 ?: 0; + return ret < 0 ? ret : 0; } /* Find probe point from its line number */ @@ -1484,6 +1484,8 @@ static int find_probes(int fd, struct probe_finder *pf) pf->lno = pp->line; ret = find_probe_point_by_line(pf); } + if (ret != DWARF_CB_OK) + break; } off = noff; } From e603dc15072c7fec0ae263597e6dabc3bb4c5c5b Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Mon, 21 Feb 2011 16:05:50 -0300 Subject: [PATCH 405/706] perf evsel: Fix inverted test for fixing up attr.inherit flag The kernel refuses mmapping an event with the inherit flag set for something that is systemwide (cpu == -1), and the evsel layer got this reversed at some point, fix it. The symtom was that the --pid and --tid parameters for 'perf record' and 'perf top' returned with -EINVAL, like: # /tmp/build-perf/perf record -v -fo/tmp/perf.data -p 1042 Warning: ... 
trying to fall back to cpu-clock-ticks Fatal: failed to mmap with 22 (Invalid argument) Reported-by: David Ahern Cc: Frederic Weisbecker Cc: Ingo Molnar Cc: Mike Galbraith Cc: Paul Mackerras Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Tom Zanussi LKML-Reference: Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/evsel.c | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/tools/perf/util/evsel.c b/tools/perf/util/evsel.c index 63cadaf3e208..8083d5126fca 100644 --- a/tools/perf/util/evsel.c +++ b/tools/perf/util/evsel.c @@ -179,8 +179,19 @@ static int __perf_evsel__open(struct perf_evsel *evsel, struct cpu_map *cpus, for (cpu = 0; cpu < cpus->nr; cpu++) { int group_fd = -1; - - evsel->attr.inherit = (cpus->map[cpu] < 0) && inherit; + /* + * Don't allow mmap() of inherited per-task counters. This + * would create a performance issue due to all children writing + * to the same buffer. + * + * FIXME: + * Proper fix is not to pass 'inherit' to perf_evsel__open*, + * but a 'flags' parameter, with 'group' folded there as well, + * then introduce a PERF_O_{MMAP,GROUP,INHERIT} enum, and if + * O_MMAP is set, emit a warning if cpu < 0 and O_INHERIT is + * set. Lets go for the minimal fix first tho. + */ + evsel->attr.inherit = (cpus->map[cpu] >= 0) && inherit; for (thread = 0; thread < threads->nr; thread++) { From 8635bf6ea3402154eec64763e6ed14972013c1c1 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Tue, 22 Feb 2011 06:56:18 -0300 Subject: [PATCH 406/706] perf probe: Remove redundant checks While fixing an error propagation problem in f809b25 I added two redundant checks. I did that because I didn't expect the checks to be on the while and for loop condition expressions, where they are tested before we run the loop body, where the 'ret' variable is set. So remove them from there and leave the check just after 'ret' is actually set, eliminating unneeded tests. 
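[ Distilled illustration of the point above: with the break inside the body, a 'ret >= 0' clause in the loop condition can never be false when it is evaluated, so the check belongs right after the call that sets 'ret'. This mirrors the resulting loop in the diff below. ]

	int ret = 0;

	for (i = 0; i < npevs; i++) {
		ret = __add_probe_trace_events(pkgs[i].pev, pkgs[i].tevs,
					       pkgs[i].ntevs, force_add);
		if (ret < 0)		/* test where 'ret' is actually set */
			break;
	}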
Reported-by: Masami Hiramatsu Cc: Frederic Weisbecker Cc: Ingo Molnar Cc: Masami Hiramatsu Cc: Mike Galbraith Cc: Paul Mackerras Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Tom Zanussi LKML-Reference: Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/probe-event.c | 2 +- tools/perf/util/probe-finder.c | 5 ++--- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/tools/perf/util/probe-event.c b/tools/perf/util/probe-event.c index 369ddc64bbb6..5ddee66020a7 100644 --- a/tools/perf/util/probe-event.c +++ b/tools/perf/util/probe-event.c @@ -1832,7 +1832,7 @@ int add_perf_probe_events(struct perf_probe_event *pevs, int npevs, } /* Loop 2: add all events */ - for (i = 0; i < npevs && ret >= 0; i++) { + for (i = 0; i < npevs; i++) { ret = __add_probe_trace_events(pkgs[i].pev, pkgs[i].tevs, pkgs[i].ntevs, force_add); if (ret < 0) break; diff --git a/tools/perf/util/probe-finder.c b/tools/perf/util/probe-finder.c index eecbdca5c0d5..17f9c4a66ddd 100644 --- a/tools/perf/util/probe-finder.c +++ b/tools/perf/util/probe-finder.c @@ -1462,8 +1462,7 @@ static int find_probes(int fd, struct probe_finder *pf) off = 0; line_list__init(&pf->lcache); /* Loop on CUs (Compilation Unit) */ - while (!dwarf_nextcu(dbg, off, &noff, &cuhl, NULL, NULL, NULL) && - ret >= 0) { /* Get the DIE(Debugging Information Entry) of this CU */ diep = dwarf_offdie(dbg, off + cuhl, &pf->cu_die); if (!diep) @@ -1484,7 +1483,7 @@ static int find_probes(int fd, struct probe_finder *pf) pf->lno = pp->line; ret = find_probe_point_by_line(pf); } - if (ret != DWARF_CB_OK) + if (ret < 0) break; } off = noff; From fbe99959d1db85222829a64d869dcab704ac7ec8 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 22 Feb 2011 11:10:08 +0100 Subject: [PATCH 407/706] x86-64, NUMA: Prepare numa_emulation() for moving NUMA emulation into a separate file Update numa_emulation() such that it - takes @numa_meminfo and @numa_dist_cnt instead of directly referencing the global variables. - copies the distance table by iterating each distance with node_distance() instead of memcpy'ing the distance table. - tests emu_cmdline to determine whether emulation is requested and fills emu_nid_to_phys[] with identity mapping if emulation is not used. This allows the caller to call numa_emulation() unconditionally and makes the return value unnecessary. - defines a dummy version if CONFIG_NUMA_EMU is disabled. This patch doesn't introduce any behavior change. Signed-off-by: Tejun Heo Cc: Yinghai Lu Cc: Ingo Molnar --- arch/x86/mm/numa_64.c | 56 +++++++++++++++++++++++++------------------ 1 file changed, 33 insertions(+), 23 deletions(-) diff --git a/arch/x86/mm/numa_64.c b/arch/x86/mm/numa_64.c index 6e4ee96d1b11..980d51458c4b 100644 --- a/arch/x86/mm/numa_64.c +++ b/arch/x86/mm/numa_64.c @@ -789,17 +789,20 @@ static int __init split_nodes_size_interleave(struct numa_meminfo *ei, * Sets up the system RAM area from start_pfn to last_pfn according to the * numa=fake command-line option. 
*/ -static bool __init numa_emulation(void) +static void __init numa_emulation(struct numa_meminfo *numa_meminfo, + int numa_dist_cnt) { static struct numa_meminfo ei __initdata; static struct numa_meminfo pi __initdata; const u64 max_addr = max_pfn << PAGE_SHIFT; - int phys_dist_cnt = numa_distance_cnt; u8 *phys_dist = NULL; int i, j, ret; + if (!emu_cmdline) + goto no_emu; + memset(&ei, 0, sizeof(ei)); - pi = numa_meminfo; + pi = *numa_meminfo; for (i = 0; i < MAX_NUMNODES; i++) emu_nid_to_phys[i] = NUMA_NO_NODE; @@ -822,19 +825,19 @@ static bool __init numa_emulation(void) } if (ret < 0) - return false; + goto no_emu; if (numa_cleanup_meminfo(&ei) < 0) { pr_warning("NUMA: Warning: constructed meminfo invalid, disabling emulation\n"); - return false; + goto no_emu; } /* * Copy the original distance table. It's temporary so no need to * reserve it. */ - if (phys_dist_cnt) { - size_t size = phys_dist_cnt * sizeof(numa_distance[0]); + if (numa_dist_cnt) { + size_t size = numa_dist_cnt * sizeof(phys_dist[0]); u64 phys; phys = memblock_find_in_range(0, @@ -842,14 +845,18 @@ static bool __init numa_emulation(void) size, PAGE_SIZE); if (phys == MEMBLOCK_ERROR) { pr_warning("NUMA: Warning: can't allocate copy of distance table, disabling emulation\n"); - return false; + goto no_emu; } phys_dist = __va(phys); - memcpy(phys_dist, numa_distance, size); + + for (i = 0; i < numa_dist_cnt; i++) + for (j = 0; j < numa_dist_cnt; j++) + phys_dist[i * numa_dist_cnt + j] = + node_distance(i, j); } /* commit */ - numa_meminfo = ei; + *numa_meminfo = ei; /* * Transform __apicid_to_node table to use emulated nids by @@ -878,18 +885,27 @@ static bool __init numa_emulation(void) int physj = emu_nid_to_phys[j]; int dist; - if (physi >= phys_dist_cnt || physj >= phys_dist_cnt) + if (physi >= numa_dist_cnt || physj >= numa_dist_cnt) dist = physi == physj ? LOCAL_DISTANCE : REMOTE_DISTANCE; else - dist = phys_dist[physi * phys_dist_cnt + physj]; + dist = phys_dist[physi * numa_dist_cnt + physj]; numa_set_distance(i, j, dist); } } - return true; + return; + +no_emu: + /* No emulation. Build identity emu_nid_to_phys[] for numa_add_cpu() */ + for (i = 0; i < ARRAY_SIZE(emu_nid_to_phys); i++) + emu_nid_to_phys[i] = i; } -#endif /* CONFIG_NUMA_EMU */ +#else /* CONFIG_NUMA_EMU */ +static inline void numa_emulation(struct numa_meminfo *numa_meminfo, + int numa_dist_cnt) +{ } +#endif /* CONFIG_NUMA_EMU */ static int __init dummy_numa_init(void) { @@ -937,15 +953,9 @@ void __init initmem_init(void) if (numa_cleanup_meminfo(&numa_meminfo) < 0) continue; -#ifdef CONFIG_NUMA_EMU - /* - * If requested, try emulation. If emulation is not used, - * build identity emu_nid_to_phys[] for numa_add_cpu() - */ - if (!emu_cmdline || !numa_emulation()) - for (j = 0; j < ARRAY_SIZE(emu_nid_to_phys); j++) - emu_nid_to_phys[j] = j; -#endif + + numa_emulation(&numa_meminfo, numa_distance_cnt); + if (numa_register_memblks(&numa_meminfo) < 0) continue; From b8ef9172b2aad7eeb1fcd37a9e632c7b24da1f64 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 22 Feb 2011 11:10:08 +0100 Subject: [PATCH 408/706] x86-64, NUMA: Move NUMA emulation into numa_emulation.c Create numa_emulation.c and move all NUMA emulation code there. The definitions of struct numa_memblk and numa_meminfo are moved to numa_64.h. Also, numa_remove_memblk_from(), numa_cleanup_meminfo(), numa_reset_distance() along with numa_emulation() are made global. - v2: Internal declarations moved to numa_internal.h as suggested by Yinghai. 
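The split leans on a standard kernel idiom, visible in the new numa_internal.h further down: when CONFIG_NUMA_EMU is off, the real prototype is replaced by an empty static inline stub, so numa_64.c can call numa_emulation() unconditionally with no #ifdef at the call site. A minimal sketch of that shape, trimmed from the header introduced below:

	/* numa_internal.h, reduced to the relevant idiom */
	#ifdef CONFIG_NUMA_EMU
	void __init numa_emulation(struct numa_meminfo *numa_meminfo,
				   int numa_dist_cnt);
	#else
	static inline void numa_emulation(struct numa_meminfo *numa_meminfo,
					  int numa_dist_cnt)
	{ }
	#endif

	/* the caller in numa_64.c then needs no guard either way */
	numa_emulation(&numa_meminfo, numa_distance_cnt);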
Signed-off-by: Tejun Heo Acked-by: Yinghai Lu Cc: Ingo Molnar --- arch/x86/mm/Makefile | 1 + arch/x86/mm/numa_64.c | 480 +---------------------------------- arch/x86/mm/numa_emulation.c | 452 +++++++++++++++++++++++++++++++++ arch/x86/mm/numa_internal.h | 31 +++ 4 files changed, 488 insertions(+), 476 deletions(-) create mode 100644 arch/x86/mm/numa_emulation.c create mode 100644 arch/x86/mm/numa_internal.h diff --git a/arch/x86/mm/Makefile b/arch/x86/mm/Makefile index 09df2f9a3d69..3e608edf9958 100644 --- a/arch/x86/mm/Makefile +++ b/arch/x86/mm/Makefile @@ -25,6 +25,7 @@ obj-$(CONFIG_MMIOTRACE_TEST) += testmmiotrace.o obj-$(CONFIG_NUMA) += numa.o numa_$(BITS).o obj-$(CONFIG_AMD_NUMA) += amdtopology_64.o obj-$(CONFIG_ACPI_NUMA) += srat_$(BITS).o +obj-$(CONFIG_NUMA_EMU) += numa_emulation.o obj-$(CONFIG_HAVE_MEMBLOCK) += memblock.o diff --git a/arch/x86/mm/numa_64.c b/arch/x86/mm/numa_64.c index 980d51458c4b..45a361b16a59 100644 --- a/arch/x86/mm/numa_64.c +++ b/arch/x86/mm/numa_64.c @@ -18,20 +18,10 @@ #include #include #include -#include #include #include -struct numa_memblk { - u64 start; - u64 end; - int nid; -}; - -struct numa_meminfo { - int nr_blks; - struct numa_memblk blk[NR_NODE_MEMBLKS]; -}; +#include "numa_internal.h" struct pglist_data *node_data[MAX_NUMNODES] __read_mostly; EXPORT_SYMBOL(node_data); @@ -215,7 +205,7 @@ static int __init numa_add_memblk_to(int nid, u64 start, u64 end, return 0; } -static void __init numa_remove_memblk_from(int idx, struct numa_meminfo *mi) +void __init numa_remove_memblk_from(int idx, struct numa_meminfo *mi) { mi->nr_blks--; memmove(&mi->blk[idx], &mi->blk[idx + 1], @@ -273,7 +263,7 @@ setup_node_bootmem(int nodeid, unsigned long start, unsigned long end) node_set_online(nodeid); } -static int __init numa_cleanup_meminfo(struct numa_meminfo *mi) +int __init numa_cleanup_meminfo(struct numa_meminfo *mi) { const u64 low = 0; const u64 high = (u64)max_pfn << PAGE_SHIFT; @@ -367,7 +357,7 @@ static void __init numa_nodemask_from_meminfo(nodemask_t *nodemask, * Reset distance table. The current table is freed. The next * numa_set_distance() call will create a new one. */ -static void __init numa_reset_distance(void) +void __init numa_reset_distance(void) { size_t size; @@ -525,388 +515,6 @@ static int __init numa_register_memblks(struct numa_meminfo *mi) return 0; } -#ifdef CONFIG_NUMA_EMU -/* Numa emulation */ -static int emu_nid_to_phys[MAX_NUMNODES] __cpuinitdata; -static char *emu_cmdline __initdata; - -void __init numa_emu_cmdline(char *str) -{ - emu_cmdline = str; -} - -static int __init emu_find_memblk_by_nid(int nid, const struct numa_meminfo *mi) -{ - int i; - - for (i = 0; i < mi->nr_blks; i++) - if (mi->blk[i].nid == nid) - return i; - return -ENOENT; -} - -/* - * Sets up nid to range from @start to @end. The return value is -errno if - * something went wrong, 0 otherwise. 
- */ -static int __init emu_setup_memblk(struct numa_meminfo *ei, - struct numa_meminfo *pi, - int nid, int phys_blk, u64 size) -{ - struct numa_memblk *eb = &ei->blk[ei->nr_blks]; - struct numa_memblk *pb = &pi->blk[phys_blk]; - - if (ei->nr_blks >= NR_NODE_MEMBLKS) { - pr_err("NUMA: Too many emulated memblks, failing emulation\n"); - return -EINVAL; - } - - ei->nr_blks++; - eb->start = pb->start; - eb->end = pb->start + size; - eb->nid = nid; - - if (emu_nid_to_phys[nid] == NUMA_NO_NODE) - emu_nid_to_phys[nid] = pb->nid; - - pb->start += size; - if (pb->start >= pb->end) { - WARN_ON_ONCE(pb->start > pb->end); - numa_remove_memblk_from(phys_blk, pi); - } - - printk(KERN_INFO "Faking node %d at %016Lx-%016Lx (%LuMB)\n", nid, - eb->start, eb->end, (eb->end - eb->start) >> 20); - return 0; -} - -/* - * Sets up nr_nodes fake nodes interleaved over physical nodes ranging from addr - * to max_addr. The return value is the number of nodes allocated. - */ -static int __init split_nodes_interleave(struct numa_meminfo *ei, - struct numa_meminfo *pi, - u64 addr, u64 max_addr, int nr_nodes) -{ - nodemask_t physnode_mask = NODE_MASK_NONE; - u64 size; - int big; - int nid = 0; - int i, ret; - - if (nr_nodes <= 0) - return -1; - if (nr_nodes > MAX_NUMNODES) { - pr_info("numa=fake=%d too large, reducing to %d\n", - nr_nodes, MAX_NUMNODES); - nr_nodes = MAX_NUMNODES; - } - - size = (max_addr - addr - memblock_x86_hole_size(addr, max_addr)) / nr_nodes; - /* - * Calculate the number of big nodes that can be allocated as a result - * of consolidating the remainder. - */ - big = ((size & ~FAKE_NODE_MIN_HASH_MASK) * nr_nodes) / - FAKE_NODE_MIN_SIZE; - - size &= FAKE_NODE_MIN_HASH_MASK; - if (!size) { - pr_err("Not enough memory for each node. " - "NUMA emulation disabled.\n"); - return -1; - } - - for (i = 0; i < pi->nr_blks; i++) - node_set(pi->blk[i].nid, physnode_mask); - - /* - * Continue to fill physical nodes with fake nodes until there is no - * memory left on any of them. - */ - while (nodes_weight(physnode_mask)) { - for_each_node_mask(i, physnode_mask) { - u64 dma32_end = PFN_PHYS(MAX_DMA32_PFN); - u64 start, limit, end; - int phys_blk; - - phys_blk = emu_find_memblk_by_nid(i, pi); - if (phys_blk < 0) { - node_clear(i, physnode_mask); - continue; - } - start = pi->blk[phys_blk].start; - limit = pi->blk[phys_blk].end; - end = start + size; - - if (nid < big) - end += FAKE_NODE_MIN_SIZE; - - /* - * Continue to add memory to this fake node if its - * non-reserved memory is less than the per-node size. - */ - while (end - start - - memblock_x86_hole_size(start, end) < size) { - end += FAKE_NODE_MIN_SIZE; - if (end > limit) { - end = limit; - break; - } - } - - /* - * If there won't be at least FAKE_NODE_MIN_SIZE of - * non-reserved memory in ZONE_DMA32 for the next node, - * this one must extend to the boundary. - */ - if (end < dma32_end && dma32_end - end - - memblock_x86_hole_size(end, dma32_end) < FAKE_NODE_MIN_SIZE) - end = dma32_end; - - /* - * If there won't be enough non-reserved memory for the - * next node, this one must extend to the end of the - * physical node. - */ - if (limit - end - - memblock_x86_hole_size(end, limit) < size) - end = limit; - - ret = emu_setup_memblk(ei, pi, nid++ % nr_nodes, - phys_blk, - min(end, limit) - start); - if (ret < 0) - return ret; - } - } - return 0; -} - -/* - * Returns the end address of a node so that there is at least `size' amount of - * non-reserved memory or `max_addr' is reached. 
- */ -static u64 __init find_end_of_node(u64 start, u64 max_addr, u64 size) -{ - u64 end = start + size; - - while (end - start - memblock_x86_hole_size(start, end) < size) { - end += FAKE_NODE_MIN_SIZE; - if (end > max_addr) { - end = max_addr; - break; - } - } - return end; -} - -/* - * Sets up fake nodes of `size' interleaved over physical nodes ranging from - * `addr' to `max_addr'. The return value is the number of nodes allocated. - */ -static int __init split_nodes_size_interleave(struct numa_meminfo *ei, - struct numa_meminfo *pi, - u64 addr, u64 max_addr, u64 size) -{ - nodemask_t physnode_mask = NODE_MASK_NONE; - u64 min_size; - int nid = 0; - int i, ret; - - if (!size) - return -1; - /* - * The limit on emulated nodes is MAX_NUMNODES, so the size per node is - * increased accordingly if the requested size is too small. This - * creates a uniform distribution of node sizes across the entire - * machine (but not necessarily over physical nodes). - */ - min_size = (max_addr - addr - memblock_x86_hole_size(addr, max_addr)) / - MAX_NUMNODES; - min_size = max(min_size, FAKE_NODE_MIN_SIZE); - if ((min_size & FAKE_NODE_MIN_HASH_MASK) < min_size) - min_size = (min_size + FAKE_NODE_MIN_SIZE) & - FAKE_NODE_MIN_HASH_MASK; - if (size < min_size) { - pr_err("Fake node size %LuMB too small, increasing to %LuMB\n", - size >> 20, min_size >> 20); - size = min_size; - } - size &= FAKE_NODE_MIN_HASH_MASK; - - for (i = 0; i < pi->nr_blks; i++) - node_set(pi->blk[i].nid, physnode_mask); - - /* - * Fill physical nodes with fake nodes of size until there is no memory - * left on any of them. - */ - while (nodes_weight(physnode_mask)) { - for_each_node_mask(i, physnode_mask) { - u64 dma32_end = MAX_DMA32_PFN << PAGE_SHIFT; - u64 start, limit, end; - int phys_blk; - - phys_blk = emu_find_memblk_by_nid(i, pi); - if (phys_blk < 0) { - node_clear(i, physnode_mask); - continue; - } - start = pi->blk[phys_blk].start; - limit = pi->blk[phys_blk].end; - - end = find_end_of_node(start, limit, size); - /* - * If there won't be at least FAKE_NODE_MIN_SIZE of - * non-reserved memory in ZONE_DMA32 for the next node, - * this one must extend to the boundary. - */ - if (end < dma32_end && dma32_end - end - - memblock_x86_hole_size(end, dma32_end) < FAKE_NODE_MIN_SIZE) - end = dma32_end; - - /* - * If there won't be enough non-reserved memory for the - * next node, this one must extend to the end of the - * physical node. - */ - if (limit - end - - memblock_x86_hole_size(end, limit) < size) - end = limit; - - ret = emu_setup_memblk(ei, pi, nid++ % MAX_NUMNODES, - phys_blk, - min(end, limit) - start); - if (ret < 0) - return ret; - } - } - return 0; -} - -/* - * Sets up the system RAM area from start_pfn to last_pfn according to the - * numa=fake command-line option. - */ -static void __init numa_emulation(struct numa_meminfo *numa_meminfo, - int numa_dist_cnt) -{ - static struct numa_meminfo ei __initdata; - static struct numa_meminfo pi __initdata; - const u64 max_addr = max_pfn << PAGE_SHIFT; - u8 *phys_dist = NULL; - int i, j, ret; - - if (!emu_cmdline) - goto no_emu; - - memset(&ei, 0, sizeof(ei)); - pi = *numa_meminfo; - - for (i = 0; i < MAX_NUMNODES; i++) - emu_nid_to_phys[i] = NUMA_NO_NODE; - - /* - * If the numa=fake command-line contains a 'M' or 'G', it represents - * the fixed node size. Otherwise, if it is just a single number N, - * split the system RAM into N fake nodes. 
- */ - if (strchr(emu_cmdline, 'M') || strchr(emu_cmdline, 'G')) { - u64 size; - - size = memparse(emu_cmdline, &emu_cmdline); - ret = split_nodes_size_interleave(&ei, &pi, 0, max_addr, size); - } else { - unsigned long n; - - n = simple_strtoul(emu_cmdline, NULL, 0); - ret = split_nodes_interleave(&ei, &pi, 0, max_addr, n); - } - - if (ret < 0) - goto no_emu; - - if (numa_cleanup_meminfo(&ei) < 0) { - pr_warning("NUMA: Warning: constructed meminfo invalid, disabling emulation\n"); - goto no_emu; - } - - /* - * Copy the original distance table. It's temporary so no need to - * reserve it. - */ - if (numa_dist_cnt) { - size_t size = numa_dist_cnt * sizeof(phys_dist[0]); - u64 phys; - - phys = memblock_find_in_range(0, - (u64)max_pfn_mapped << PAGE_SHIFT, - size, PAGE_SIZE); - if (phys == MEMBLOCK_ERROR) { - pr_warning("NUMA: Warning: can't allocate copy of distance table, disabling emulation\n"); - goto no_emu; - } - phys_dist = __va(phys); - - for (i = 0; i < numa_dist_cnt; i++) - for (j = 0; j < numa_dist_cnt; j++) - phys_dist[i * numa_dist_cnt + j] = - node_distance(i, j); - } - - /* commit */ - *numa_meminfo = ei; - - /* - * Transform __apicid_to_node table to use emulated nids by - * reverse-mapping phys_nid. The maps should always exist but fall - * back to zero just in case. - */ - for (i = 0; i < ARRAY_SIZE(__apicid_to_node); i++) { - if (__apicid_to_node[i] == NUMA_NO_NODE) - continue; - for (j = 0; j < ARRAY_SIZE(emu_nid_to_phys); j++) - if (__apicid_to_node[i] == emu_nid_to_phys[j]) - break; - __apicid_to_node[i] = j < ARRAY_SIZE(emu_nid_to_phys) ? j : 0; - } - - /* make sure all emulated nodes are mapped to a physical node */ - for (i = 0; i < ARRAY_SIZE(emu_nid_to_phys); i++) - if (emu_nid_to_phys[i] == NUMA_NO_NODE) - emu_nid_to_phys[i] = 0; - - /* transform distance table */ - numa_reset_distance(); - for (i = 0; i < MAX_NUMNODES; i++) { - for (j = 0; j < MAX_NUMNODES; j++) { - int physi = emu_nid_to_phys[i]; - int physj = emu_nid_to_phys[j]; - int dist; - - if (physi >= numa_dist_cnt || physj >= numa_dist_cnt) - dist = physi == physj ? - LOCAL_DISTANCE : REMOTE_DISTANCE; - else - dist = phys_dist[physi * numa_dist_cnt + physj]; - - numa_set_distance(i, j, dist); - } - } - return; - -no_emu: - /* No emulation. Build identity emu_nid_to_phys[] for numa_add_cpu() */ - for (i = 0; i < ARRAY_SIZE(emu_nid_to_phys); i++) - emu_nid_to_phys[i] = i; -} -#else /* CONFIG_NUMA_EMU */ -static inline void numa_emulation(struct numa_meminfo *numa_meminfo, - int numa_dist_cnt) -{ } -#endif /* CONFIG_NUMA_EMU */ - static int __init dummy_numa_init(void) { printk(KERN_INFO "%s\n", @@ -994,83 +602,3 @@ int __cpuinit numa_cpu_node(int cpu) return __apicid_to_node[apicid]; return NUMA_NO_NODE; } - -/* - * UGLINESS AHEAD: Currently, CONFIG_NUMA_EMU is 64bit only and makes use - * of 64bit specific data structures. The distinction is artificial and - * should be removed. numa_{add|remove}_cpu() are implemented in numa.c - * for both 32 and 64bit when CONFIG_NUMA_EMU is disabled but here when - * enabled. - * - * NUMA emulation is planned to be made generic and the following and other - * related code should be moved to numa.c. 
- */ -#ifdef CONFIG_NUMA_EMU -# ifndef CONFIG_DEBUG_PER_CPU_MAPS -void __cpuinit numa_add_cpu(int cpu) -{ - int physnid, nid; - - nid = numa_cpu_node(cpu); - if (nid == NUMA_NO_NODE) - nid = early_cpu_to_node(cpu); - BUG_ON(nid == NUMA_NO_NODE || !node_online(nid)); - - physnid = emu_nid_to_phys[nid]; - - /* - * Map the cpu to each emulated node that is allocated on the physical - * node of the cpu's apic id. - */ - for_each_online_node(nid) - if (emu_nid_to_phys[nid] == physnid) - cpumask_set_cpu(cpu, node_to_cpumask_map[nid]); -} - -void __cpuinit numa_remove_cpu(int cpu) -{ - int i; - - for_each_online_node(i) - cpumask_clear_cpu(cpu, node_to_cpumask_map[i]); -} -# else /* !CONFIG_DEBUG_PER_CPU_MAPS */ -static void __cpuinit numa_set_cpumask(int cpu, int enable) -{ - struct cpumask *mask; - int nid, physnid, i; - - nid = early_cpu_to_node(cpu); - if (nid == NUMA_NO_NODE) { - /* early_cpu_to_node() already emits a warning and trace */ - return; - } - - physnid = emu_nid_to_phys[nid]; - - for_each_online_node(i) { - if (emu_nid_to_phys[nid] != physnid) - continue; - - mask = debug_cpumask_set_cpu(cpu, enable); - if (!mask) - return; - - if (enable) - cpumask_set_cpu(cpu, mask); - else - cpumask_clear_cpu(cpu, mask); - } -} - -void __cpuinit numa_add_cpu(int cpu) -{ - numa_set_cpumask(cpu, 1); -} - -void __cpuinit numa_remove_cpu(int cpu) -{ - numa_set_cpumask(cpu, 0); -} -# endif /* !CONFIG_DEBUG_PER_CPU_MAPS */ -#endif /* CONFIG_NUMA_EMU */ diff --git a/arch/x86/mm/numa_emulation.c b/arch/x86/mm/numa_emulation.c new file mode 100644 index 000000000000..23fa2d00253a --- /dev/null +++ b/arch/x86/mm/numa_emulation.c @@ -0,0 +1,452 @@ +/* + * NUMA emulation + */ +#include +#include +#include +#include +#include + +#include "numa_internal.h" + +static int emu_nid_to_phys[MAX_NUMNODES] __cpuinitdata; +static char *emu_cmdline __initdata; + +void __init numa_emu_cmdline(char *str) +{ + emu_cmdline = str; +} + +static int __init emu_find_memblk_by_nid(int nid, const struct numa_meminfo *mi) +{ + int i; + + for (i = 0; i < mi->nr_blks; i++) + if (mi->blk[i].nid == nid) + return i; + return -ENOENT; +} + +/* + * Sets up nid to range from @start to @end. The return value is -errno if + * something went wrong, 0 otherwise. + */ +static int __init emu_setup_memblk(struct numa_meminfo *ei, + struct numa_meminfo *pi, + int nid, int phys_blk, u64 size) +{ + struct numa_memblk *eb = &ei->blk[ei->nr_blks]; + struct numa_memblk *pb = &pi->blk[phys_blk]; + + if (ei->nr_blks >= NR_NODE_MEMBLKS) { + pr_err("NUMA: Too many emulated memblks, failing emulation\n"); + return -EINVAL; + } + + ei->nr_blks++; + eb->start = pb->start; + eb->end = pb->start + size; + eb->nid = nid; + + if (emu_nid_to_phys[nid] == NUMA_NO_NODE) + emu_nid_to_phys[nid] = pb->nid; + + pb->start += size; + if (pb->start >= pb->end) { + WARN_ON_ONCE(pb->start > pb->end); + numa_remove_memblk_from(phys_blk, pi); + } + + printk(KERN_INFO "Faking node %d at %016Lx-%016Lx (%LuMB)\n", nid, + eb->start, eb->end, (eb->end - eb->start) >> 20); + return 0; +} + +/* + * Sets up nr_nodes fake nodes interleaved over physical nodes ranging from addr + * to max_addr. The return value is the number of nodes allocated. 
+ */ +static int __init split_nodes_interleave(struct numa_meminfo *ei, + struct numa_meminfo *pi, + u64 addr, u64 max_addr, int nr_nodes) +{ + nodemask_t physnode_mask = NODE_MASK_NONE; + u64 size; + int big; + int nid = 0; + int i, ret; + + if (nr_nodes <= 0) + return -1; + if (nr_nodes > MAX_NUMNODES) { + pr_info("numa=fake=%d too large, reducing to %d\n", + nr_nodes, MAX_NUMNODES); + nr_nodes = MAX_NUMNODES; + } + + size = (max_addr - addr - memblock_x86_hole_size(addr, max_addr)) / nr_nodes; + /* + * Calculate the number of big nodes that can be allocated as a result + * of consolidating the remainder. + */ + big = ((size & ~FAKE_NODE_MIN_HASH_MASK) * nr_nodes) / + FAKE_NODE_MIN_SIZE; + + size &= FAKE_NODE_MIN_HASH_MASK; + if (!size) { + pr_err("Not enough memory for each node. " + "NUMA emulation disabled.\n"); + return -1; + } + + for (i = 0; i < pi->nr_blks; i++) + node_set(pi->blk[i].nid, physnode_mask); + + /* + * Continue to fill physical nodes with fake nodes until there is no + * memory left on any of them. + */ + while (nodes_weight(physnode_mask)) { + for_each_node_mask(i, physnode_mask) { + u64 dma32_end = PFN_PHYS(MAX_DMA32_PFN); + u64 start, limit, end; + int phys_blk; + + phys_blk = emu_find_memblk_by_nid(i, pi); + if (phys_blk < 0) { + node_clear(i, physnode_mask); + continue; + } + start = pi->blk[phys_blk].start; + limit = pi->blk[phys_blk].end; + end = start + size; + + if (nid < big) + end += FAKE_NODE_MIN_SIZE; + + /* + * Continue to add memory to this fake node if its + * non-reserved memory is less than the per-node size. + */ + while (end - start - + memblock_x86_hole_size(start, end) < size) { + end += FAKE_NODE_MIN_SIZE; + if (end > limit) { + end = limit; + break; + } + } + + /* + * If there won't be at least FAKE_NODE_MIN_SIZE of + * non-reserved memory in ZONE_DMA32 for the next node, + * this one must extend to the boundary. + */ + if (end < dma32_end && dma32_end - end - + memblock_x86_hole_size(end, dma32_end) < FAKE_NODE_MIN_SIZE) + end = dma32_end; + + /* + * If there won't be enough non-reserved memory for the + * next node, this one must extend to the end of the + * physical node. + */ + if (limit - end - + memblock_x86_hole_size(end, limit) < size) + end = limit; + + ret = emu_setup_memblk(ei, pi, nid++ % nr_nodes, + phys_blk, + min(end, limit) - start); + if (ret < 0) + return ret; + } + } + return 0; +} + +/* + * Returns the end address of a node so that there is at least `size' amount of + * non-reserved memory or `max_addr' is reached. + */ +static u64 __init find_end_of_node(u64 start, u64 max_addr, u64 size) +{ + u64 end = start + size; + + while (end - start - memblock_x86_hole_size(start, end) < size) { + end += FAKE_NODE_MIN_SIZE; + if (end > max_addr) { + end = max_addr; + break; + } + } + return end; +} + +/* + * Sets up fake nodes of `size' interleaved over physical nodes ranging from + * `addr' to `max_addr'. The return value is the number of nodes allocated. + */ +static int __init split_nodes_size_interleave(struct numa_meminfo *ei, + struct numa_meminfo *pi, + u64 addr, u64 max_addr, u64 size) +{ + nodemask_t physnode_mask = NODE_MASK_NONE; + u64 min_size; + int nid = 0; + int i, ret; + + if (!size) + return -1; + /* + * The limit on emulated nodes is MAX_NUMNODES, so the size per node is + * increased accordingly if the requested size is too small. This + * creates a uniform distribution of node sizes across the entire + * machine (but not necessarily over physical nodes). 
+ */ + min_size = (max_addr - addr - memblock_x86_hole_size(addr, max_addr)) / + MAX_NUMNODES; + min_size = max(min_size, FAKE_NODE_MIN_SIZE); + if ((min_size & FAKE_NODE_MIN_HASH_MASK) < min_size) + min_size = (min_size + FAKE_NODE_MIN_SIZE) & + FAKE_NODE_MIN_HASH_MASK; + if (size < min_size) { + pr_err("Fake node size %LuMB too small, increasing to %LuMB\n", + size >> 20, min_size >> 20); + size = min_size; + } + size &= FAKE_NODE_MIN_HASH_MASK; + + for (i = 0; i < pi->nr_blks; i++) + node_set(pi->blk[i].nid, physnode_mask); + + /* + * Fill physical nodes with fake nodes of size until there is no memory + * left on any of them. + */ + while (nodes_weight(physnode_mask)) { + for_each_node_mask(i, physnode_mask) { + u64 dma32_end = MAX_DMA32_PFN << PAGE_SHIFT; + u64 start, limit, end; + int phys_blk; + + phys_blk = emu_find_memblk_by_nid(i, pi); + if (phys_blk < 0) { + node_clear(i, physnode_mask); + continue; + } + start = pi->blk[phys_blk].start; + limit = pi->blk[phys_blk].end; + + end = find_end_of_node(start, limit, size); + /* + * If there won't be at least FAKE_NODE_MIN_SIZE of + * non-reserved memory in ZONE_DMA32 for the next node, + * this one must extend to the boundary. + */ + if (end < dma32_end && dma32_end - end - + memblock_x86_hole_size(end, dma32_end) < FAKE_NODE_MIN_SIZE) + end = dma32_end; + + /* + * If there won't be enough non-reserved memory for the + * next node, this one must extend to the end of the + * physical node. + */ + if (limit - end - + memblock_x86_hole_size(end, limit) < size) + end = limit; + + ret = emu_setup_memblk(ei, pi, nid++ % MAX_NUMNODES, + phys_blk, + min(end, limit) - start); + if (ret < 0) + return ret; + } + } + return 0; +} + +/* + * Sets up the system RAM area from start_pfn to last_pfn according to the + * numa=fake command-line option. + */ +void __init numa_emulation(struct numa_meminfo *numa_meminfo, int numa_dist_cnt) +{ + static struct numa_meminfo ei __initdata; + static struct numa_meminfo pi __initdata; + const u64 max_addr = max_pfn << PAGE_SHIFT; + u8 *phys_dist = NULL; + int i, j, ret; + + if (!emu_cmdline) + goto no_emu; + + memset(&ei, 0, sizeof(ei)); + pi = *numa_meminfo; + + for (i = 0; i < MAX_NUMNODES; i++) + emu_nid_to_phys[i] = NUMA_NO_NODE; + + /* + * If the numa=fake command-line contains a 'M' or 'G', it represents + * the fixed node size. Otherwise, if it is just a single number N, + * split the system RAM into N fake nodes. + */ + if (strchr(emu_cmdline, 'M') || strchr(emu_cmdline, 'G')) { + u64 size; + + size = memparse(emu_cmdline, &emu_cmdline); + ret = split_nodes_size_interleave(&ei, &pi, 0, max_addr, size); + } else { + unsigned long n; + + n = simple_strtoul(emu_cmdline, NULL, 0); + ret = split_nodes_interleave(&ei, &pi, 0, max_addr, n); + } + + if (ret < 0) + goto no_emu; + + if (numa_cleanup_meminfo(&ei) < 0) { + pr_warning("NUMA: Warning: constructed meminfo invalid, disabling emulation\n"); + goto no_emu; + } + + /* + * Copy the original distance table. It's temporary so no need to + * reserve it. 
+ */ + if (numa_dist_cnt) { + size_t size = numa_dist_cnt * sizeof(phys_dist[0]); + u64 phys; + + phys = memblock_find_in_range(0, + (u64)max_pfn_mapped << PAGE_SHIFT, + size, PAGE_SIZE); + if (phys == MEMBLOCK_ERROR) { + pr_warning("NUMA: Warning: can't allocate copy of distance table, disabling emulation\n"); + goto no_emu; + } + phys_dist = __va(phys); + + for (i = 0; i < numa_dist_cnt; i++) + for (j = 0; j < numa_dist_cnt; j++) + phys_dist[i * numa_dist_cnt + j] = + node_distance(i, j); + } + + /* commit */ + *numa_meminfo = ei; + + /* + * Transform __apicid_to_node table to use emulated nids by + * reverse-mapping phys_nid. The maps should always exist but fall + * back to zero just in case. + */ + for (i = 0; i < ARRAY_SIZE(__apicid_to_node); i++) { + if (__apicid_to_node[i] == NUMA_NO_NODE) + continue; + for (j = 0; j < ARRAY_SIZE(emu_nid_to_phys); j++) + if (__apicid_to_node[i] == emu_nid_to_phys[j]) + break; + __apicid_to_node[i] = j < ARRAY_SIZE(emu_nid_to_phys) ? j : 0; + } + + /* make sure all emulated nodes are mapped to a physical node */ + for (i = 0; i < ARRAY_SIZE(emu_nid_to_phys); i++) + if (emu_nid_to_phys[i] == NUMA_NO_NODE) + emu_nid_to_phys[i] = 0; + + /* transform distance table */ + numa_reset_distance(); + for (i = 0; i < MAX_NUMNODES; i++) { + for (j = 0; j < MAX_NUMNODES; j++) { + int physi = emu_nid_to_phys[i]; + int physj = emu_nid_to_phys[j]; + int dist; + + if (physi >= numa_dist_cnt || physj >= numa_dist_cnt) + dist = physi == physj ? + LOCAL_DISTANCE : REMOTE_DISTANCE; + else + dist = phys_dist[physi * numa_dist_cnt + physj]; + + numa_set_distance(i, j, dist); + } + } + return; + +no_emu: + /* No emulation. Build identity emu_nid_to_phys[] for numa_add_cpu() */ + for (i = 0; i < ARRAY_SIZE(emu_nid_to_phys); i++) + emu_nid_to_phys[i] = i; +} + +#ifndef CONFIG_DEBUG_PER_CPU_MAPS +void __cpuinit numa_add_cpu(int cpu) +{ + int physnid, nid; + + nid = numa_cpu_node(cpu); + if (nid == NUMA_NO_NODE) + nid = early_cpu_to_node(cpu); + BUG_ON(nid == NUMA_NO_NODE || !node_online(nid)); + + physnid = emu_nid_to_phys[nid]; + + /* + * Map the cpu to each emulated node that is allocated on the physical + * node of the cpu's apic id. 
+ */ + for_each_online_node(nid) + if (emu_nid_to_phys[nid] == physnid) + cpumask_set_cpu(cpu, node_to_cpumask_map[nid]); +} + +void __cpuinit numa_remove_cpu(int cpu) +{ + int i; + + for_each_online_node(i) + cpumask_clear_cpu(cpu, node_to_cpumask_map[i]); +} +#else /* !CONFIG_DEBUG_PER_CPU_MAPS */ +static void __cpuinit numa_set_cpumask(int cpu, int enable) +{ + struct cpumask *mask; + int nid, physnid, i; + + nid = early_cpu_to_node(cpu); + if (nid == NUMA_NO_NODE) { + /* early_cpu_to_node() already emits a warning and trace */ + return; + } + + physnid = emu_nid_to_phys[nid]; + + for_each_online_node(i) { + if (emu_nid_to_phys[nid] != physnid) + continue; + + mask = debug_cpumask_set_cpu(cpu, enable); + if (!mask) + return; + + if (enable) + cpumask_set_cpu(cpu, mask); + else + cpumask_clear_cpu(cpu, mask); + } +} + +void __cpuinit numa_add_cpu(int cpu) +{ + numa_set_cpumask(cpu, 1); +} + +void __cpuinit numa_remove_cpu(int cpu) +{ + numa_set_cpumask(cpu, 0); +} +#endif /* !CONFIG_DEBUG_PER_CPU_MAPS */ diff --git a/arch/x86/mm/numa_internal.h b/arch/x86/mm/numa_internal.h new file mode 100644 index 000000000000..ef2d97377d7c --- /dev/null +++ b/arch/x86/mm/numa_internal.h @@ -0,0 +1,31 @@ +#ifndef __X86_MM_NUMA_INTERNAL_H +#define __X86_MM_NUMA_INTERNAL_H + +#include +#include + +struct numa_memblk { + u64 start; + u64 end; + int nid; +}; + +struct numa_meminfo { + int nr_blks; + struct numa_memblk blk[NR_NODE_MEMBLKS]; +}; + +void __init numa_remove_memblk_from(int idx, struct numa_meminfo *mi); +int __init numa_cleanup_meminfo(struct numa_meminfo *mi); +void __init numa_reset_distance(void); + +#ifdef CONFIG_NUMA_EMU +void __init numa_emulation(struct numa_meminfo *numa_meminfo, + int numa_dist_cnt); +#else +static inline void numa_emulation(struct numa_meminfo *numa_meminfo, + int numa_dist_cnt) +{ } +#endif + +#endif /* __X86_MM_NUMA_INTERNAL_H */ From 90e6b677b47ff8c5ba1637941af6b9f92723b003 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 22 Feb 2011 11:10:08 +0100 Subject: [PATCH 409/706] x86-64, NUMA: Add proper function comments to global functions Signed-off-by: Tejun Heo Cc: Yinghai Lu Cc: Ingo Molnar --- arch/x86/mm/numa_64.c | 50 +++++++++++++++++++++++++++++++----- arch/x86/mm/numa_emulation.c | 29 ++++++++++++++++++--- 2 files changed, 69 insertions(+), 10 deletions(-) diff --git a/arch/x86/mm/numa_64.c b/arch/x86/mm/numa_64.c index 45a361b16a59..848381bdd358 100644 --- a/arch/x86/mm/numa_64.c +++ b/arch/x86/mm/numa_64.c @@ -205,6 +205,14 @@ static int __init numa_add_memblk_to(int nid, u64 start, u64 end, return 0; } +/** + * numa_remove_memblk_from - Remove one numa_memblk from a numa_meminfo + * @idx: Index of memblk to remove + * @mi: numa_meminfo to remove memblk from + * + * Remove @idx'th numa_memblk from @mi by shifting @mi->blk[] and + * decrementing @mi->nr_blks. + */ void __init numa_remove_memblk_from(int idx, struct numa_meminfo *mi) { mi->nr_blks--; @@ -212,6 +220,17 @@ void __init numa_remove_memblk_from(int idx, struct numa_meminfo *mi) (mi->nr_blks - idx) * sizeof(mi->blk[0])); } +/** + * numa_add_memblk - Add one numa_memblk to numa_meminfo + * @nid: NUMA node ID of the new memblk + * @start: Start address of the new memblk + * @end: End address of the new memblk + * + * Add a new memblk to the default numa_meminfo. + * + * RETURNS: + * 0 on success, -errno on failure. 
+ */ int __init numa_add_memblk(int nid, u64 start, u64 end) { return numa_add_memblk_to(nid, start, end, &numa_meminfo); @@ -263,6 +282,16 @@ setup_node_bootmem(int nodeid, unsigned long start, unsigned long end) node_set_online(nodeid); } +/** + * numa_cleanup_meminfo - Cleanup a numa_meminfo + * @mi: numa_meminfo to clean up + * + * Sanitize @mi by merging and removing unncessary memblks. Also check for + * conflicts and clear unused memblks. + * + * RETURNS: + * 0 on success, -errno on failure. + */ int __init numa_cleanup_meminfo(struct numa_meminfo *mi) { const u64 low = 0; @@ -353,9 +382,11 @@ static void __init numa_nodemask_from_meminfo(nodemask_t *nodemask, node_set(mi->blk[i].nid, *nodemask); } -/* - * Reset distance table. The current table is freed. The next - * numa_set_distance() call will create a new one. +/** + * numa_reset_distance - Reset NUMA distance table + * + * The current table is freed. The next numa_set_distance() call will + * create a new one. */ void __init numa_reset_distance(void) { @@ -370,10 +401,15 @@ void __init numa_reset_distance(void) numa_distance = NULL; } -/* - * Set the distance between node @from to @to to @distance. If distance - * table doesn't exist, one which is large enough to accomodate all the - * currently known nodes will be created. +/** + * numa_set_distance - Set NUMA distance from one NUMA to another + * @from: the 'from' node to set distance + * @to: the 'to' node to set distance + * @distance: NUMA distance + * + * Set the distance from node @from to @to to @distance. If distance table + * doesn't exist, one which is large enough to accomodate all the currently + * known nodes will be created. */ void __init numa_set_distance(int from, int to, int distance) { diff --git a/arch/x86/mm/numa_emulation.c b/arch/x86/mm/numa_emulation.c index 23fa2d00253a..607a2e8bc87a 100644 --- a/arch/x86/mm/numa_emulation.c +++ b/arch/x86/mm/numa_emulation.c @@ -267,9 +267,32 @@ static int __init split_nodes_size_interleave(struct numa_meminfo *ei, return 0; } -/* - * Sets up the system RAM area from start_pfn to last_pfn according to the - * numa=fake command-line option. +/** + * numa_emulation - Emulate NUMA nodes + * @numa_meminfo: NUMA configuration to massage + * @numa_dist_cnt: The size of the physical NUMA distance table + * + * Emulate NUMA nodes according to the numa=fake kernel parameter. + * @numa_meminfo contains the physical memory configuration and is modified + * to reflect the emulated configuration on success. @numa_dist_cnt is + * used to determine the size of the physical distance table. + * + * On success, the following modifications are made. + * + * - @numa_meminfo is updated to reflect the emulated nodes. + * + * - __apicid_to_node[] is updated such that APIC IDs are mapped to the + * emulated nodes. + * + * - NUMA distance table is rebuilt to represent distances between emulated + * nodes. The distances are determined considering how emulated nodes + * are mapped to physical nodes and match the actual distances. + * + * - emu_nid_to_phys[] reflects how emulated nodes are mapped to physical + * nodes. This is used by numa_add_cpu() and numa_remove_cpu(). + * + * If emulation is not enabled or fails, emu_nid_to_phys[] is filled with + * identity mapping and no other modification is made. 
*/ void __init numa_emulation(struct numa_meminfo *numa_meminfo, int numa_dist_cnt) { From 2bf50555b0920be7e29d3823f6bbd20ee5920489 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Tue, 22 Feb 2011 11:18:49 +0100 Subject: [PATCH 410/706] x86-64, NUMA: Separate out numa_alloc_distance() from numa_set_distance() The alloc code is much bigger than the distance-setting code. Separate it out into numa_alloc_distance() for readability. -v2: Let alloc_numa_distance return -ENOMEM on the failing path, as requested by tj. -tj: Description update. Minor tweaks including function name, location and return value check. Signed-off-by: Yinghai Lu Acked-by: David Rientjes Signed-off-by: Tejun Heo --- arch/x86/mm/numa_64.c | 75 +++++++++++++++++++++++-------------------- 1 file changed, 40 insertions(+), 35 deletions(-) diff --git a/arch/x86/mm/numa_64.c b/arch/x86/mm/numa_64.c index 848381bdd358..cccc01d8415c 100644 --- a/arch/x86/mm/numa_64.c +++ b/arch/x86/mm/numa_64.c @@ -401,6 +401,44 @@ void __init numa_reset_distance(void) numa_distance = NULL; } +static int __init numa_alloc_distance(void) +{ + nodemask_t nodes_parsed; + size_t size; + int i, j, cnt = 0; + u64 phys; + + /* size the new table and allocate it */ + nodes_parsed = numa_nodes_parsed; + numa_nodemask_from_meminfo(&nodes_parsed, &numa_meminfo); + + for_each_node_mask(i, nodes_parsed) + cnt = i; + size = ++cnt * sizeof(numa_distance[0]); + + phys = memblock_find_in_range(0, (u64)max_pfn_mapped << PAGE_SHIFT, + size, PAGE_SIZE); + if (phys == MEMBLOCK_ERROR) { + pr_warning("NUMA: Warning: can't allocate distance table!\n"); + /* don't retry until explicitly reset */ + numa_distance = (void *)1LU; + return -ENOMEM; + } + memblock_x86_reserve_range(phys, phys + size, "NUMA DIST"); + + numa_distance = __va(phys); + numa_distance_cnt = cnt; + + /* fill with the default distances */ + for (i = 0; i < cnt; i++) + for (j = 0; j < cnt; j++) + numa_distance[i * cnt + j] = i == j ? + LOCAL_DISTANCE : REMOTE_DISTANCE; + printk(KERN_DEBUG "NUMA: Initialized distance table, cnt=%d\n", cnt); + + return 0; +} + /** * numa_set_distance - Set NUMA distance from one NUMA to another * @from: the 'from' node to set distance @@ -413,41 +451,8 @@ void __init numa_reset_distance(void) */ void __init numa_set_distance(int from, int to, int distance) { - if (!numa_distance) { - nodemask_t nodes_parsed; - size_t size; - int i, j, cnt = 0; - u64 phys; - - /* size the new table and allocate it */ - nodes_parsed = numa_nodes_parsed; - numa_nodemask_from_meminfo(&nodes_parsed, &numa_meminfo); - - for_each_node_mask(i, nodes_parsed) - cnt = i; - size = ++cnt * sizeof(numa_distance[0]); - - phys = memblock_find_in_range(0, - (u64)max_pfn_mapped << PAGE_SHIFT, - size, PAGE_SIZE); - if (phys == MEMBLOCK_ERROR) { - pr_warning("NUMA: Warning: can't allocate distance table!\n"); - /* don't retry until explicitly reset */ - numa_distance = (void *)1LU; - return; - } - memblock_x86_reserve_range(phys, phys + size, "NUMA DIST"); - - numa_distance = __va(phys); - numa_distance_cnt = cnt; - - /* fill with the default distances */ - for (i = 0; i < cnt; i++) - for (j = 0; j < cnt; j++) - numa_distance[i * cnt + j] = i == j ?
- LOCAL_DISTANCE : REMOTE_DISTANCE; - printk(KERN_DEBUG "NUMA: Initialized distance table, cnt=%d\n", cnt); - } + if (!numa_distance && numa_alloc_distance() < 0) + return; if (from >= numa_distance_cnt || to >= numa_distance_cnt) { printk_once(KERN_DEBUG "NUMA: Debug: distance out of bound, from=%d to=%d distance=%d\n", From 70433c01613c2a44756c7b25f7bdd6c1c77b119f Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 22 Feb 2011 12:50:12 +0100 Subject: [PATCH 411/706] genirq: Use the correct variable for note_interrupt note_interrupt wants to be called with the combined result of all handlers called, not with the last one. If it's a shared interrupt then the last handler might return IRQ_NONE often enough to trigger the spurious detector, which turns off a perfectly working interrupt line. Bug was introduced in commit 1277a532 (genirq: Simplify handle_irq_event()). Yes, I really messed up there. First, the variable ret should have been named differently to avoid similarity with retval. Second, it should have been declared inside the do {} loop. Rename it to res, move it into the do {} loop, and vanish under a huge brown paperbag. Reported-bisected-tested-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- kernel/irq/handle.c | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c index cb62e2d0df4e..e099e9e9de0b 100644 --- a/kernel/irq/handle.c +++ b/kernel/irq/handle.c @@ -54,24 +54,26 @@ static void warn_no_thread(unsigned int irq, struct irqaction *action) irqreturn_t handle_irq_event_percpu(struct irq_desc *desc, struct irqaction *action) { - irqreturn_t ret, retval = IRQ_NONE; + irqreturn_t retval = IRQ_NONE; unsigned int random = 0, irq = desc->irq_data.irq; do { + irqreturn_t res; + trace_irq_handler_entry(irq, action); - ret = action->handler(irq, action->dev_id); - trace_irq_handler_exit(irq, action, ret); + res = action->handler(irq, action->dev_id); + trace_irq_handler_exit(irq, action, res); if (WARN_ON_ONCE(!irqs_disabled())) local_irq_disable(); - switch (ret) { + switch (res) { case IRQ_WAKE_THREAD: /* * Set result to handled so the spurious check * does not trigger. */ - ret = IRQ_HANDLED; + res = IRQ_HANDLED; /* * Catch drivers which return WAKE_THREAD but @@ -105,7 +107,7 @@ handle_irq_event_percpu(struct irq_desc *desc, struct irqaction *action) break; } - retval |= ret; + retval |= res; action = action->next; } while (action); @@ -113,7 +115,7 @@ handle_irq_event_percpu(struct irq_desc *desc, struct irqaction *action) add_interrupt_randomness(irq); if (!noirqdebug) - note_interrupt(irq, desc, ret); + note_interrupt(irq, desc, retval); return retval; } From c97cf42219b7b6037d2f96c27a5f114f2383f828 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Tue, 22 Feb 2011 12:02:07 -0300 Subject: [PATCH 412/706] perf top: Live TUI Annotation Now one just has to press the right key, 'a' or Enter, on the main 'perf top --tui' screen to live-annotate the symbol under the cursor. The annotate window starts centered on the hottest line (the one with the most samples so far); TAB and shift+TAB can then be used to go to the previous/next hot line. Pressing 'H' at any point will re-center the screen on the hottest line.
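The live behaviour hangs off a single mechanism in the hunks below: when a refresh interval is passed in, the browser form gets a periodic timer, and the timer event (surfacing as key == -1 here) decays the histogram and recomputes the hot lines, so the annotation stays current without a second thread. A condensed sketch of that loop, simplified from the patch with error paths omitted:

	if (refresh != 0)
		newtFormSetTimer(self->b.form, refresh);

	while (1) {
		key = ui_browser__run(&self->b);

		if (key == -1) {	/* timer tick, not a keypress */
			symbol__annotate_decay_histogram(sym, evidx);
			annotate_browser__calc_percent(self, evidx);
			continue;
		}
		/* ... TAB/shift+TAB/'H' move among the hot lines ... */
	}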
Cc: Frederic Weisbecker Cc: Ingo Molnar Cc: Mike Galbraith Cc: Paul Mackerras Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Tom Zanussi LKML-Reference: Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-top.c | 36 +++---- tools/perf/util/annotate.h | 6 +- tools/perf/util/top.h | 1 + tools/perf/util/ui/browsers/annotate.c | 126 ++++++++++++++++++------- tools/perf/util/ui/browsers/top.c | 45 ++++++++- 5 files changed, 156 insertions(+), 58 deletions(-) diff --git a/tools/perf/builtin-top.c b/tools/perf/builtin-top.c index c9fd66d4a082..f88a2630e1fc 100644 --- a/tools/perf/builtin-top.c +++ b/tools/perf/builtin-top.c @@ -92,7 +92,6 @@ static bool dump_symtab = false; static struct winsize winsize; static const char *sym_filter = NULL; -struct sym_entry *sym_filter_entry = NULL; struct sym_entry *sym_filter_entry_sched = NULL; static int sym_pcnt_filter = 5; @@ -168,18 +167,19 @@ static int parse_source(struct sym_entry *syme) pthread_mutex_lock(¬es->lock); if (symbol__alloc_hist(sym, top.evlist->nr_entries) < 0) { + pthread_mutex_unlock(¬es->lock); pr_err("Not enough memory for annotating '%s' symbol!\n", sym->name); sleep(1); - goto out_unlock; + return err; } err = symbol__annotate(sym, syme->map, 0); if (err == 0) { out_assign: - sym_filter_entry = syme; + top.sym_filter_entry = syme; } -out_unlock: + pthread_mutex_unlock(¬es->lock); return err; } @@ -195,7 +195,7 @@ static void record_precise_ip(struct sym_entry *syme, int counter, u64 ip) struct annotation *notes; struct symbol *sym; - if (syme != sym_filter_entry) + if (syme != top.sym_filter_entry) return; sym = sym_entry__symbol(syme); @@ -275,8 +275,8 @@ static void print_sym_table(struct perf_session *session) session->hists.stats.total_lost); } - if (sym_filter_entry) { - show_details(sym_filter_entry); + if (top.sym_filter_entry) { + show_details(top.sym_filter_entry); return; } @@ -417,8 +417,8 @@ static void print_mapped_keys(void) { char *name = NULL; - if (sym_filter_entry) { - struct symbol *sym = sym_entry__symbol(sym_filter_entry); + if (top.sym_filter_entry) { + struct symbol *sym = sym_entry__symbol(top.sym_filter_entry); name = sym->name; } @@ -549,15 +549,15 @@ static void handle_keypress(struct perf_session *session, int c) perf_session__fprintf_dsos(session, stderr); exit(0); case 's': - prompt_symbol(&sym_filter_entry, "Enter details symbol"); + prompt_symbol(&top.sym_filter_entry, "Enter details symbol"); break; case 'S': - if (!sym_filter_entry) + if (!top.sym_filter_entry) break; else { - struct sym_entry *syme = sym_filter_entry; + struct sym_entry *syme = top.sym_filter_entry; - sym_filter_entry = NULL; + top.sym_filter_entry = NULL; __zero_source_counters(syme); } break; @@ -656,7 +656,7 @@ static int symbol_filter(struct map *map, struct symbol *sym) syme->map = map; symbol__annotate_init(map, sym); - if (!sym_filter_entry && sym_filter && !strcmp(name, sym_filter)) { + if (!top.sym_filter_entry && sym_filter && !strcmp(name, sym_filter)) { /* schedule initial sym_filter_entry setup */ sym_filter_entry_sched = syme; sym_filter = NULL; @@ -750,13 +750,13 @@ static void perf_event__process_sample(const union perf_event *event, /* let's see, whether we need to install initial sym_filter_entry */ if (sym_filter_entry_sched) { - sym_filter_entry = sym_filter_entry_sched; + top.sym_filter_entry = sym_filter_entry_sched; sym_filter_entry_sched = NULL; - if (parse_source(sym_filter_entry) < 0) { - struct symbol *sym = sym_entry__symbol(sym_filter_entry); + if (parse_source(top.sym_filter_entry) < 0) { + 
struct symbol *sym = sym_entry__symbol(top.sym_filter_entry); pr_err("Can't annotate %s", sym->name); - if (sym_filter_entry->map->dso->origin == DSO__ORIG_KERNEL) { + if (top.sym_filter_entry->map->dso->origin == DSO__ORIG_KERNEL) { pr_err(": No vmlinux file was found in the path:\n"); machine__fprintf_vmlinux_path(machine, stderr); } else diff --git a/tools/perf/util/annotate.h b/tools/perf/util/annotate.h index e848803fcd48..c2c286896801 100644 --- a/tools/perf/util/annotate.h +++ b/tools/perf/util/annotate.h @@ -90,12 +90,14 @@ int symbol__tty_annotate(struct symbol *sym, struct map *map, int evidx, #ifdef NO_NEWT_SUPPORT static inline int symbol__tui_annotate(struct symbol *sym __used, - struct map *map __used, int evidx __used) + struct map *map __used, + int evidx __used, int refresh __used) { return 0; } #else -int symbol__tui_annotate(struct symbol *sym, struct map *map, int evidx); +int symbol__tui_annotate(struct symbol *sym, struct map *map, int evidx, + int refresh); #endif #endif /* __PERF_ANNOTATE_H */ diff --git a/tools/perf/util/top.h b/tools/perf/util/top.h index 4f769f47e19a..e8d28e2ecdba 100644 --- a/tools/perf/util/top.h +++ b/tools/perf/util/top.h @@ -44,6 +44,7 @@ struct perf_top { pid_t target_pid, target_tid; bool hide_kernel_symbols, hide_user_symbols, zero; const char *cpu_list; + struct sym_entry *sym_filter_entry; struct perf_evsel *sym_evsel; }; diff --git a/tools/perf/util/ui/browsers/annotate.c b/tools/perf/util/ui/browsers/annotate.c index cfb5a27f15d2..8c17a8730e4a 100644 --- a/tools/perf/util/ui/browsers/annotate.c +++ b/tools/perf/util/ui/browsers/annotate.c @@ -6,6 +6,7 @@ #include "../../sort.h" #include "../../symbol.h" #include "../../annotate.h" +#include static void ui__error_window(const char *fmt, ...) { @@ -138,46 +139,108 @@ static void annotate_browser__set_top(struct annotate_browser *self, self->curr_hot = nd; } -static int annotate_browser__run(struct annotate_browser *self) +static void annotate_browser__calc_percent(struct annotate_browser *browser, + int evidx) { - struct rb_node *nd; + struct symbol *sym = browser->b.priv; + struct annotation *notes = symbol__annotation(sym); + struct objdump_line *pos; + + browser->entries = RB_ROOT; + + pthread_mutex_lock(¬es->lock); + + list_for_each_entry(pos, ¬es->src->source, node) { + struct objdump_line_rb_node *rbpos = objdump_line__rb(pos); + rbpos->percent = objdump_line__calc_percent(pos, sym, evidx); + if (rbpos->percent < 0.01) { + RB_CLEAR_NODE(&rbpos->rb_node); + continue; + } + objdump__insert_line(&browser->entries, rbpos); + } + pthread_mutex_unlock(¬es->lock); + + browser->curr_hot = rb_last(&browser->entries); +} + +static int annotate_browser__run(struct annotate_browser *self, int evidx, + int refresh) +{ + struct rb_node *nd = NULL; struct symbol *sym = self->b.priv; + /* + * RIGHT To allow builtin-annotate to cycle thru multiple symbols by + * examining the exit key for this function. + */ + int exit_keys[] = { 'H', NEWT_KEY_TAB, NEWT_KEY_UNTAB, + NEWT_KEY_RIGHT, 0 }; int key; if (ui_browser__show(&self->b, sym->name, - "<-, -> or ESC: exit, TAB/shift+TAB: cycle thru samples") < 0) + "<-, -> or ESC: exit, TAB/shift+TAB: " + "cycle hottest lines, H: Hottest") < 0) return -1; - /* - * To allow builtin-annotate to cycle thru multiple symbols by - * examining the exit key for this function. 
- */ - ui_browser__add_exit_key(&self->b, NEWT_KEY_RIGHT); + + ui_browser__add_exit_keys(&self->b, exit_keys); + annotate_browser__calc_percent(self, evidx); + + if (self->curr_hot) + annotate_browser__set_top(self, self->curr_hot); nd = self->curr_hot; - if (nd) { - int tabs[] = { NEWT_KEY_TAB, NEWT_KEY_UNTAB, 0 }; - ui_browser__add_exit_keys(&self->b, tabs); - } + + if (refresh != 0) + newtFormSetTimer(self->b.form, refresh); while (1) { key = ui_browser__run(&self->b); + if (refresh != 0) { + annotate_browser__calc_percent(self, evidx); + /* + * Current line focus got out of the list of most active + * lines, NULL it so that if TAB|UNTAB is pressed, we + * move to curr_hot (current hottest line). + */ + if (nd != NULL && RB_EMPTY_NODE(nd)) + nd = NULL; + } + switch (key) { + case -1: + /* + * FIXME we need to check if it was + * es.reason == NEWT_EXIT_TIMER + */ + if (refresh != 0) + symbol__annotate_decay_histogram(sym, evidx); + continue; case NEWT_KEY_TAB: - nd = rb_prev(nd); - if (nd == NULL) - nd = rb_last(&self->entries); - annotate_browser__set_top(self, nd); + if (nd != NULL) { + nd = rb_prev(nd); + if (nd == NULL) + nd = rb_last(&self->entries); + } else + nd = self->curr_hot; break; case NEWT_KEY_UNTAB: - nd = rb_next(nd); - if (nd == NULL) - nd = rb_first(&self->entries); - annotate_browser__set_top(self, nd); + if (nd != NULL) + nd = rb_next(nd); + if (nd == NULL) + nd = rb_first(&self->entries); + else + nd = self->curr_hot; + break; + case 'H': + nd = self->curr_hot; break; default: goto out; } + + if (nd != NULL) + annotate_browser__set_top(self, nd); } out: ui_browser__hide(&self->b); @@ -186,13 +249,13 @@ out: int hist_entry__tui_annotate(struct hist_entry *he, int evidx) { - return symbol__tui_annotate(he->ms.sym, he->ms.map, evidx); + return symbol__tui_annotate(he->ms.sym, he->ms.map, evidx, 0); } -int symbol__tui_annotate(struct symbol *sym, struct map *map, int evidx) +int symbol__tui_annotate(struct symbol *sym, struct map *map, int evidx, + int refresh) { struct objdump_line *pos, *n; - struct objdump_line_rb_node *rbpos; struct annotation *notes = symbol__annotation(sym); struct annotate_browser browser = { .b = { @@ -211,7 +274,7 @@ int symbol__tui_annotate(struct symbol *sym, struct map *map, int evidx) if (map->dso->annotate_warned) return -1; - if (symbol__annotate(sym, map, sizeof(*rbpos)) < 0) { + if (symbol__annotate(sym, map, sizeof(struct objdump_line_rb_node)) < 0) { ui__error_window(ui_helpline__last_msg); return -1; } @@ -219,26 +282,17 @@ int symbol__tui_annotate(struct symbol *sym, struct map *map, int evidx) ui_helpline__push("Press <- or ESC to exit"); list_for_each_entry(pos, ¬es->src->source, node) { + struct objdump_line_rb_node *rbpos; size_t line_len = strlen(pos->line); + if (browser.b.width < line_len) browser.b.width = line_len; rbpos = objdump_line__rb(pos); rbpos->idx = browser.b.nr_entries++; - rbpos->percent = objdump_line__calc_percent(pos, sym, evidx); - if (rbpos->percent < 0.01) - continue; - objdump__insert_line(&browser.entries, rbpos); } - /* - * Position the browser at the hottest line. 
- */ - browser.curr_hot = rb_last(&browser.entries); - if (browser.curr_hot) - annotate_browser__set_top(&browser, browser.curr_hot); - browser.b.width += 18; /* Percentage */ - ret = annotate_browser__run(&browser); + ret = annotate_browser__run(&browser, evidx, refresh); list_for_each_entry_safe(pos, n, ¬es->src->source, node) { list_del(&pos->node); objdump_line__free(pos); diff --git a/tools/perf/util/ui/browsers/top.c b/tools/perf/util/ui/browsers/top.c index ca6062483a8f..377ff58c9ae9 100644 --- a/tools/perf/util/ui/browsers/top.c +++ b/tools/perf/util/ui/browsers/top.c @@ -7,6 +7,7 @@ * Released under the GPL v2. (and only v2, not any later version) */ #include "../browser.h" +#include "../../annotate.h" #include "../helpline.h" #include "../libslang.h" #include "../../evlist.h" @@ -18,6 +19,7 @@ struct perf_top_browser { struct ui_browser b; struct rb_root root; + struct sym_entry *selection; float sum_ksamples; int dso_width; int dso_short_width; @@ -60,6 +62,9 @@ static void perf_top_browser__write(struct ui_browser *browser, void *entry, int slsmg_write_nstring(width >= syme->map->dso->long_name_len ? syme->map->dso->long_name : syme->map->dso->short_name, width); + + if (current_entry) + top_browser->selection = syme; } static void perf_top_browser__update_rb_tree(struct perf_top_browser *browser) @@ -80,21 +85,52 @@ static void perf_top_browser__update_rb_tree(struct perf_top_browser *browser) browser->b.nr_entries = top->rb_entries; } +static void perf_top_browser__annotate(struct perf_top_browser *browser) +{ + struct sym_entry *syme = browser->selection; + struct symbol *sym = sym_entry__symbol(syme); + struct annotation *notes = symbol__annotation(sym); + struct perf_top *top = browser->b.priv; + + if (notes->src != NULL) + goto do_annotation; + + pthread_mutex_lock(¬es->lock); + + top->sym_filter_entry = NULL; + + if (symbol__alloc_hist(sym, top->evlist->nr_entries) < 0) { + pr_err("Not enough memory for annotating '%s' symbol!\n", + sym->name); + pthread_mutex_unlock(¬es->lock); + return; + } + + top->sym_filter_entry = syme; + + pthread_mutex_unlock(¬es->lock); +do_annotation: + symbol__tui_annotate(sym, syme->map, 0, top->delay_secs * 1000); +} + static int perf_top_browser__run(struct perf_top_browser *browser) { int key; char title[160]; struct perf_top *top = browser->b.priv; int delay_msecs = top->delay_secs * 1000; + int exit_keys[] = { 'a', NEWT_KEY_ENTER, NEWT_KEY_RIGHT, 0, }; perf_top_browser__update_rb_tree(browser); perf_top__header_snprintf(top, title, sizeof(title)); perf_top__reset_sample_counters(top); - if (ui_browser__show(&browser->b, title, "ESC: exit") < 0) + if (ui_browser__show(&browser->b, title, + "ESC: exit, ENTER|->|a: Live Annotate") < 0) return -1; newtFormSetTimer(browser->b.form, delay_msecs); + ui_browser__add_exit_keys(&browser->b, exit_keys); while (1) { key = ui_browser__run(&browser->b); @@ -109,7 +145,12 @@ static int perf_top_browser__run(struct perf_top_browser *browser) SLsmg_gotorc(0, 0); slsmg_write_nstring(title, browser->b.width); break; - case NEWT_KEY_TAB: + case 'a': + case NEWT_KEY_RIGHT: + case NEWT_KEY_ENTER: + if (browser->selection) + perf_top_browser__annotate(browser); + break; default: goto out; } From 2f14ddc3a7146ea4cd5a3d1ecd993f85f2e4f948 Mon Sep 17 00:00:00 2001 From: "Zhang, Fengzhe" Date: Wed, 16 Feb 2011 22:26:20 +0800 Subject: [PATCH 413/706] xen/setup: Inhibit resource API from using System RAM E820 gaps as PCI mem gaps. 
With the hypervisor argument of dom0_mem=X we iterate over the physical (only for the initial domain) E820 and subtract from each E820_RAM region a delta so that the cumulative size of all E820_RAM regions is equal to 'X'. This sometimes ends up with E820_RAM regions with zero size (which are removed by e820_sanitize) and E820_RAM regions that are smaller than their physical counterparts. Later on the PCI API looks at the E820 and attempts to set up a resource region for the "PCI mem". The E820 (assume dom0_mem=1GB is set) compared to the physical one looks as follows: [ 0.000000] BIOS-provided physical RAM map: [ 0.000000] Xen: 0000000000000000 - 0000000000097c00 (usable) [ 0.000000] Xen: 0000000000097c00 - 0000000000100000 (reserved) -[ 0.000000] Xen: 0000000000100000 - 00000000defafe00 (usable) +[ 0.000000] Xen: 0000000000100000 - 0000000040000000 (usable) [ 0.000000] Xen: 00000000defafe00 - 00000000defb1ea0 (ACPI NVS) [ 0.000000] Xen: 00000000defb1ea0 - 00000000e0000000 (reserved) [ 0.000000] Xen: 00000000f4000000 - 00000000f8000000 (reserved) .. And we get [ 0.000000] Allocating PCI resources starting at 40000000 (gap: 40000000:9efafe00) while it should have started at e0000000 (a nice big gap up to f4000000 exists). The "Allocating PCI" is part of the resource API. The users that end up using those PCI I/O regions usually supply their own BARs when calling the resource API (request_resource or allocate_resource), but there are exceptions which provide an empty 'struct resource' and expect the API to populate the 'struct resource' with valid values. The one that triggered this bug was the Intel AGP driver, which requested a region for the flush page (intel_i9xx_setup_flush). Before this patch, when running under the Xen hypervisor, the 'struct resource' returned could have (depending on the dom0_mem size) physical ranges of 'System RAM' instead of 'I/O' regions. This ended up with the hypervisor failing a request to populate PTEs with those PFNs, as the domain did not have access to those 'System RAM' regions (rightly so). After this patch, the left-over E820_RAM region from the truncation will be labeled as E820_UNUSABLE. The E820 will look as follows: [ 0.000000] BIOS-provided physical RAM map: [ 0.000000] Xen: 0000000000000000 - 0000000000097c00 (usable) [ 0.000000] Xen: 0000000000097c00 - 0000000000100000 (reserved) -[ 0.000000] Xen: 0000000000100000 - 00000000defafe00 (usable) +[ 0.000000] Xen: 0000000000100000 - 0000000040000000 (usable) +[ 0.000000] Xen: 0000000040000000 - 00000000defafe00 (unusable) [ 0.000000] Xen: 00000000defafe00 - 00000000defb1ea0 (ACPI NVS) [ 0.000000] Xen: 00000000defb1ea0 - 00000000e0000000 (reserved) [ 0.000000] Xen: 00000000f4000000 - 00000000f8000000 (reserved) For more information: http://mid.gmane.org/1A42CE6F5F474C41B63392A5F80372B2335E978C@shsmsx501.ccr.corp.intel.com BugLink: http://bugzilla.xensource.com/bugzilla/show_bug.cgi?id=1726 Signed-off-by: Fengzhe Zhang Signed-off-by: Konrad Rzeszutek Wilk --- arch/x86/xen/setup.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c index a8a66a50d446..2a4add9634ce 100644 --- a/arch/x86/xen/setup.c +++ b/arch/x86/xen/setup.c @@ -194,6 +194,14 @@ char * __init xen_memory_setup(void) end -= delta; extra_pages += PFN_DOWN(delta); + /* + * Set RAM below 4GB that is not for us to be unusable. + * This prevents "System RAM" address space from being + * used as potential resource for I/O address (happens + * when 'allocate_resource' is called).
+ */ + if (delta && end < 0x100000000UL) + e820_add_region(end, delta, E820_UNUSABLE); } if (map[i].size > 0 && end > xen_extra_mem_start) From dbebbfbb1605f0179e7c0d900d941cc9c45de569 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 22 Feb 2011 21:46:25 +0100 Subject: [PATCH 414/706] rtmutex: tester: Remove the remaining BKL leftovers We just leave the numbers assigned as commemoration, and in case someone is crazy enough to reimplement the test stuff out of tree. Signed-off-by: Thomas Gleixner --- kernel/rtmutex-tester.c | 5 ++--- scripts/rt-tester/rt-tester.py | 2 -- scripts/rt-tester/t2-l1-2rt-sameprio.tst | 5 ----- scripts/rt-tester/t2-l1-pi.tst | 5 ----- scripts/rt-tester/t2-l1-signal.tst | 5 ----- scripts/rt-tester/t2-l2-2rt-deadlock.tst | 5 ----- scripts/rt-tester/t3-l1-pi-1rt.tst | 5 ----- scripts/rt-tester/t3-l1-pi-2rt.tst | 5 ----- scripts/rt-tester/t3-l1-pi-3rt.tst | 5 ----- scripts/rt-tester/t3-l1-pi-signal.tst | 5 ----- scripts/rt-tester/t3-l1-pi-steal.tst | 5 ----- scripts/rt-tester/t3-l2-pi.tst | 5 ----- scripts/rt-tester/t4-l2-pi-deboost.tst | 5 ----- scripts/rt-tester/t5-l4-pi-boost-deboost-setsched.tst | 5 ----- scripts/rt-tester/t5-l4-pi-boost-deboost.tst | 5 ----- 15 files changed, 2 insertions(+), 70 deletions(-) diff --git a/kernel/rtmutex-tester.c b/kernel/rtmutex-tester.c index d5b543506cbc..5c9ccd380966 100644 --- a/kernel/rtmutex-tester.c +++ b/kernel/rtmutex-tester.c @@ -44,9 +44,8 @@ enum test_opcodes { RTTEST_LOCKINTNOWAIT, /* 6 Lock interruptible no wait in wakeup, data = lockindex */ RTTEST_LOCKCONT, /* 7 Continue locking after the wakeup delay */ RTTEST_UNLOCK, /* 8 Unlock, data = lockindex */ - RTTEST_LOCKBKL, /* 9 Was: Lock BKL */ - RTTEST_UNLOCKBKL, /* 10 Was: Unlock BKL */ - RTTEST_SIGNAL, /* 11 Signal other test thread, data = thread id */ + /* 9, 10 - reserved for BKL commemoration */ + RTTEST_SIGNAL = 11, /* 11 Signal other test thread, data = thread id */ RTTEST_RESETEVENT = 98, /* 98 Reset event counter */ RTTEST_RESET = 99, /* 99 Reset all pending operations */ }; diff --git a/scripts/rt-tester/rt-tester.py b/scripts/rt-tester/rt-tester.py index 44423b4dcb82..8c81d76959ee 100644 --- a/scripts/rt-tester/rt-tester.py +++ b/scripts/rt-tester/rt-tester.py @@ -33,8 +33,6 @@ cmd_opcodes = { "lockintnowait" : "6", "lockcont" : "7", "unlock" : "8", - "lockbkl" : "9", - "unlockbkl" : "10", "signal" : "11", "resetevent" : "98", "reset" : "99", diff --git a/scripts/rt-tester/t2-l1-2rt-sameprio.tst b/scripts/rt-tester/t2-l1-2rt-sameprio.tst index 8821f27cc8be..3710c8b2090d 100644 --- a/scripts/rt-tester/t2-l1-2rt-sameprio.tst +++ b/scripts/rt-tester/t2-l1-2rt-sameprio.tst @@ -19,8 +19,6 @@ # lockintnowait lock nr (0-7) # lockcont lock nr (0-7) # unlock lock nr (0-7) -# lockbkl lock nr (0-7) -# unlockbkl lock nr (0-7) # signal 0 # reset 0 # resetevent 0 @@ -39,9 +37,6 @@ # blocked lock nr (0-7) # blockedwake lock nr (0-7) # unlocked lock nr (0-7) -# lockedbkl dont care -# blockedbkl dont care -# unlockedbkl dont care # opcodeeq command opcode or number # opcodelt number # opcodegt number diff --git a/scripts/rt-tester/t2-l1-pi.tst b/scripts/rt-tester/t2-l1-pi.tst index cde1f189a02b..b4cc95975adb 100644 --- a/scripts/rt-tester/t2-l1-pi.tst +++ b/scripts/rt-tester/t2-l1-pi.tst @@ -19,8 +19,6 @@ # lockintnowait lock nr (0-7) # lockcont lock nr (0-7) # unlock lock nr (0-7) -# lockbkl lock nr (0-7) -# unlockbkl lock nr (0-7) # signal 0 # reset 0 # resetevent 0 @@ -39,9 +37,6 @@ # blocked lock nr (0-7) # blockedwake lock nr (0-7) # unlocked lock nr
(0-7) -# lockedbkl dont care -# blockedbkl dont care -# unlockedbkl dont care # opcodeeq command opcode or number # opcodelt number # opcodegt number diff --git a/scripts/rt-tester/t2-l1-signal.tst b/scripts/rt-tester/t2-l1-signal.tst index 3ab0bfc49950..1b57376cc1f7 100644 --- a/scripts/rt-tester/t2-l1-signal.tst +++ b/scripts/rt-tester/t2-l1-signal.tst @@ -19,8 +19,6 @@ # lockintnowait lock nr (0-7) # lockcont lock nr (0-7) # unlock lock nr (0-7) -# lockbkl lock nr (0-7) -# unlockbkl lock nr (0-7) # signal 0 # reset 0 # resetevent 0 @@ -39,9 +37,6 @@ # blocked lock nr (0-7) # blockedwake lock nr (0-7) # unlocked lock nr (0-7) -# lockedbkl dont care -# blockedbkl dont care -# unlockedbkl dont care # opcodeeq command opcode or number # opcodelt number # opcodegt number diff --git a/scripts/rt-tester/t2-l2-2rt-deadlock.tst b/scripts/rt-tester/t2-l2-2rt-deadlock.tst index f4b5d5d6215f..68b10629b6f4 100644 --- a/scripts/rt-tester/t2-l2-2rt-deadlock.tst +++ b/scripts/rt-tester/t2-l2-2rt-deadlock.tst @@ -19,8 +19,6 @@ # lockintnowait lock nr (0-7) # lockcont lock nr (0-7) # unlock lock nr (0-7) -# lockbkl lock nr (0-7) -# unlockbkl lock nr (0-7) # signal 0 # reset 0 # resetevent 0 @@ -39,9 +37,6 @@ # blocked lock nr (0-7) # blockedwake lock nr (0-7) # unlocked lock nr (0-7) -# lockedbkl dont care -# blockedbkl dont care -# unlockedbkl dont care # opcodeeq command opcode or number # opcodelt number # opcodegt number diff --git a/scripts/rt-tester/t3-l1-pi-1rt.tst b/scripts/rt-tester/t3-l1-pi-1rt.tst index 63440ca2cce9..8e6c8b11ae56 100644 --- a/scripts/rt-tester/t3-l1-pi-1rt.tst +++ b/scripts/rt-tester/t3-l1-pi-1rt.tst @@ -19,8 +19,6 @@ # lockintnowait lock nr (0-7) # lockcont lock nr (0-7) # unlock lock nr (0-7) -# lockbkl lock nr (0-7) -# unlockbkl lock nr (0-7) # signal thread to signal (0-7) # reset 0 # resetevent 0 @@ -39,9 +37,6 @@ # blocked lock nr (0-7) # blockedwake lock nr (0-7) # unlocked lock nr (0-7) -# lockedbkl dont care -# blockedbkl dont care -# unlockedbkl dont care # opcodeeq command opcode or number # opcodelt number # opcodegt number diff --git a/scripts/rt-tester/t3-l1-pi-2rt.tst b/scripts/rt-tester/t3-l1-pi-2rt.tst index e5816fe67df3..69c2212fc520 100644 --- a/scripts/rt-tester/t3-l1-pi-2rt.tst +++ b/scripts/rt-tester/t3-l1-pi-2rt.tst @@ -19,8 +19,6 @@ # lockintnowait lock nr (0-7) # lockcont lock nr (0-7) # unlock lock nr (0-7) -# lockbkl lock nr (0-7) -# unlockbkl lock nr (0-7) # signal thread to signal (0-7) # reset 0 # resetevent 0 @@ -39,9 +37,6 @@ # blocked lock nr (0-7) # blockedwake lock nr (0-7) # unlocked lock nr (0-7) -# lockedbkl dont care -# blockedbkl dont care -# unlockedbkl dont care # opcodeeq command opcode or number # opcodelt number # opcodegt number diff --git a/scripts/rt-tester/t3-l1-pi-3rt.tst b/scripts/rt-tester/t3-l1-pi-3rt.tst index 718b82b5d3bb..9b0f1eb26a88 100644 --- a/scripts/rt-tester/t3-l1-pi-3rt.tst +++ b/scripts/rt-tester/t3-l1-pi-3rt.tst @@ -19,8 +19,6 @@ # lockintnowait lock nr (0-7) # lockcont lock nr (0-7) # unlock lock nr (0-7) -# lockbkl lock nr (0-7) -# unlockbkl lock nr (0-7) # signal thread to signal (0-7) # reset 0 # resetevent 0 @@ -39,9 +37,6 @@ # blocked lock nr (0-7) # blockedwake lock nr (0-7) # unlocked lock nr (0-7) -# lockedbkl dont care -# blockedbkl dont care -# unlockedbkl dont care # opcodeeq command opcode or number # opcodelt number # opcodegt number diff --git a/scripts/rt-tester/t3-l1-pi-signal.tst b/scripts/rt-tester/t3-l1-pi-signal.tst index c6e213563498..39ec74ab06ee 100644 --- 
a/scripts/rt-tester/t3-l1-pi-signal.tst +++ b/scripts/rt-tester/t3-l1-pi-signal.tst @@ -19,8 +19,6 @@ # lockintnowait lock nr (0-7) # lockcont lock nr (0-7) # unlock lock nr (0-7) -# lockbkl lock nr (0-7) -# unlockbkl lock nr (0-7) # signal thread to signal (0-7) # reset 0 # resetevent 0 @@ -39,9 +37,6 @@ # blocked lock nr (0-7) # blockedwake lock nr (0-7) # unlocked lock nr (0-7) -# lockedbkl dont care -# blockedbkl dont care -# unlockedbkl dont care # opcodeeq command opcode or number # opcodelt number # opcodegt number diff --git a/scripts/rt-tester/t3-l1-pi-steal.tst b/scripts/rt-tester/t3-l1-pi-steal.tst index f53749d59d79..e03db7e010fa 100644 --- a/scripts/rt-tester/t3-l1-pi-steal.tst +++ b/scripts/rt-tester/t3-l1-pi-steal.tst @@ -19,8 +19,6 @@ # lockintnowait lock nr (0-7) # lockcont lock nr (0-7) # unlock lock nr (0-7) -# lockbkl lock nr (0-7) -# unlockbkl lock nr (0-7) # signal thread to signal (0-7) # reset 0 # resetevent 0 @@ -39,9 +37,6 @@ # blocked lock nr (0-7) # blockedwake lock nr (0-7) # unlocked lock nr (0-7) -# lockedbkl dont care -# blockedbkl dont care -# unlockedbkl dont care # opcodeeq command opcode or number # opcodelt number # opcodegt number diff --git a/scripts/rt-tester/t3-l2-pi.tst b/scripts/rt-tester/t3-l2-pi.tst index cdc3e4fd7bac..7b59100d3e48 100644 --- a/scripts/rt-tester/t3-l2-pi.tst +++ b/scripts/rt-tester/t3-l2-pi.tst @@ -19,8 +19,6 @@ # lockintnowait lock nr (0-7) # lockcont lock nr (0-7) # unlock lock nr (0-7) -# lockbkl lock nr (0-7) -# unlockbkl lock nr (0-7) # signal thread to signal (0-7) # reset 0 # resetevent 0 @@ -39,9 +37,6 @@ # blocked lock nr (0-7) # blockedwake lock nr (0-7) # unlocked lock nr (0-7) -# lockedbkl dont care -# blockedbkl dont care -# unlockedbkl dont care # opcodeeq command opcode or number # opcodelt number # opcodegt number diff --git a/scripts/rt-tester/t4-l2-pi-deboost.tst b/scripts/rt-tester/t4-l2-pi-deboost.tst index baa14137f473..2f0e049d6443 100644 --- a/scripts/rt-tester/t4-l2-pi-deboost.tst +++ b/scripts/rt-tester/t4-l2-pi-deboost.tst @@ -19,8 +19,6 @@ # lockintnowait lock nr (0-7) # lockcont lock nr (0-7) # unlock lock nr (0-7) -# lockbkl lock nr (0-7) -# unlockbkl lock nr (0-7) # signal thread to signal (0-7) # reset 0 # resetevent 0 @@ -39,9 +37,6 @@ # blocked lock nr (0-7) # blockedwake lock nr (0-7) # unlocked lock nr (0-7) -# lockedbkl dont care -# blockedbkl dont care -# unlockedbkl dont care # opcodeeq command opcode or number # opcodelt number # opcodegt number diff --git a/scripts/rt-tester/t5-l4-pi-boost-deboost-setsched.tst b/scripts/rt-tester/t5-l4-pi-boost-deboost-setsched.tst index e6ec0c81b54d..04f4034ff895 100644 --- a/scripts/rt-tester/t5-l4-pi-boost-deboost-setsched.tst +++ b/scripts/rt-tester/t5-l4-pi-boost-deboost-setsched.tst @@ -19,8 +19,6 @@ # lockintnowait lock nr (0-7) # lockcont lock nr (0-7) # unlock lock nr (0-7) -# lockbkl lock nr (0-7) -# unlockbkl lock nr (0-7) # signal thread to signal (0-7) # reset 0 # resetevent 0 @@ -39,9 +37,6 @@ # blocked lock nr (0-7) # blockedwake lock nr (0-7) # unlocked lock nr (0-7) -# lockedbkl dont care -# blockedbkl dont care -# unlockedbkl dont care # opcodeeq command opcode or number # opcodelt number # opcodegt number diff --git a/scripts/rt-tester/t5-l4-pi-boost-deboost.tst b/scripts/rt-tester/t5-l4-pi-boost-deboost.tst index ca64f8bbf4bc..a48a6ee29ddc 100644 --- a/scripts/rt-tester/t5-l4-pi-boost-deboost.tst +++ b/scripts/rt-tester/t5-l4-pi-boost-deboost.tst @@ -19,8 +19,6 @@ # lockintnowait lock nr (0-7) # lockcont lock nr (0-7) # unlock lock nr 
(0-7) -# lockbkl lock nr (0-7) -# unlockbkl lock nr (0-7) # signal thread to signal (0-7) # reset 0 # resetevent 0 @@ -39,9 +37,6 @@ # blocked lock nr (0-7) # blockedwake lock nr (0-7) # unlocked lock nr (0-7) -# lockedbkl dont care -# blockedbkl dont care -# unlockedbkl dont care # opcodeeq command opcode or number # opcodelt number # opcodegt number From fd4afaf33313d94f548cb09129ecba3dbab62931 Mon Sep 17 00:00:00 2001 From: Jan Beulich Date: Thu, 17 Feb 2011 13:39:05 +0000 Subject: [PATCH 415/706] genirq: Streamline kernel/irq/Kconfig "def_bool n" without prompt is pointless, these should be just "bool". [ tglx: Adapted to latest changes ] Signed-off-by: Jan Beulich LKML-Reference: <4D5D3309020000780003264A@vpn.id2.novell.com> Signed-off-by: Thomas Gleixner --- kernel/irq/Kconfig | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/kernel/irq/Kconfig b/kernel/irq/Kconfig index 9149be729e45..355b8c7957f5 100644 --- a/kernel/irq/Kconfig +++ b/kernel/irq/Kconfig @@ -1,5 +1,5 @@ config HAVE_GENERIC_HARDIRQS - def_bool n + bool if HAVE_GENERIC_HARDIRQS menu "IRQ subsystem" @@ -11,32 +11,32 @@ config GENERIC_HARDIRQS # Select this to disable the deprecated stuff config GENERIC_HARDIRQS_NO_DEPRECATED - def_bool n + bool config GENERIC_HARDIRQS_NO_COMPAT - def_bool n + bool # Options selectable by the architecture code config HAVE_SPARSE_IRQ - def_bool n + bool config GENERIC_IRQ_PROBE - def_bool n + bool config GENERIC_IRQ_SHOW - def_bool n + bool config GENERIC_PENDING_IRQ - def_bool n + bool config AUTO_IRQ_AFFINITY - def_bool n + bool config HARDIRQS_SW_RESEND - def_bool n + bool config IRQ_PREFLOW_FASTEOI - def_bool n + bool config SPARSE_IRQ bool "Support sparse irq numbering" From 540089798d2ae115fc9bff7ed3823c8c32249607 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 23 Feb 2011 09:50:15 +0100 Subject: [PATCH 416/706] x86: OLPC: Remove redundant !X86_64 config dependency OLPC is already under 'if X86_32'. Signed-off-by: Thomas Gleixner Cc: Andres Salomon --- arch/x86/Kconfig | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index d5ed94d30aad..e327f96b8805 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -2068,7 +2068,7 @@ config OLPC bool "One Laptop Per Child support" select GPIOLIB select OLPC_OPENFIRMWARE - depends on !X86_64 && !X86_PAE + depends on !X86_PAE ---help--- Add support for detecting the unique features of the OLPC XO hardware. @@ -2081,7 +2081,7 @@ config OLPC_XO1 config OLPC_OPENFIRMWARE bool "Support for OLPC's Open Firmware" - depends on !X86_64 && !X86_PAE + depends on !X86_PAE default n select OF help From fe239545a1eed57a60c5d4063f0b56f6cd1811ff Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 23 Feb 2011 10:05:53 +0100 Subject: [PATCH 417/706] x86: OLPC: Hide OLPC_OPENFIRMWARE config switch OLPC selects OLPC_OPENFIRMWARE unconditionally. If OLPC=n then the OLPC_OPENFIRMWARE functionality is pointless. Signed-off-by: Thomas Gleixner Cc: Andres Salomon --- arch/x86/Kconfig | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index e327f96b8805..dbc10fce6665 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -2080,18 +2080,12 @@ config OLPC_XO1 Add support for non-essential features of the OLPC XO-1 laptop.
config OLPC_OPENFIRMWARE - bool "Support for OLPC's Open Firmware" - depends on !X86_PAE - default n + bool select OF - help - This option adds support for the implementation of Open Firmware - that is used on the OLPC XO-1 Children's Machine. - If unsure, say N here. + select OLPC_OPENFIRMWARE_DT if PROC_DEVICETREE config OLPC_OPENFIRMWARE_DT bool - default y if OLPC_OPENFIRMWARE && PROC_DEVICETREE select OF_PROMTREE endif # X86_32 From dc3119e700216a70e82fe07a79f1618852058354 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 23 Feb 2011 10:08:31 +0100 Subject: [PATCH 418/706] x86: OLPC: Cleanup config maze completely Neither CONFIG_OLPC_OPENFIRMWARE nor CONFIG_OLPC_OPENFIRMWARE_DT are really necessary. OLPC selects OLPC_OPENFIRMWARE unconditionally, so move the "select OF" part under OLPC config option and fixup the dependencies in Makefiles and code. Signed-off-by: Thomas Gleixner Cc: Andres Salomon --- arch/x86/Kconfig | 10 +++------- arch/x86/include/asm/olpc_ofw.h | 11 ----------- arch/x86/kernel/head_32.S | 2 +- arch/x86/platform/olpc/Makefile | 2 +- 4 files changed, 5 insertions(+), 20 deletions(-) diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index dbc10fce6665..677501d061a3 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -2066,9 +2066,10 @@ config SCx200HR_TIMER config OLPC bool "One Laptop Per Child support" - select GPIOLIB - select OLPC_OPENFIRMWARE depends on !X86_PAE + select GPIOLIB + select OF + select OLPC_OPENFIRMWARE_DT if PROC_DEVICETREE ---help--- Add support for detecting the unique features of the OLPC XO hardware. @@ -2079,11 +2080,6 @@ config OLPC_XO1 ---help--- Add support for non-essential features of the OLPC XO-1 laptop. -config OLPC_OPENFIRMWARE - bool - select OF - select OLPC_OPENFIRMWARE_DT if PROC_DEVICETREE - config OLPC_OPENFIRMWARE_DT bool select OF_PROMTREE diff --git a/arch/x86/include/asm/olpc_ofw.h b/arch/x86/include/asm/olpc_ofw.h index 641988efe063..1fff2de124e5 100644 --- a/arch/x86/include/asm/olpc_ofw.h +++ b/arch/x86/include/asm/olpc_ofw.h @@ -6,8 +6,6 @@ #define OLPC_OFW_SIG 0x2057464F /* aka "OFW " */ -#ifdef CONFIG_OLPC_OPENFIRMWARE - extern bool olpc_ofw_is_installed(void); /* run an OFW command by calling into the firmware */ @@ -26,15 +24,6 @@ extern void setup_olpc_ofw_pgd(void); /* check if OFW was detected during boot */ extern bool olpc_ofw_present(void); -#else /* !CONFIG_OLPC_OPENFIRMWARE */ - -static inline bool olpc_ofw_is_installed(void) { return false; } -static inline void olpc_ofw_detect(void) { } -static inline void setup_olpc_ofw_pgd(void) { } -static inline bool olpc_ofw_present(void) { return false; } - -#endif /* !CONFIG_OLPC_OPENFIRMWARE */ - #ifdef CONFIG_OLPC_OPENFIRMWARE_DT extern void olpc_dt_build_devicetree(void); #else diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S index 767d6c43de37..d8cc18a83260 100644 --- a/arch/x86/kernel/head_32.S +++ b/arch/x86/kernel/head_32.S @@ -137,7 +137,7 @@ ENTRY(startup_32) movsl 1: -#ifdef CONFIG_OLPC_OPENFIRMWARE +#ifdef CONFIG_OLPC /* save OFW's pgdir table for later use when calling into OFW */ movl %cr3, %eax movl %eax, pa(olpc_ofw_pgd) diff --git a/arch/x86/platform/olpc/Makefile b/arch/x86/platform/olpc/Makefile index e797428b163b..e18e641ae7bd 100644 --- a/arch/x86/platform/olpc/Makefile +++ b/arch/x86/platform/olpc/Makefile @@ -1,4 +1,4 @@ obj-$(CONFIG_OLPC) += olpc.o obj-$(CONFIG_OLPC_XO1) += olpc-xo1.o -obj-$(CONFIG_OLPC_OPENFIRMWARE) += olpc_ofw.o +obj-$(CONFIG_OLPC) += olpc_ofw.o obj-$(CONFIG_OLPC_OPENFIRMWARE_DT) += olpc_dt.o 
From c2a941fadb57d157d59eec424674bd0c3a28788c Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 23 Feb 2011 10:32:42 +0100 Subject: [PATCH 419/706] x86: OLPC: Remove extra OLPC_OPENFIRMWARE_DT indirection OLPC_OPENFIRMWARE_DT is just there to be selected by OLPC and selects OF_PROMTREE. So let OLPC select OF_PROMTREE and remove that extra config indirection. Fix up the code and Makefile to use CONFIG_OF_PROMTREE instead. Signed-off-by: Thomas Gleixner Cc: Andres Salomon --- arch/x86/Kconfig | 6 +----- arch/x86/include/asm/olpc_ofw.h | 4 ++-- arch/x86/platform/olpc/Makefile | 2 +- 3 files changed, 4 insertions(+), 8 deletions(-) diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 677501d061a3..4f3d3d432bad 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -2069,7 +2069,7 @@ config OLPC depends on !X86_PAE select GPIOLIB select OF - select OLPC_OPENFIRMWARE_DT if PROC_DEVICETREE + select OF_PROMTREE if PROC_DEVICETREE ---help--- Add support for detecting the unique features of the OLPC XO hardware. @@ -2080,10 +2080,6 @@ config OLPC_XO1 ---help--- Add support for non-essential features of the OLPC XO-1 laptop. -config OLPC_OPENFIRMWARE_DT - bool - select OF_PROMTREE - endif # X86_32 config AMD_NB diff --git a/arch/x86/include/asm/olpc_ofw.h b/arch/x86/include/asm/olpc_ofw.h index 1fff2de124e5..28a7fec01100 100644 --- a/arch/x86/include/asm/olpc_ofw.h +++ b/arch/x86/include/asm/olpc_ofw.h @@ -24,10 +24,10 @@ extern void setup_olpc_ofw_pgd(void); /* check if OFW was detected during boot */ extern bool olpc_ofw_present(void); -#ifdef CONFIG_OLPC_OPENFIRMWARE_DT +#ifdef CONFIG_OF_PROMTREE extern void olpc_dt_build_devicetree(void); #else static inline void olpc_dt_build_devicetree(void) { } -#endif /* CONFIG_OLPC_OPENFIRMWARE_DT */ +#endif #endif /* _ASM_X86_OLPC_OFW_H */ diff --git a/arch/x86/platform/olpc/Makefile b/arch/x86/platform/olpc/Makefile index e18e641ae7bd..c2a8cab65e5d 100644 --- a/arch/x86/platform/olpc/Makefile +++ b/arch/x86/platform/olpc/Makefile @@ -1,4 +1,4 @@ obj-$(CONFIG_OLPC) += olpc.o obj-$(CONFIG_OLPC_XO1) += olpc-xo1.o obj-$(CONFIG_OLPC) += olpc_ofw.o -obj-$(CONFIG_OLPC_OPENFIRMWARE_DT) += olpc_dt.o +obj-$(CONFIG_OF_PROMTREE) += olpc_dt.o From 6435a5e39d3e01a1a73a925ed53ee18619b0a368 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Wed, 23 Feb 2011 07:25:02 -0300 Subject: [PATCH 420/706] perf top browser: Adjust the browser indexes when refreshing This is not a problem when we're not at the bottom of the active symbols list, so it was not noticed, but at the end of the screen it falls apart. Fix it by adjusting the ui_browser indexes according to the new number of entries in the rb_tree and by seeking from the start of the rb_tree to find the new symbol at the top of the screen.
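The clamping idea, as a minimal sketch (simplified from the hunk below; the field names are shortened here and this is not the literal code):

	/* keep the visible window inside the new entry count */
	if (top_idx >= nr_entries)
		top_idx = 0;			/* fall back to the list start */
	if (index >= nr_entries)
		index = nr_entries - 1;		/* clamp the cursor */
	seek(browser, top_idx, SEEK_SET);	/* re-resolve the top entry */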
Cc: Frederic Weisbecker Cc: Ingo Molnar Cc: Mike Galbraith Cc: Paul Mackerras Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Tom Zanussi LKML-Reference: Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/ui/browsers/top.c | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/tools/perf/util/ui/browsers/top.c b/tools/perf/util/ui/browsers/top.c index 377ff58c9ae9..2f47224426b6 100644 --- a/tools/perf/util/ui/browsers/top.c +++ b/tools/perf/util/ui/browsers/top.c @@ -70,6 +70,7 @@ static void perf_top_browser__write(struct ui_browser *browser, void *entry, int static void perf_top_browser__update_rb_tree(struct perf_top_browser *browser) { struct perf_top *top = browser->b.priv; + u64 top_idx = browser->b.top_idx; browser->root = RB_ROOT; browser->b.top = NULL; @@ -82,7 +83,29 @@ static void perf_top_browser__update_rb_tree(struct perf_top_browser *browser) if (browser->sym_width + browser->dso_width > browser->b.width - 29) browser->sym_width = browser->b.width - browser->dso_width - 29; } + + /* + * Adjust the ui_browser indexes since the entries in the browser->root + * rb_tree may have changed, then seek it from start, so that we get a + * possible new top of the screen. + */ browser->b.nr_entries = top->rb_entries; + + if (top_idx >= browser->b.nr_entries) { + if (browser->b.height >= browser->b.nr_entries) + top_idx = browser->b.nr_entries - browser->b.height; + else + top_idx = 0; + } + + if (browser->b.index >= top_idx + browser->b.height) + browser->b.index = top_idx + browser->b.index - browser->b.top_idx; + + if (browser->b.index >= browser->b.nr_entries) + browser->b.index = browser->b.nr_entries - 1; + + browser->b.top_idx = top_idx; + browser->b.seek(&browser->b, top_idx, SEEK_SET); } static void perf_top_browser__annotate(struct perf_top_browser *browser) From 9826e8329bc160e4cc58b83019f3f056965e42d0 Mon Sep 17 00:00:00 2001 From: Marcin Slusarz Date: Tue, 22 Feb 2011 21:53:12 +0100 Subject: [PATCH 421/706] perf lock: Document valid sort keys Cc: Ingo Molnar Cc: Paul Mackerras Cc: Peter Zijlstra LKML-Reference: <20110222205312.GA18474@joi.lan> Signed-off-by: Marcin Slusarz Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/Documentation/perf-lock.txt | 12 ++++++++++-- tools/perf/builtin-lock.c | 2 +- 2 files changed, 11 insertions(+), 3 deletions(-) diff --git a/tools/perf/Documentation/perf-lock.txt b/tools/perf/Documentation/perf-lock.txt index 921de259ea10..4a26a2f3a6a3 100644 --- a/tools/perf/Documentation/perf-lock.txt +++ b/tools/perf/Documentation/perf-lock.txt @@ -24,8 +24,8 @@ and statistics with this 'perf lock' command. 'perf lock report' reports statistical data. -OPTIONS -------- +COMMON OPTIONS +-------------- -i:: --input=:: @@ -39,6 +39,14 @@ OPTIONS --dump-raw-trace:: Dump raw trace in ASCII. +REPORT OPTIONS +-------------- + +-k:: +--key=:: + Sorting key. Possible values: acquired (default), contended, + wait_total, wait_max, wait_min. 
+ SEE ALSO -------- linkperf:perf[1] diff --git a/tools/perf/builtin-lock.c b/tools/perf/builtin-lock.c index e00d93847c44..2e93f99b1480 100644 --- a/tools/perf/builtin-lock.c +++ b/tools/perf/builtin-lock.c @@ -893,7 +893,7 @@ static const char * const report_usage[] = { static const struct option report_options[] = { OPT_STRING('k', "key", &sort_key, "acquired", - "key for sorting"), + "key for sorting (acquired / contended / wait_total / wait_max / wait_min)"), /* TODO: type */ OPT_END() }; From c186fafe9aba87c1a93df8c7120a6ae01fe435ad Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 21 Feb 2011 18:52:53 +0100 Subject: [PATCH 422/706] sched: Clean up remnants of sd_idle With the wholesale removal of the sd_idle SMT logic we can clean up some more. Signed-off-by: Peter Zijlstra Cc: Nikhil Rao Cc: Venkatesh Pallipadi Cc: Suresh Siddha Cc: Mike Galbraith LKML-Reference: Signed-off-by: Ingo Molnar --- kernel/sched_fair.c | 23 ++++++++++------------- 1 file changed, 10 insertions(+), 13 deletions(-) diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index d384e739ea95..cd18600a8a63 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -3150,25 +3150,23 @@ find_busiest_group(struct sched_domain *sd, int this_cpu, if (sds.this_load >= sds.avg_load) goto out_balanced; - /* - * In the CPU_NEWLY_IDLE, use imbalance_pct to be conservative. - * And to check for busy balance use !idle_cpu instead of - * CPU_NOT_IDLE. This is because HT siblings will use CPU_NOT_IDLE - * even when they are idle. - */ - if (idle == CPU_NEWLY_IDLE || !idle_cpu(this_cpu)) { - if (100 * sds.max_load <= sd->imbalance_pct * sds.this_load) - goto out_balanced; - } else { + if (idle == CPU_IDLE) { /* * This cpu is idle. If the busiest group load doesn't * have more tasks than the number of available cpu's and * there is no imbalance between this and busiest group * wrt to idle cpu's, it is balanced. */ - if ((sds.this_idle_cpus <= sds.busiest_idle_cpus + 1) && + if ((sds.this_idle_cpus <= sds.busiest_idle_cpus + 1) && sds.busiest_nr_running <= sds.busiest_group_weight) goto out_balanced; + } else { + /* + * In the CPU_NEWLY_IDLE, CPU_NOT_IDLE cases, use + * imbalance_pct to be conservative. + */ + if (100 * sds.max_load <= sd->imbalance_pct * sds.this_load) + goto out_balanced; } force_balance: @@ -3862,8 +3860,7 @@ static void rebalance_domains(int cpu, enum cpu_idle_type idle) if (load_balance(cpu, rq, sd, idle, &balance)) { /* * We've pulled tasks over so either we're no - * longer idle, or one of our SMT siblings is - * not idle. + * longer idle. */ idle = CPU_NOT_IDLE; } From cc57aa8f4b3bece8c26c7929728edcc5fa6b5aed Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 21 Feb 2011 18:55:32 +0100 Subject: [PATCH 423/706] sched: Clean up some f_b_g() comments The existing comment tends to grow state (as it already has), split it up and place it near the actual tests. 
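Condensed, the early-out ladder after this change reads roughly as follows (a sketch of the hunk below; the asym-packing check is omitted and the NEWIDLE condition is abbreviated as a placeholder name):

	if (!(*balance))                    goto ret;           /* not this cpu's job         */
	if (!sds.busiest)                   goto out_balanced;  /* no busy group to pull from */
	if (newly_idle_with_capacity)       goto force_balance; /* NEWIDLE trumps SMP nice    */
	if (sds.this_load >= sds.max_load)  goto out_balanced;  /* local group is busier      */
	if (sds.this_load >= sds.avg_load)  goto out_balanced;  /* above the domain average   */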
Signed-off-by: Peter Zijlstra Cc: Nikhil Rao Cc: Venkatesh Pallipadi Cc: Suresh Siddha Cc: Mike Galbraith LKML-Reference: Signed-off-by: Ingo Molnar --- kernel/sched_fair.c | 28 +++++++++++++--------------- 1 file changed, 13 insertions(+), 15 deletions(-) diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index cd18600a8a63..03496ebc4553 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -3113,19 +3113,9 @@ find_busiest_group(struct sched_domain *sd, int this_cpu, */ update_sd_lb_stats(sd, this_cpu, idle, cpus, balance, &sds); - /* Cases where imbalance does not exist from POV of this_cpu */ - /* 1) this_cpu is not the appropriate cpu to perform load balancing - * at this level. - * 2) There is no busy sibling group to pull from. - * 3) This group is the busiest group. - * 4) This group is more busy than the avg busieness at this - * sched_domain. - * 5) The imbalance is within the specified limit. - * - * Note: when doing newidle balance, if the local group has excess - * capacity (i.e. nr_running < group_capacity) and the busiest group - * does not have any capacity, we force a load balance to pull tasks - * to the local group. In this case, we skip past checks 3, 4 and 5. + /* + * this_cpu is not the appropriate cpu to perform load balancing at + * this level. */ if (!(*balance)) goto ret; @@ -3134,19 +3124,27 @@ find_busiest_group(struct sched_domain *sd, int this_cpu, check_asym_packing(sd, &sds, this_cpu, imbalance)) return sds.busiest; + /* There is no busy sibling group to pull tasks from */ if (!sds.busiest || sds.busiest_nr_running == 0) goto out_balanced; - /* SD_BALANCE_NEWIDLE trumps SMP nice when underutilized */ + /* SD_BALANCE_NEWIDLE trumps SMP nice when underutilized */ if (idle == CPU_NEWLY_IDLE && sds.this_has_capacity && !sds.busiest_has_capacity) goto force_balance; + /* + * If the local group is more busy than the selected busiest group + * don't try and pull any tasks. + */ if (sds.this_load >= sds.max_load) goto out_balanced; + /* + * Don't pull any tasks if this group is already above the domain + * average load. + */ sds.avg_load = (SCHED_LOAD_SCALE * sds.total_load) / sds.total_pwr; - if (sds.this_load >= sds.avg_load) goto out_balanced; From 866ab43efd325fae8889ea77a744d03f2b957e38 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 21 Feb 2011 18:56:47 +0100 Subject: [PATCH 424/706] sched: Fix the group_imb logic On a 2*6*2 machine something like: taskset -c 3-11 bash -c 'for ((i=0;i<9;i++)) do while :; do :; done & done' _should_ result in 9 busy CPUs, each running 1 task. However it didn't quite work reliably, most of the time one cpu of the second socket (6-11) would be idle and one cpu of the first socket (0-5) would have two tasks on it. The group_imb logic is supposed to deal with this and detect when a particular group is imbalanced (like in our case, 0-2 are idle but 3-5 will have 4 tasks on it). The detection phase needed a bit of a tweak as it was too weak and required more than 2 avg weight tasks difference between idle and busy cpus in the group, which won't trigger for our test-case. So cure that by requiring a difference of one or more avg task weights between cpus. Once the detection phase worked, it was then defeated by the f_b_g() tests trying to avoid ping-pongs. In particular, this_load >= max_load triggered because the pulling cpu (the (first) idle cpu on the second socket, say 6) would find this_load to be 5 and max_load to be 4 (there'd be 5 tasks running on our socket and only 4 on the other socket).
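For illustration, assuming the usual nice-0 task weight of 1024 (the numbers here are picked for the example, not taken from the changelog):

	/*
	 * Two-cpu group, two nice-0 tasks on one cpu, none on the other:
	 *
	 *   max_cpu_load - min_cpu_load = 2048
	 *   avg_load_per_task           = 2048 / 2 = 1024
	 *
	 * old test: 2048 >  2 * 1024  -> false, the imbalance is missed
	 * new test: 2048 >= 1024      -> true,  group_imb gets set
	 */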
Signed-off-by: Peter Zijlstra Cc: Nikhil Rao Cc: Venkatesh Pallipadi Cc: Suresh Siddha Cc: Mike Galbraith LKML-Reference: Signed-off-by: Ingo Molnar --- kernel/sched_fair.c | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 03496ebc4553..3a88dee165c0 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -2743,7 +2743,7 @@ static inline void update_sg_lb_stats(struct sched_domain *sd, /* * Consider the group unbalanced when the imbalance is larger - * than the average weight of two tasks. + * than the average weight of a task. * * APZ: with cgroup the avg task weight can vary wildly and * might not be a suitable number - should we keep a @@ -2753,7 +2753,7 @@ static inline void update_sg_lb_stats(struct sched_domain *sd, if (sgs->sum_nr_running) avg_load_per_task = sgs->sum_weighted_load / sgs->sum_nr_running; - if ((max_cpu_load - min_cpu_load) > 2*avg_load_per_task && max_nr_running > 1) + if ((max_cpu_load - min_cpu_load) >= avg_load_per_task && max_nr_running > 1) sgs->group_imb = 1; sgs->group_capacity = DIV_ROUND_CLOSEST(group->cpu_power, SCHED_LOAD_SCALE); @@ -3128,6 +3128,14 @@ find_busiest_group(struct sched_domain *sd, int this_cpu, if (!sds.busiest || sds.busiest_nr_running == 0) goto out_balanced; + /* + * If the busiest group is imbalanced the below checks don't + * work because they assumes all things are equal, which typically + * isn't true due to cpus_allowed constraints and the like. + */ + if (sds.group_imb) + goto force_balance; + /* SD_BALANCE_NEWIDLE trumps SMP nice when underutilized */ if (idle == CPU_NEWLY_IDLE && sds.this_has_capacity && !sds.busiest_has_capacity) From 1747b21fecbfb63fbf6b9624e8b92707960d5a97 Mon Sep 17 00:00:00 2001 From: Yong Zhang Date: Sun, 20 Feb 2011 15:08:12 +0800 Subject: [PATCH 425/706] sched, autogroup, sysctl: Use proc_dointvec_minmax() instead sched_autogroup_enabled has a min/max value, so proc_dointvec_minmax() should be used for this case. Signed-off-by: Yong Zhang Cc: Mike Galbraith Signed-off-by: Peter Zijlstra LKML-Reference: <1298185696-4403-2-git-send-email-yong.zhang0@gmail.com> Signed-off-by: Ingo Molnar --- kernel/sysctl.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/sysctl.c b/kernel/sysctl.c index cb7c830f7faa..7b5eeadfb254 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -367,7 +367,7 @@ static struct ctl_table kern_table[] = { .data = &sysctl_sched_autogroup_enabled, .maxlen = sizeof(unsigned int), .mode = 0644, - .proc_handler = proc_dointvec, + .proc_handler = proc_dointvec_minmax, .extra1 = &zero, .extra2 = &one, }, From 800d4d30c8f20bd728e5741a3b77c4859a613f7c Mon Sep 17 00:00:00 2001 From: Yong Zhang Date: Sun, 20 Feb 2011 15:08:14 +0800 Subject: [PATCH 426/706] sched, autogroup: Stop going ahead if autogroup is disabled When autogroup is disabled from the beginning,

  sched_autogroup_create_attach()
    autogroup_move_group()          <== 1
      sched_move_task()             <== 2
        task_move_group_fair()
          set_task_rq()
            task_group()
              autogroup_task_group()

we go the whole path without doing anything useful. So stop going further if autogroup is disabled. But there will be a race window between 1 and 2, in which sysctl_sched_autogroup_enabled is enabled. This issue will be taken care of by a following patch.
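The window can be sketched as the following interleaving (illustrative only, not taken from the patch):

	/*
	 * CPU A: autogroup_move_group()
	 *          reads sysctl_sched_autogroup_enabled == 0 -> bails out
	 * CPU B: enables autogroup via
	 *          /proc/sys/kernel/sched_autogroup_enabled
	 * CPU A: sched_move_task() is never called, so the task keeps
	 *          running outside its autogroup although autogroup is
	 *          enabled by now
	 */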
Signed-off-by: Yong Zhang Signed-off-by: Peter Zijlstra Cc: Mike Galbraith LKML-Reference: <1298185696-4403-4-git-send-email-yong.zhang0@gmail.com> Signed-off-by: Ingo Molnar --- kernel/sched_autogroup.c | 4 ++++ kernel/sched_autogroup.h | 5 +++++ 2 files changed, 9 insertions(+) diff --git a/kernel/sched_autogroup.c b/kernel/sched_autogroup.c index 9fb656283157..137a096ae9d8 100644 --- a/kernel/sched_autogroup.c +++ b/kernel/sched_autogroup.c @@ -161,11 +161,15 @@ autogroup_move_group(struct task_struct *p, struct autogroup *ag) p->signal->autogroup = autogroup_kref_get(ag); + if (!ACCESS_ONCE(sysctl_sched_autogroup_enabled)) + goto out; + t = p; do { sched_move_task(t); } while_each_thread(p, t); +out: unlock_task_sighand(p, &flags); autogroup_kref_put(prev); } diff --git a/kernel/sched_autogroup.h b/kernel/sched_autogroup.h index 7b859ffe5dad..05577055cfca 100644 --- a/kernel/sched_autogroup.h +++ b/kernel/sched_autogroup.h @@ -1,6 +1,11 @@ #ifdef CONFIG_SCHED_AUTOGROUP struct autogroup { + /* + * reference doesn't mean how many thread attach to this + * autogroup now. It just stands for the number of task + * could use this autogroup. + */ struct kref kref; struct task_group *tg; struct rw_semaphore lock; From 511f67a5997c4967c69a3961e2fc9f04d8d244ac Mon Sep 17 00:00:00 2001 From: Mike Galbraith Date: Tue, 22 Feb 2011 15:02:00 +0100 Subject: [PATCH 427/706] sched, autogroup: Stop claiming ownership of the root task group Disown it, and only display autogroup association if one exists. Signed-off-by: Mike Galbraith Reviewed-by: Yong Zhang Signed-off-by: Peter Zijlstra LKML-Reference: <1298383320.8036.5.camel@marge.simson.net> Signed-off-by: Ingo Molnar --- kernel/sched_autogroup.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/kernel/sched_autogroup.c b/kernel/sched_autogroup.c index 137a096ae9d8..5946ac515602 100644 --- a/kernel/sched_autogroup.c +++ b/kernel/sched_autogroup.c @@ -12,7 +12,6 @@ static atomic_t autogroup_seq_nr; static void __init autogroup_init(struct task_struct *init_task) { autogroup_default.tg = &root_task_group; - root_task_group.autogroup = &autogroup_default; kref_init(&autogroup_default.kref); init_rwsem(&autogroup_default.lock); init_task->signal->autogroup = &autogroup_default; @@ -130,7 +129,7 @@ task_wants_autogroup(struct task_struct *p, struct task_group *tg) static inline bool task_group_is_autogroup(struct task_group *tg) { - return tg != &root_task_group && tg->autogroup; + return !!tg->autogroup; } static inline struct task_group * @@ -251,10 +250,14 @@ void proc_sched_autogroup_show_task(struct task_struct *p, struct seq_file *m) { struct autogroup *ag = autogroup_task_get(p); + if (!task_group_is_autogroup(ag->tg)) + goto out; + down_read(&ag->lock); seq_printf(m, "/autogroup-%ld nice %d\n", ag->id, ag->nice); up_read(&ag->lock); +out: autogroup_kref_put(ag); } #endif /* CONFIG_PROC_FS */ @@ -262,9 +265,7 @@ void proc_sched_autogroup_show_task(struct task_struct *p, struct seq_file *m) #ifdef CONFIG_SCHED_DEBUG static inline int autogroup_path(struct task_group *tg, char *buf, int buflen) { - int enabled = ACCESS_ONCE(sysctl_sched_autogroup_enabled); - - if (!enabled || !tg->autogroup) + if (!task_group_is_autogroup(tg)) return 0; return snprintf(buf, buflen, "%s-%ld", "/autogroup", tg->autogroup->id); From 3f7cce3c18188a067d463749168bdda5abc5b0f7 Mon Sep 17 00:00:00 2001 From: Stephane Eranian Date: Fri, 18 Feb 2011 14:40:01 +0200 Subject: [PATCH 428/706] perf_events: Fix rcu and locking issues with cgroup support 
This patch ensures that we do not end up calling perf_cgroup_from_task() when there is no cgroup event. This avoids potential RCU and locking issues. The change in perf_cgroup_set_timestamp() ensures we check against ctx->nr_cgroups. It also avoids calling perf_clock() twice in a row. It also means that ctx->lock must be held before calling the function. We drop update_cgrp_time() from task_clock_event_read() because it is not needed. This also avoids having to deal with perf_cgroup_from_task(). Thanks to Peter Zijlstra for his help on this. Signed-off-by: Stephane Eranian Signed-off-by: Peter Zijlstra LKML-Reference: <4d5e76b8.815bdf0a.7ac3.774f@mx.google.com> Signed-off-by: Ingo Molnar --- kernel/perf_event.c | 40 +++++++++++++++++++++++++++++----------- 1 file changed, 29 insertions(+), 11 deletions(-) diff --git a/kernel/perf_event.c b/kernel/perf_event.c index a0a6987fabc4..dadeaea4b3fc 100644 --- a/kernel/perf_event.c +++ b/kernel/perf_event.c @@ -201,6 +201,11 @@ __get_cpu_context(struct perf_event_context *ctx) #ifdef CONFIG_CGROUP_PERF +/* + * Must ensure cgroup is pinned (css_get) before calling + * this function. In other words, we cannot call this function + * if there is no cgroup event for the current CPU context. + */ static inline struct perf_cgroup * perf_cgroup_from_task(struct task_struct *task) { @@ -268,28 +273,41 @@ static inline void update_cgrp_time_from_cpuctx(struct perf_cpu_context *cpuctx) static inline void update_cgrp_time_from_event(struct perf_event *event) { - struct perf_cgroup *cgrp = perf_cgroup_from_task(current); + struct perf_cgroup *cgrp; + /* - * do not update time when cgroup is not active + * ensure we access cgroup data only when needed and + * when we know the cgroup is pinned (css_get) */ - if (!event->cgrp || cgrp != event->cgrp) + if (!is_cgroup_event(event)) return; - __update_cgrp_time(event->cgrp); + cgrp = perf_cgroup_from_task(current); + /* + * Do not update time when cgroup is not active + */ + if (cgrp == event->cgrp) + __update_cgrp_time(event->cgrp); } static inline void -perf_cgroup_set_timestamp(struct task_struct *task, u64 now) +perf_cgroup_set_timestamp(struct task_struct *task, + struct perf_event_context *ctx) { struct perf_cgroup *cgrp; struct perf_cgroup_info *info; - if (!task) + /* + * ctx->lock held by caller + * ensure we do not access cgroup data + * unless we have the cgroup pinned (css_get) + */ + if (!task || !ctx->nr_cgroups) return; cgrp = perf_cgroup_from_task(task); info = this_cpu_ptr(cgrp->info); - info->timestamp = now; + info->timestamp = ctx->timestamp; } #define PERF_CGROUP_SWOUT 0x1 /* cgroup switch out every event */ @@ -494,7 +512,8 @@ static inline int perf_cgroup_connect(pid_t pid, struct perf_event *event, } static inline void -perf_cgroup_set_timestamp(struct task_struct *task, u64 now) +perf_cgroup_set_timestamp(struct task_struct *task, + struct perf_event_context *ctx) { } @@ -1613,7 +1632,7 @@ static int __perf_event_enable(void *info) /* * set current task's cgroup time reference point */ - perf_cgroup_set_timestamp(current, perf_clock()); + perf_cgroup_set_timestamp(current, ctx); __perf_event_mark_enabled(event, ctx); @@ -2048,7 +2067,7 @@ ctx_sched_in(struct perf_event_context *ctx, now = perf_clock(); ctx->timestamp = now; - perf_cgroup_set_timestamp(task, now); + perf_cgroup_set_timestamp(task, ctx); /* * First go through the list and put on any pinned groups * in order to give them the best chance of going on.
@@ -5795,7 +5814,6 @@ static void task_clock_event_read(struct perf_event *event) if (!in_nmi()) { update_context_time(event->ctx); - update_cgrp_time_from_event(event); time = event->ctx->time; } else { u64 now = perf_clock(); From 768a06e2ca49cdf72389208cfc056a36cf8bc5e3 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 22 Feb 2011 16:52:24 +0100 Subject: [PATCH 429/706] perf: Simplify task_clock_event_read() There is no point in us having different code paths for nmi and !nmi here, so remove the !nmi one. Signed-off-by: Peter Zijlstra Cc: Stephane Eranian LKML-Reference: Signed-off-by: Ingo Molnar --- kernel/perf_event.c | 13 +++---------- 1 file changed, 3 insertions(+), 10 deletions(-) diff --git a/kernel/perf_event.c b/kernel/perf_event.c index dadeaea4b3fc..64a018e94fca 100644 --- a/kernel/perf_event.c +++ b/kernel/perf_event.c @@ -5810,16 +5810,9 @@ static void task_clock_event_del(struct perf_event *event, int flags) static void task_clock_event_read(struct perf_event *event) { - u64 time; - - if (!in_nmi()) { - update_context_time(event->ctx); - time = event->ctx->time; - } else { - u64 now = perf_clock(); - u64 delta = now - event->ctx->timestamp; - time = event->ctx->time + delta; - } + u64 now = perf_clock(); + u64 delta = now - event->ctx->timestamp; + u64 time = event->ctx->time + delta; task_clock_event_update(event, time); } From 4e034b245133adfd006ade5d7a809c9cac4beef9 Mon Sep 17 00:00:00 2001 From: Henrik Kretzschmar Date: Tue, 22 Feb 2011 15:38:03 +0100 Subject: [PATCH 430/706] x86: Move ioapic_irq_destination_types to apicdef.h This enum is used by non IOAPIC code, so apicdef.h is the best place for it. Signed-off-by: Henrik Kretzschmar LKML-Reference: <1298385487-4708-1-git-send-email-henne@nachtwindheim.de> Signed-off-by: Ingo Molnar --- arch/x86/include/asm/apicdef.h | 12 ++++++++++++ arch/x86/include/asm/io_apic.h | 11 ----------- 2 files changed, 12 insertions(+), 11 deletions(-) diff --git a/arch/x86/include/asm/apicdef.h b/arch/x86/include/asm/apicdef.h index 47a30ff8e517..d87988bacf3e 100644 --- a/arch/x86/include/asm/apicdef.h +++ b/arch/x86/include/asm/apicdef.h @@ -426,4 +426,16 @@ struct local_apic { #else #define BAD_APICID 0xFFFFu #endif + +enum ioapic_irq_destination_types { + dest_Fixed = 0, + dest_LowestPrio = 1, + dest_SMI = 2, + dest__reserved_1 = 3, + dest_NMI = 4, + dest_INIT = 5, + dest__reserved_2 = 6, + dest_ExtINT = 7 +}; + #endif /* _ASM_X86_APICDEF_H */ diff --git a/arch/x86/include/asm/io_apic.h b/arch/x86/include/asm/io_apic.h index f327d386d6cc..e1a9b0ef43b1 100644 --- a/arch/x86/include/asm/io_apic.h +++ b/arch/x86/include/asm/io_apic.h @@ -63,17 +63,6 @@ union IO_APIC_reg_03 { } __attribute__ ((packed)) bits; }; -enum ioapic_irq_destination_types { - dest_Fixed = 0, - dest_LowestPrio = 1, - dest_SMI = 2, - dest__reserved_1 = 3, - dest_NMI = 4, - dest_INIT = 5, - dest__reserved_2 = 6, - dest_ExtINT = 7 -}; - struct IO_APIC_route_entry { __u32 vector : 8, delivery_mode : 3, /* 000: FIXED From b6a1432da81fa387d76215108dc9f6ea6d343aed Mon Sep 17 00:00:00 2001 From: Henrik Kretzschmar Date: Tue, 22 Feb 2011 15:38:04 +0100 Subject: [PATCH 431/706] x86: Add dummy mp_save_irq() This is a dummy function, used when no IOAPIC is compiled in. 
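A minimal sketch of what such a stub buys at the call sites (the caller below is hypothetical and not part of the patch):

	/* no #ifdef CONFIG_X86_IO_APIC needed around the call */
	static void record_intsrc(struct mpc_intsrc *m)
	{
		mp_save_irq(m);	/* saves the entry with IOAPIC, no-op without */
	}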
Signed-off-by: Henrik Kretzschmar LKML-Reference: <1298385487-4708-2-git-send-email-henne@nachtwindheim.de> Signed-off-by: Ingo Molnar --- arch/x86/include/asm/io_apic.h | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/x86/include/asm/io_apic.h b/arch/x86/include/asm/io_apic.h index e1a9b0ef43b1..b24915fcd6df 100644 --- a/arch/x86/include/asm/io_apic.h +++ b/arch/x86/include/asm/io_apic.h @@ -188,6 +188,7 @@ static inline int mp_find_ioapic(u32 gsi) { return 0; } struct io_apic_irq_attr; static inline int io_apic_set_pci_routing(struct device *dev, int irq, struct io_apic_irq_attr *irq_attr) { return 0; } +static inline void mp_save_irq(struct mpc_intsrc *m) { }; #endif #endif /* _ASM_X86_IO_APIC_H */ From 7167d08e780a722fa79ea414fc4e72bc00751392 Mon Sep 17 00:00:00 2001 From: Henrik Kretzschmar Date: Tue, 22 Feb 2011 15:38:05 +0100 Subject: [PATCH 432/706] x86: Rework arch_disable_smp_support() for x86 Currently arch_disable_smp_support() on x86 disables only the support for the IOAPIC and is also compiled in if SMP-support is not. Therefore this function is renamed to disable_ioapic_support(), which meets its purpose and is only compiled in the kernel when IOAPIC support is also. A new arch_disable_smp_support() is created in smpboot.c, which calls disable_ioapic_support() and gets only compiled in the kernel when SMP support is also. Signed-off-by: Henrik Kretzschmar LKML-Reference: <1298385487-4708-3-git-send-email-henne@nachtwindheim.de> Signed-off-by: Ingo Molnar --- arch/x86/include/asm/io_apic.h | 3 +++ arch/x86/kernel/apic/apic.c | 3 ++- arch/x86/kernel/apic/io_apic.c | 7 +++++-- arch/x86/kernel/smpboot.c | 11 ++++++++++- 4 files changed, 20 insertions(+), 4 deletions(-) diff --git a/arch/x86/include/asm/io_apic.h b/arch/x86/include/asm/io_apic.h index b24915fcd6df..0be2f27b78f3 100644 --- a/arch/x86/include/asm/io_apic.h +++ b/arch/x86/include/asm/io_apic.h @@ -175,6 +175,8 @@ extern void __init pre_init_apic_IRQ0(void); extern void mp_save_irq(struct mpc_intsrc *m); +extern void disable_ioapic_support(void); + #else /* !CONFIG_X86_IO_APIC */ #define io_apic_assign_pci_irqs 0 @@ -189,6 +191,7 @@ struct io_apic_irq_attr; static inline int io_apic_set_pci_routing(struct device *dev, int irq, struct io_apic_irq_attr *irq_attr) { return 0; } static inline void mp_save_irq(struct mpc_intsrc *m) { }; +static inline void disable_ioapic_support(void) { } #endif #endif /* _ASM_X86_IO_APIC_H */ diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index 76b96d74978a..96e68099b06e 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -43,6 +43,7 @@ #include #include #include +#include #include #include #include @@ -1209,7 +1210,7 @@ void __cpuinit setup_local_APIC(void) rdtscll(tsc); if (disable_apic) { - arch_disable_smp_support(); + disable_ioapic_support(); return; } diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index ca9e2a3545a9..a2f2bf8ab9db 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -108,7 +108,10 @@ DECLARE_BITMAP(mp_bus_not_pci, MAX_MP_BUSSES); int skip_ioapic_setup; -void arch_disable_smp_support(void) +/** + * disable_ioapic_support() - disables ioapic support at runtime + */ +void disable_ioapic_support(void) { #ifdef CONFIG_PCI noioapicquirk = 1; @@ -120,7 +123,7 @@ void arch_disable_smp_support(void) static int __init parse_noapic(char *str) { /* disable IO-APIC */ - arch_disable_smp_support(); + disable_ioapic_support(); return 0; } early_param("noapic", 
parse_noapic); diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 08776a953487..09d0172a0059 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -64,6 +64,7 @@ #include #include #include +#include #include #include #include @@ -945,6 +946,14 @@ int __cpuinit native_cpu_up(unsigned int cpu) return 0; } +/** + * arch_disable_smp_support() - disables SMP support for x86 at runtime + */ +void arch_disable_smp_support(void) +{ + disable_ioapic_support(); +} + /* * Fall back to non SMP mode after errors. * @@ -1045,7 +1054,7 @@ static int __init smp_sanity_check(unsigned max_cpus) "(tell your hw vendor)\n"); } smpboot_clear_io_apic(); - arch_disable_smp_support(); + disable_ioapic_support(); return -1; } From 7d0f1926131cf79aa5998d463bf1582156e7b41e Mon Sep 17 00:00:00 2001 From: Henrik Kretzschmar Date: Tue, 22 Feb 2011 15:38:06 +0100 Subject: [PATCH 433/706] x86: Add dummy functions for compiling without IOAPIC This patch adds IOAPIC dummy functions for compilation with local APIC, but without IOAPIC. The local variable ioapic_entries in enable_IR_x2apic() does not need initialization anymore, since the dummy returns NULL. Signed-off-by: Henrik Kretzschmar LKML-Reference: <1298385487-4708-4-git-send-email-henne@nachtwindheim.de> Signed-off-by: Ingo Molnar --- arch/x86/include/asm/io_apic.h | 18 ++++++++++++++++++ arch/x86/kernel/apic/apic.c | 2 +- 2 files changed, 19 insertions(+), 1 deletion(-) diff --git a/arch/x86/include/asm/io_apic.h b/arch/x86/include/asm/io_apic.h index 0be2f27b78f3..56dcf08bde62 100644 --- a/arch/x86/include/asm/io_apic.h +++ b/arch/x86/include/asm/io_apic.h @@ -190,6 +190,24 @@ static inline int mp_find_ioapic(u32 gsi) { return 0; } struct io_apic_irq_attr; static inline int io_apic_set_pci_routing(struct device *dev, int irq, struct io_apic_irq_attr *irq_attr) { return 0; } + +static inline struct IO_APIC_route_entry **alloc_ioapic_entries(void) +{ + return NULL; +} + +static inline void free_ioapic_entries(struct IO_APIC_route_entry **ent) { } +static inline int save_IO_APIC_setup(struct IO_APIC_route_entry **ent) +{ + return -ENOMEM; +} + +static inline void mask_IO_APIC_setup(struct IO_APIC_route_entry **ent) { } +static inline int restore_IO_APIC_setup(struct IO_APIC_route_entry **ent) +{ + return -ENOMEM; +} + static inline void mp_save_irq(struct mpc_intsrc *m) { }; static inline void disable_ioapic_support(void) { } #endif diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index 96e68099b06e..f0e079823c43 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -1449,7 +1449,7 @@ int __init enable_IR(void) void __init enable_IR_x2apic(void) { unsigned long flags; - struct IO_APIC_route_entry **ioapic_entries = NULL; + struct IO_APIC_route_entry **ioapic_entries; int ret, x2apic_enabled = 0; int dmar_table_init_ret; From 1444e0c9daf0d3472677efc15588b192fc2db761 Mon Sep 17 00:00:00 2001 From: Henrik Kretzschmar Date: Tue, 22 Feb 2011 15:38:07 +0100 Subject: [PATCH 434/706] x86: Fix deps of X86_UP_IOAPIC Since commit 7cd92366a593246650cc7d6198e2c7d3af8c1d8a the lAPIC config accidentally enabled the IOAPIC, which now gets fixed.
Signed-off-by: Henrik Kretzschmar LKML-Reference: <1298385487-4708-5-git-send-email-henne@nachtwindheim.de> Signed-off-by: Ingo Molnar --- arch/x86/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index d5ed94d30aad..867a8cf04faa 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -811,7 +811,7 @@ config X86_LOCAL_APIC config X86_IO_APIC def_bool y - depends on X86_64 || SMP || X86_32_NON_STANDARD || X86_UP_APIC + depends on X86_64 || SMP || X86_32_NON_STANDARD || X86_UP_IOAPIC config X86_VISWS_APIC def_bool y From 939d578ecc62b07efeb186576ab190fe0b766501 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 23 Feb 2011 11:46:01 +0100 Subject: [PATCH 435/706] x86: OLPC: Make OLPC=n build again Stupid me missed the functions called from setup.c. Add the stubs back for OLPC=n Signed-off-by: Thomas Gleixner --- arch/x86/include/asm/olpc_ofw.h | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/arch/x86/include/asm/olpc_ofw.h b/arch/x86/include/asm/olpc_ofw.h index 28a7fec01100..c5d3a5abbb9f 100644 --- a/arch/x86/include/asm/olpc_ofw.h +++ b/arch/x86/include/asm/olpc_ofw.h @@ -6,6 +6,8 @@ #define OLPC_OFW_SIG 0x2057464F /* aka "OFW " */ +#ifdef CONFIG_OLPC + extern bool olpc_ofw_is_installed(void); /* run an OFW command by calling into the firmware */ @@ -24,6 +26,11 @@ extern void setup_olpc_ofw_pgd(void); /* check if OFW was detected during boot */ extern bool olpc_ofw_present(void); +#else /* !CONFIG_OLPC */ +static inline void olpc_ofw_detect(void) { } +static inline void setup_olpc_ofw_pgd(void) { } +#endif /* !CONFIG_OLPC */ + #ifdef CONFIG_OF_PROMTREE extern void olpc_dt_build_devicetree(void); #else From 170ae6bc24e1d7f9bd921a484ec9ea2825497970 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Wed, 23 Feb 2011 11:08:59 -0300 Subject: [PATCH 436/706] perf annotate: Show better message when no vmlinux is found In both --tui and --stdio, in 'annotate', 'top', 'report' when trying to annotate a kernel symbol having just access to a kallsyms file, that doesn't have the DWARF info needed for annotation. 
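The buffer trick in the hunk below is worth spelling out: the prefix " with build id " is exactly 15 bytes, so printing the hex id at bf + 15 lands it right behind the prefix:

	char bf[BUILD_ID_SIZE * 2 + 16] = " with build id ";
	/* fills in the hex id after the 15-byte prefix, so bf ends up
	 * as " with build id <hex>" in a single buffer */
	build_id__sprintf(dso->build_id, sizeof(dso->build_id), bf + 15);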
Suggested-by: Ingo Molnar Cc: Frederic Weisbecker Cc: Ingo Molnar Cc: Mike Galbraith Cc: Paul Mackerras Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Tom Zanussi LKML-Reference: Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/annotate.c | 15 +++++++++++++-- tools/perf/util/ui/browsers/hists.c | 8 -------- 2 files changed, 13 insertions(+), 10 deletions(-) diff --git a/tools/perf/util/annotate.c b/tools/perf/util/annotate.c index 70ec422ddb64..0d0830c98cd7 100644 --- a/tools/perf/util/annotate.c +++ b/tools/perf/util/annotate.c @@ -295,12 +295,23 @@ fallback: } if (dso->origin == DSO__ORIG_KERNEL) { + char bf[BUILD_ID_SIZE * 2 + 16] = " with build id "; + char *build_id_msg = NULL; + if (dso->annotate_warned) goto out_free_filename; + + if (dso->has_build_id) { + build_id__sprintf(dso->build_id, + sizeof(dso->build_id), bf + 15); + build_id_msg = bf; + } err = -ENOENT; dso->annotate_warned = 1; - pr_err("Can't annotate %s: No vmlinux file was found in the " - "path\n", sym->name); + pr_err("Can't annotate %s: No vmlinux file%s was found in the " + "path.\nPlease use 'perf buildid-cache -av vmlinux' or " + "--vmlinux vmlinux.\n", + sym->name, build_id_msg ?: ""); goto out_free_filename; } diff --git a/tools/perf/util/ui/browsers/hists.c b/tools/perf/util/ui/browsers/hists.c index 294b49538522..497b3c4076a3 100644 --- a/tools/perf/util/ui/browsers/hists.c +++ b/tools/perf/util/ui/browsers/hists.c @@ -924,14 +924,6 @@ int hists__browse(struct hists *self, const char *helpline, if (choice == annotate) { struct hist_entry *he; do_annotate: - if (browser->selection->map->dso->origin == DSO__ORIG_KERNEL) { - browser->selection->map->dso->annotate_warned = 1; - ui_helpline__puts("No vmlinux file found, can't " - "annotate with just a " - "kallsyms file"); - continue; - } - he = hist_browser__selected_entry(browser); if (he == NULL) continue; From c8d6b8fe72216ca47e399204b58c8be0448d4083 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 23 Feb 2011 14:29:34 +0100 Subject: [PATCH 437/706] x86: ioapic: Remove silly debug bloat in setup_IOAPIC_irqs() This is debug code and it does not matter at all whether we print each not connected pin in an extra line or try to be extra clever. 
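With the change each unconnected pin gets its own line; given the format string in the hunk below, apic 2 with pin 13 unconnected would print (illustrative values):

	 apic 2 pin 13 not connected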
Signed-off-by: Thomas Gleixner --- arch/x86/kernel/apic/io_apic.c | 32 ++++++++++++-------------------- 1 file changed, 12 insertions(+), 20 deletions(-) diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index a2f2bf8ab9db..e33ccb45d0fa 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -1388,9 +1388,19 @@ static struct { DECLARE_BITMAP(pin_programmed, MP_MAX_IOAPIC_PIN + 1); } mp_ioapic_routing[MAX_IO_APICS]; +static bool __init io_apic_pin_not_connected(int idx, int apic_id, int pin) +{ + if (idx != -1) + return false; + + apic_printk(APIC_VERBOSE, KERN_DEBUG " apic %d pin %d not connected\n", + mp_ioapics[apic_id].apicid, pin); + return true; +} + static void __init setup_IO_APIC_irqs(void) { - int apic_id, pin, idx, irq, notcon = 0; + int apic_id, pin, idx, irq; int node = cpu_to_node(0); struct irq_cfg *cfg; @@ -1399,22 +1409,8 @@ static void __init setup_IO_APIC_irqs(void) for (apic_id = 0; apic_id < nr_ioapics; apic_id++) for (pin = 0; pin < nr_ioapic_registers[apic_id]; pin++) { idx = find_irq_entry(apic_id, pin, mp_INT); - if (idx == -1) { - if (!notcon) { - notcon = 1; - apic_printk(APIC_VERBOSE, - KERN_DEBUG " %d-%d", - mp_ioapics[apic_id].apicid, pin); - } else - apic_printk(APIC_VERBOSE, " %d-%d", - mp_ioapics[apic_id].apicid, pin); + if (io_apic_pin_not_connected(idx, apic_id, pin)) continue; - } - if (notcon) { - apic_printk(APIC_VERBOSE, - " (apicid-pin) not connected\n"); - notcon = 0; - } irq = pin_2_irq(idx, apic_id, pin); @@ -1441,10 +1437,6 @@ static void __init setup_IO_APIC_irqs(void) setup_ioapic_irq(apic_id, pin, irq, cfg, irq_trigger(idx), irq_polarity(idx)); } - - if (notcon) - apic_printk(APIC_VERBOSE, - " (apicid-pin) not connected\n"); } /* From ed972ccf434a9881a5881915ae04602af2776bad Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 23 Feb 2011 14:31:36 +0100 Subject: [PATCH 438/706] x86: ioapic: Split out the nested loop in setup_IO_APIC_irqs() Two consecutive for(...) for(...) lines to avoid an extra indentation are just horrible to read. I had to look more than once to figure out what the code is doing. Split out the inner loop into a separate function. 
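The shape of the cleanup, condensed from the hunks below (a sketch, not the literal code):

	/* before: two for (...) lines sharing one body to save an indent level */
	for (apic_id = 0; apic_id < nr_ioapics; apic_id++)
	for (pin = 0; pin < nr_ioapic_registers[apic_id]; pin++) {
		/* ... per-pin setup ... */
	}

	/* after: the inner loop lives in its own helper */
	for (apic_id = 0; apic_id < nr_ioapics; apic_id++)
		__io_apic_setup_irqs(apic_id);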
Signed-off-by: Thomas Gleixner --- arch/x86/kernel/apic/io_apic.c | 19 +++++++++++++------ 1 file changed, 13 insertions(+), 6 deletions(-) diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index e33ccb45d0fa..f751a82d4cb4 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -1398,15 +1398,12 @@ static bool __init io_apic_pin_not_connected(int idx, int apic_id, int pin) return true; } -static void __init setup_IO_APIC_irqs(void) +static void __init __io_apic_setup_irqs(unsigned int apic_id) { - int apic_id, pin, idx, irq; - int node = cpu_to_node(0); + int idx, node = cpu_to_node(0); + unsigned int pin, irq; struct irq_cfg *cfg; - apic_printk(APIC_VERBOSE, KERN_DEBUG "init IO_APIC IRQs\n"); - - for (apic_id = 0; apic_id < nr_ioapics; apic_id++) for (pin = 0; pin < nr_ioapic_registers[apic_id]; pin++) { idx = find_irq_entry(apic_id, pin, mp_INT); if (io_apic_pin_not_connected(idx, apic_id, pin)) @@ -1439,6 +1436,16 @@ static void __init setup_IO_APIC_irqs(void) } } +static void __init setup_IO_APIC_irqs(void) +{ + unsigned int apic_id; + + apic_printk(APIC_VERBOSE, KERN_DEBUG "init IO_APIC IRQs\n"); + + for (apic_id = 0; apic_id < nr_ioapics; apic_id++) + __io_apic_setup_irqs(apic_id); +} + /* * for the gsit that is not in first ioapic * but could not use acpi_register_gsi() From ff973d041e5ab9ada9e49f4e93ef3a699c511463 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 23 Feb 2011 13:00:56 +0100 Subject: [PATCH 439/706] x86: ioapic: Add io_apic_setup_irq_pin() There are about four places in the ioapic code which do exactly the same setup sequence. Also the OF based ioapic setup needs that function to avoid putting the OF specific code into ioapic.c Signed-off-by: Thomas Gleixner --- arch/x86/include/asm/io_apic.h | 2 ++ arch/x86/kernel/apic/io_apic.c | 15 +++++++++++++++ 2 files changed, 17 insertions(+) diff --git a/arch/x86/include/asm/io_apic.h b/arch/x86/include/asm/io_apic.h index 56dcf08bde62..37dbb3fae395 100644 --- a/arch/x86/include/asm/io_apic.h +++ b/arch/x86/include/asm/io_apic.h @@ -151,6 +151,8 @@ void setup_IO_APIC_irq_extra(u32 gsi); extern void ioapic_and_gsi_init(void); extern void ioapic_insert_resources(void); +int io_apic_setup_irq_pin(unsigned int irq, int node, struct io_apic_irq_attr *attr); + extern struct IO_APIC_route_entry **alloc_ioapic_entries(void); extern void free_ioapic_entries(struct IO_APIC_route_entry **ioapic_entries); extern int save_IO_APIC_setup(struct IO_APIC_route_entry **ioapic_entries); diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index f751a82d4cb4..6deb3ca62fd6 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -3607,6 +3607,21 @@ int arch_setup_ht_irq(unsigned int irq, struct pci_dev *dev) } #endif /* CONFIG_HT_IRQ */ +int +io_apic_setup_irq_pin(unsigned int irq, int node, struct io_apic_irq_attr *attr) +{ + struct irq_cfg *cfg = alloc_irq_and_cfg_at(irq, node); + int ret; + + if (!cfg) + return -EINVAL; + ret = __add_pin_to_irq_node(cfg, node, attr->ioapic, attr->ioapic_pin); + if (!ret) + setup_ioapic_irq(attr->ioapic, attr->ioapic_pin, irq, cfg, + attr->trigger, attr->polarity); + return ret; +} + int __init io_apic_get_redir_entries (int ioapic) { union IO_APIC_reg_01 reg_01; From f880ec78fabebc58180778d223600e9be7b48502 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 23 Feb 2011 13:07:54 +0100 Subject: [PATCH 440/706] x86: ioapic: Use new setup function in pre_init_apic_IRQ0() Remove the duplicated 
code and call the function. It does not matter whether we allocated the cfg before calling setup_local_APIC() and we can set the irq chip and handler after that as well. Signed-off-by: Thomas Gleixner --- arch/x86/kernel/apic/io_apic.c | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 6deb3ca62fd6..51c8bd11c136 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -4103,20 +4103,15 @@ void __init mp_register_ioapic(int id, u32 address, u32 gsi_base) /* Enable IOAPIC early just for system timer */ void __init pre_init_apic_IRQ0(void) { - struct irq_cfg *cfg; + struct io_apic_irq_attr attr = { 0, 0, 0, 0 }; printk(KERN_INFO "Early APIC setup for system timer0\n"); #ifndef CONFIG_SMP physid_set_mask_of_physid(boot_cpu_physical_apicid, &phys_cpu_present_map); #endif - /* Make sure the irq descriptor is set up */ - cfg = alloc_irq_and_cfg_at(0, 0); - setup_local_APIC(); - add_pin_to_irq_node(cfg, 0, 0, 0); + io_apic_setup_irq_pin(0, 0, &attr); set_irq_chip_and_handler_name(0, &ioapic_chip, handle_edge_irq, "edge"); - - setup_ioapic_irq(0, 0, 0, cfg, 0, 0); } From e0799c04b2080e0832538a911361f962c93fb744 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 23 Feb 2011 14:10:54 +0100 Subject: [PATCH 441/706] x86: ioapic: Use setup function in __io_apic_set_pci_routing() The only difference here is that we did not call __add_pin_to_irq_node() for the legacy irqs, but that's not worth 30 lines of extra code. Signed-off-by: Thomas Gleixner --- arch/x86/kernel/apic/io_apic.c | 34 +++------------------------------- 1 file changed, 3 insertions(+), 31 deletions(-) diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 51c8bd11c136..a655bd8fb063 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -3679,45 +3679,17 @@ int __init arch_probe_nr_irqs(void) static int __io_apic_set_pci_routing(struct device *dev, int irq, struct io_apic_irq_attr *irq_attr) { - struct irq_cfg *cfg; int node; - int ioapic, pin; - int trigger, polarity; - ioapic = irq_attr->ioapic; if (!IO_APIC_IRQ(irq)) { apic_printk(APIC_QUIET,KERN_ERR "IOAPIC[%d]: Invalid reference to IRQ 0\n", - ioapic); + irq_attr->ioapic); return -EINVAL; } - if (dev) - node = dev_to_node(dev); - else - node = cpu_to_node(0); + node = dev ? dev_to_node(dev) : cpu_to_node(0); - cfg = alloc_irq_and_cfg_at(irq, node); - if (!cfg) - return 0; - - pin = irq_attr->ioapic_pin; - trigger = irq_attr->trigger; - polarity = irq_attr->polarity; - - /* - * IRQs < 16 are already in the irq_2_pin[] map - */ - if (irq >= legacy_pic->nr_legacy_irqs) { - if (__add_pin_to_irq_node(cfg, node, ioapic, pin)) { - printk(KERN_INFO "can not add pin %d for irq %d\n", - pin, irq); - return 0; - } - } - - setup_ioapic_irq(ioapic, pin, irq, cfg, trigger, polarity); - - return 0; + return io_apic_setup_irq_pin(irq, node, irq_attr); } int io_apic_set_pci_routing(struct device *dev, int irq, From 2d57e37dbf648fd6547752b8954f4104a85f4b15 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 23 Feb 2011 14:40:35 +0100 Subject: [PATCH 442/706] x86: ioapic: Use setup function in __io_apic_setup_irqs() Replace the duplicated code. 
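As a hedged sketch of what each duplicated block collapses into, assembled from the helpers introduced earlier in the series (the warning message is illustrative, not kernel output):

        struct io_apic_irq_attr attr;

        /* Describe the routing: IO-APIC index, pin, trigger and polarity. */
        set_io_apic_irq_attr(&attr, apic_id, pin, irq_trigger(idx),
                             irq_polarity(idx));

        /*
         * One call now allocates the irq_cfg, links the pin to the irq and
         * programs the redirection entry; a non-zero return means failure.
         */
        if (io_apic_setup_irq_pin(irq, node, &attr))
                pr_warn("IO-APIC %d pin %d setup failed\n", apic_id, pin);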
Signed-off-by: Thomas Gleixner --- arch/x86/kernel/apic/io_apic.c | 17 +++++------------ 1 file changed, 5 insertions(+), 12 deletions(-) diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index a655bd8fb063..63140d86759f 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -1401,8 +1401,8 @@ static bool __init io_apic_pin_not_connected(int idx, int apic_id, int pin) static void __init __io_apic_setup_irqs(unsigned int apic_id) { int idx, node = cpu_to_node(0); + struct io_apic_irq_attr attr; unsigned int pin, irq; - struct irq_cfg *cfg; for (pin = 0; pin < nr_ioapic_registers[apic_id]; pin++) { idx = find_irq_entry(apic_id, pin, mp_INT); @@ -1419,20 +1419,13 @@ static void __init __io_apic_setup_irqs(unsigned int apic_id) * installed and if it returns 1: */ if (apic->multi_timer_check && - apic->multi_timer_check(apic_id, irq)) + apic->multi_timer_check(apic_id, irq)) continue; - cfg = alloc_irq_and_cfg_at(irq, node); - if (!cfg) - continue; + set_io_apic_irq_attr(&attr, apic_id, pin, irq_trigger(idx), + irq_polarity(idx)); - add_pin_to_irq_node(cfg, node, apic_id, pin); - /* - * don't mark it in pin_programmed, so later acpi could - * set it correctly when irq < 16 - */ - setup_ioapic_irq(apic_id, pin, irq, cfg, irq_trigger(idx), - irq_polarity(idx)); + io_apic_setup_irq_pin(irq, node, &attr); } } From da1ad9d7b2477594e8ff43706644ba8a375ad62a Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 23 Feb 2011 14:52:16 +0100 Subject: [PATCH 443/706] x86: ioapic: Use setup function in setup_IO_APIC_irq_extra() Another version of the same thing. Mark the pin as programmed only when the setup function succeeds. Signed-off-by: Thomas Gleixner --- arch/x86/kernel/apic/io_apic.c | 16 ++++++---------- 1 file changed, 6 insertions(+), 10 deletions(-) diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 63140d86759f..cfd9611036cc 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -1447,7 +1447,7 @@ static void __init setup_IO_APIC_irqs(void) void setup_IO_APIC_irq_extra(u32 gsi) { int apic_id = 0, pin, idx, irq, node = cpu_to_node(0); - struct irq_cfg *cfg; + struct io_apic_irq_attr attr; /* * Convert 'gsi' to 'ioapic.pin'. @@ -1467,21 +1467,17 @@ void setup_IO_APIC_irq_extra(u32 gsi) if (apic_id == 0 || irq < NR_IRQS_LEGACY) return; - cfg = alloc_irq_and_cfg_at(irq, node); - if (!cfg) - return; - - add_pin_to_irq_node(cfg, node, apic_id, pin); - if (test_bit(pin, mp_ioapic_routing[apic_id].pin_programmed)) { pr_debug("Pin %d-%d already programmed\n", mp_ioapics[apic_id].apicid, pin); return; } - set_bit(pin, mp_ioapic_routing[apic_id].pin_programmed); - setup_ioapic_irq(apic_id, pin, irq, cfg, - irq_trigger(idx), irq_polarity(idx)); + set_io_apic_irq_attr(&attr, apic_id, pin, irq_trigger(idx), + irq_polarity(idx)); + + if (!io_apic_setup_irq_pin(irq, node, &attr)) + set_bit(pin, mp_ioapic_routing[apic_id].pin_programmed); } /* From 41098ffe050c4befe5fc21a5cedd42ebbd6f7469 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 23 Feb 2011 16:08:03 +0100 Subject: [PATCH 444/706] x86: ioapic: Make a few functions static No users outside of io_apic.c. Mark bad_ioapic() __init while at it.
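As a reminder of what the two annotations buy -- a hedged sketch, not kernel code (the function name is invented):

        /*
         * static: the symbol leaves the kernel-wide namespace, so the extern
         * declarations can be deleted from io_apic.h (as the diff below does).
         * __init: the code lands in .init.text and is freed once boot
         * completes, which is only safe because all callers are themselves
         * boot-time code.
         */
        static int __init probe_boot_only(void)
        {
                return 0;
        }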
Signed-off-by: Thomas Gleixner --- arch/x86/include/asm/io_apic.h | 5 ---- arch/x86/kernel/apic/io_apic.c | 55 +++++++++++++++++----------------- 2 files changed, 28 insertions(+), 32 deletions(-) diff --git a/arch/x86/include/asm/io_apic.h b/arch/x86/include/asm/io_apic.h index 37dbb3fae395..be61e22dd9ea 100644 --- a/arch/x86/include/asm/io_apic.h +++ b/arch/x86/include/asm/io_apic.h @@ -139,11 +139,6 @@ extern int timer_through_8259; #define io_apic_assign_pci_irqs \ (mp_irq_entries && !skip_ioapic_setup && io_apic_irqs) -extern u8 io_apic_unique_id(u8 id); -extern int io_apic_get_unique_id(int ioapic, int apic_id); -extern int io_apic_get_version(int ioapic); -extern int io_apic_get_redir_entries(int ioapic); - struct io_apic_irq_attr; extern int io_apic_set_pci_routing(struct device *dev, int irq, struct io_apic_irq_attr *irq_attr); diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index cfd9611036cc..7344b428e08d 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -3611,7 +3611,7 @@ io_apic_setup_irq_pin(unsigned int irq, int node, struct io_apic_irq_attr *attr) return ret; } -int __init io_apic_get_redir_entries (int ioapic) +static int __init io_apic_get_redir_entries(int ioapic) { union IO_APIC_reg_01 reg_01; unsigned long flags; @@ -3702,31 +3702,8 @@ int io_apic_set_pci_routing(struct device *dev, int irq, return __io_apic_set_pci_routing(dev, irq, irq_attr); } -u8 __init io_apic_unique_id(u8 id) -{ #ifdef CONFIG_X86_32 - if ((boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) && - !APIC_XAPIC(apic_version[boot_cpu_physical_apicid])) - return io_apic_get_unique_id(nr_ioapics, id); - else - return id; -#else - int i; - DECLARE_BITMAP(used, 256); - - bitmap_zero(used, 256); - for (i = 0; i < nr_ioapics; i++) { - struct mpc_ioapic *ia = &mp_ioapics[i]; - __set_bit(ia->apicid, used); - } - if (!test_bit(id, used)) - return id; - return find_first_zero_bit(used, 256); -#endif -} - -#ifdef CONFIG_X86_32 -int __init io_apic_get_unique_id(int ioapic, int apic_id) +static int __init io_apic_get_unique_id(int ioapic, int apic_id) { union IO_APIC_reg_00 reg_00; static physid_mask_t apic_id_map = PHYSID_MASK_NONE; @@ -3799,9 +3776,33 @@ int __init io_apic_get_unique_id(int ioapic, int apic_id) return apic_id; } + +static u8 __init io_apic_unique_id(u8 id) +{ + if ((boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) && + !APIC_XAPIC(apic_version[boot_cpu_physical_apicid])) + return io_apic_get_unique_id(nr_ioapics, id); + else + return id; +} +#else +static u8 __init io_apic_unique_id(u8 id) +{ + int i; + DECLARE_BITMAP(used, 256); + + bitmap_zero(used, 256); + for (i = 0; i < nr_ioapics; i++) { + struct mpc_ioapic *ia = &mp_ioapics[i]; + __set_bit(ia->apicid, used); + } + if (!test_bit(id, used)) + return id; + return find_first_zero_bit(used, 256); +} #endif -int __init io_apic_get_version(int ioapic) +static int __init io_apic_get_version(int ioapic) { union IO_APIC_reg_01 reg_01; unsigned long flags; @@ -4004,7 +4005,7 @@ int mp_find_ioapic_pin(int ioapic, u32 gsi) return gsi - mp_gsi_routing[ioapic].gsi_base; } -static int bad_ioapic(unsigned long address) +static __init int bad_ioapic(unsigned long address) { if (nr_ioapics >= MAX_IO_APICS) { printk(KERN_WARNING "WARING: Max # of I/O APICs (%d) exceeded " From b77cf6a8609a8450786c572bc8af6ad068022dbe Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 23 Feb 2011 17:33:53 +0100 Subject: [PATCH 445/706] x86: ioapic: Remove useless inlines There is no point to have irq_trigger() and 
irq_polarity() as wrappers around the MPBIOS_* camel case functions. Get rid of both the inlines and the ugly camel case. Signed-off-by: Thomas Gleixner --- arch/x86/kernel/apic/io_apic.c | 14 ++------------ 1 file changed, 2 insertions(+), 12 deletions(-) diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 7344b428e08d..2d49e4b41c2d 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -821,7 +821,7 @@ static int EISA_ELCR(unsigned int irq) #define default_MCA_trigger(idx) (1) #define default_MCA_polarity(idx) default_ISA_polarity(idx) -static int MPBIOS_polarity(int idx) +static int irq_polarity(int idx) { int bus = mp_irqs[idx].srcbus; int polarity; @@ -863,7 +863,7 @@ static int MPBIOS_polarity(int idx) return polarity; } -static int MPBIOS_trigger(int idx) +static int irq_trigger(int idx) { int bus = mp_irqs[idx].srcbus; int trigger; @@ -935,16 +935,6 @@ static int MPBIOS_trigger(int idx) return trigger; } -static inline int irq_polarity(int idx) -{ - return MPBIOS_polarity(idx); -} - -static inline int irq_trigger(int idx) -{ - return MPBIOS_trigger(idx); -} - static int pin_2_irq(int idx, int apic, int pin) { int irq; From 710dcda64369e3f3704a0eee502ce27dbf9fedc1 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 23 Feb 2011 17:47:41 +0100 Subject: [PATCH 446/706] x86: ioapic: Implement and use io_apic_setup_irq_pin_once() io_apic_set_pci_routing() and setup_IO_APIC_irq_extra() check the pin_programmed bit before calling io_apic_setup_irq_pin() and set the bit when the pin was set up. Move that duplicated code into a separate function and use it. Signed-off-by: Thomas Gleixner --- arch/x86/kernel/apic/io_apic.c | 57 +++++++++++++++------------------- 1 file changed, 25 insertions(+), 32 deletions(-) diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 2d49e4b41c2d..46913ef88ea4 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -128,6 +128,9 @@ static int __init parse_noapic(char *str) } early_param("noapic", parse_noapic); +static int io_apic_setup_irq_pin_once(unsigned int irq, int node, + struct io_apic_irq_attr *attr); + /* Will be called in mpparse/acpi/sfi codes for saving IRQ info */ void mp_save_irq(struct mpc_intsrc *m) { @@ -1457,17 +1460,10 @@ void setup_IO_APIC_irq_extra(u32 gsi) if (apic_id == 0 || irq < NR_IRQS_LEGACY) return; - if (test_bit(pin, mp_ioapic_routing[apic_id].pin_programmed)) { pr_debug("Pin %d-%d already programmed\n", mp_ioapics[apic_id].apicid, pin); return; } - set_io_apic_irq_attr(&attr, apic_id, pin, irq_trigger(idx), irq_polarity(idx)); - if (!io_apic_setup_irq_pin(irq, node, &attr)) - set_bit(pin, mp_ioapic_routing[apic_id].pin_programmed); + io_apic_setup_irq_pin_once(irq, node, &attr); } /* @@ -3601,6 +3597,24 @@ io_apic_setup_irq_pin(unsigned int irq, int node, struct io_apic_irq_attr *attr) return ret; } +static int io_apic_setup_irq_pin_once(unsigned int irq, int node, + struct io_apic_irq_attr *attr) +{ + unsigned int id = attr->ioapic, pin = attr->ioapic_pin; + int ret; + + /* Avoid redundant programming */ + if (test_bit(pin, mp_ioapic_routing[id].pin_programmed)) { + pr_debug("Pin %d-%d already programmed\n", + mp_ioapics[id].apicid, pin); + return 0; + } + ret = io_apic_setup_irq_pin(irq, node, attr); + if (!ret) + set_bit(pin, mp_ioapic_routing[id].pin_programmed); + return ret; +} + static int __init io_apic_get_redir_entries(int ioapic) { union IO_APIC_reg_01 reg_01; unsigned long flags; @@ -3655,8 +3669,8 @@ int __init
arch_probe_nr_irqs(void) } #endif -static int __io_apic_set_pci_routing(struct device *dev, int irq, - struct io_apic_irq_attr *irq_attr) +int io_apic_set_pci_routing(struct device *dev, int irq, + struct io_apic_irq_attr *irq_attr) { int node; @@ -3668,28 +3682,7 @@ static int __io_apic_set_pci_routing(struct device *dev, int irq, node = dev ? dev_to_node(dev) : cpu_to_node(0); - return io_apic_setup_irq_pin(irq, node, irq_attr); -} - -int io_apic_set_pci_routing(struct device *dev, int irq, - struct io_apic_irq_attr *irq_attr) -{ - int ioapic, pin; - /* - * Avoid pin reprogramming. PRTs typically include entries - * with redundant pin->gsi mappings (but unique PCI devices); - * we only program the IOAPIC on the first. - */ - ioapic = irq_attr->ioapic; - pin = irq_attr->ioapic_pin; - if (test_bit(pin, mp_ioapic_routing[ioapic].pin_programmed)) { - pr_debug("Pin %d-%d already programmed\n", - mp_ioapics[ioapic].apicid, pin); - return 0; - } - set_bit(pin, mp_ioapic_routing[ioapic].pin_programmed); - - return __io_apic_set_pci_routing(dev, irq, irq_attr); + return io_apic_setup_irq_pin_once(irq, node, irq_attr); } #ifdef CONFIG_X86_32 From abb0052289e58140d933b29491f59e4be0a19727 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 23 Feb 2011 19:54:53 +0100 Subject: [PATCH 447/706] x86: ioapic: Move trigger defines to io_apic.h Required for devicetree based io_apic configuration. Signed-off-by: Thomas Gleixner --- arch/x86/include/asm/io_apic.h | 4 ++++ arch/x86/kernel/apic/io_apic.c | 4 ---- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/arch/x86/include/asm/io_apic.h b/arch/x86/include/asm/io_apic.h index be61e22dd9ea..c4bd267dfc50 100644 --- a/arch/x86/include/asm/io_apic.h +++ b/arch/x86/include/asm/io_apic.h @@ -95,6 +95,10 @@ struct IR_IO_APIC_route_entry { index : 15; } __attribute__ ((packed)); +#define IOAPIC_AUTO -1 +#define IOAPIC_EDGE 0 +#define IOAPIC_LEVEL 1 + #ifdef CONFIG_X86_IO_APIC /* diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 46913ef88ea4..8d23e831a45e 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -1216,10 +1216,6 @@ void __setup_vector_irq(int cpu) static struct irq_chip ioapic_chip; static struct irq_chip ir_ioapic_chip; -#define IOAPIC_AUTO -1 -#define IOAPIC_EDGE 0 -#define IOAPIC_LEVEL 1 - #ifdef CONFIG_X86_32 static inline int IO_APIC_irq_trigger(int irq) { From f1c2b357148ec27fcc6ce0992211209a0ea20d8f Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Tue, 22 Feb 2011 21:07:36 +0100 Subject: [PATCH 448/706] x86: e820: Remove conditional early mapping in parse_e820_ext This patch ensures that the memory passed from parse_setup_data() is large enough to cover the complete data structure. That means that the conditional mapping in parse_e820_ext() can go. While here, I also attempt not to map two pages if the address is not aligned to a page boundary. 
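Distilled from the hunk that follows, the two-stage mapping pattern looks like this (a sketch; declarations and the surrounding loop are omitted):

        /* Map up to the end of the current page, but always enough to read
         * the setup_data header, even when pa_data is not page aligned. */
        map_len = max(PAGE_SIZE - (pa_data & ~PAGE_MASK),
                      (u64)sizeof(struct setup_data));
        data = early_memremap(pa_data, map_len);

        /* The header is now readable; grow the mapping only if the payload
         * spills past what the first mapping covers. */
        data_len = data->len + sizeof(struct setup_data);
        if (data_len > map_len) {
                early_iounmap(data, map_len);
                data = early_memremap(pa_data, data_len);
                map_len = data_len;
        }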
Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Dirk Brandewie Cc: sodaville@linutronix.de Cc: devicetree-discuss@lists.ozlabs.org LKML-Reference: <1298405266-1624-2-git-send-email-bigeasy@linutronix.de> Signed-off-by: Thomas Gleixner --- arch/x86/include/asm/e820.h | 2 +- arch/x86/kernel/e820.c | 8 +------- arch/x86/kernel/setup.c | 17 ++++++++++++++--- 3 files changed, 16 insertions(+), 11 deletions(-) diff --git a/arch/x86/include/asm/e820.h b/arch/x86/include/asm/e820.h index e99d55d74df5..908b96957d88 100644 --- a/arch/x86/include/asm/e820.h +++ b/arch/x86/include/asm/e820.h @@ -96,7 +96,7 @@ extern void e820_setup_gap(void); extern int e820_search_gap(unsigned long *gapstart, unsigned long *gapsize, unsigned long start_addr, unsigned long long end_addr); struct setup_data; -extern void parse_e820_ext(struct setup_data *data, unsigned long pa_data); +extern void parse_e820_ext(struct setup_data *data); #if defined(CONFIG_X86_64) || \ (defined(CONFIG_X86_32) && defined(CONFIG_HIBERNATION)) diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c index 294f26da0c0c..5fad62684651 100644 --- a/arch/x86/kernel/e820.c +++ b/arch/x86/kernel/e820.c @@ -667,21 +667,15 @@ __init void e820_setup_gap(void) * boot_params.e820_map, others are passed via SETUP_E820_EXT node of * linked list of struct setup_data, which is parsed here. */ -void __init parse_e820_ext(struct setup_data *sdata, unsigned long pa_data) +void __init parse_e820_ext(struct setup_data *sdata) { - u32 map_len; int entries; struct e820entry *extmap; entries = sdata->len / sizeof(struct e820entry); - map_len = sdata->len + sizeof(struct setup_data); - if (map_len > PAGE_SIZE) - sdata = early_ioremap(pa_data, map_len); extmap = (struct e820entry *)(sdata->data); __append_e820_map(extmap, entries); sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map); - if (map_len > PAGE_SIZE) - early_iounmap(sdata, map_len); printk(KERN_INFO "extended physical RAM map:\n"); e820_print_map("extended"); } diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index ca2f10622a79..9521483ce58c 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -429,16 +429,27 @@ static void __init parse_setup_data(void) return; pa_data = boot_params.hdr.setup_data; while (pa_data) { - data = early_memremap(pa_data, PAGE_SIZE); + u32 data_len, map_len; + + map_len = max(PAGE_SIZE - (pa_data & ~PAGE_MASK), + (u64)sizeof(struct setup_data)); + data = early_memremap(pa_data, map_len); + data_len = data->len + sizeof(struct setup_data); + if (data_len > map_len) { + early_iounmap(data, map_len); + data = early_memremap(pa_data, data_len); + map_len = data_len; + } + switch (data->type) { case SETUP_E820_EXT: - parse_e820_ext(data, pa_data); + parse_e820_ext(data); break; default: break; } pa_data = data->next; - early_iounmap(data, PAGE_SIZE); + early_iounmap(data, map_len); } } From da6b737b9ab768dd06bb4b0395131d10e524cf83 Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Tue, 22 Feb 2011 21:07:37 +0100 Subject: [PATCH 449/706] x86: Add device tree support This patch adds minimal support for device tree on x86. The device tree blob is passed to the kernel via setup_data which requires at least boot protocol 2.09. Memory size, restricted memory regions, boot arguments are gathered the traditional way so things like cmd_line are just here to let the code compile. 
The current plan is to use the device tree as an extension and to gather information which can not be enumerated and would have to be hardcoded otherwise. This includes things like - which devices are on this I2C/SPI bus? - how are the interrupts wired to IO APIC? - where could my hpet be? Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Dirk Brandewie Acked-by: Grant Likely Cc: sodaville@linutronix.de Cc: devicetree-discuss@lists.ozlabs.org LKML-Reference: <1298405266-1624-3-git-send-email-bigeasy@linutronix.de> Signed-off-by: Thomas Gleixner --- .../devicetree/booting-without-of.txt | 20 ++++++++ arch/x86/Kconfig | 2 + arch/x86/include/asm/bootparam.h | 1 + arch/x86/include/asm/irq.h | 3 -- arch/x86/include/asm/prom.h | 48 ++++++++++++++++- arch/x86/kernel/Makefile | 1 + arch/x86/kernel/devicetree.c | 51 +++++++++++++++++++ arch/x86/kernel/irq.c | 9 ---- arch/x86/kernel/setup.c | 4 ++ 9 files changed, 126 insertions(+), 13 deletions(-) create mode 100644 arch/x86/kernel/devicetree.c diff --git a/Documentation/devicetree/booting-without-of.txt b/Documentation/devicetree/booting-without-of.txt index 28b1c9d3d351..55fd2623445b 100644 --- a/Documentation/devicetree/booting-without-of.txt +++ b/Documentation/devicetree/booting-without-of.txt @@ -13,6 +13,7 @@ Table of Contents I - Introduction 1) Entry point for arch/powerpc + 2) Entry point for arch/x86 II - The DT block format 1) Header @@ -225,6 +226,25 @@ it with special cases. cannot support both configurations with Book E and configurations with classic Powerpc architectures. +2) Entry point for arch/x86 +------------------------------- + + There is one single 32bit entry point to the kernel at code32_start, + the decompressor (the real mode entry point goes to the same 32bit + entry point once it switched into protected mode). That entry point + supports one calling convention which is documented in + Documentation/x86/boot.txt + The physical pointer to the device-tree block (defined in chapter II) + is passed via setup_data which requires at least boot protocol 2.09. + The type field is defined as + + #define SETUP_DTB 2 + + This device-tree is used as an extension to the "boot page". As such it + does not parse / consider data which is already covered by the boot + page. This includes memory size, reserved ranges, command line arguments + or initrd address. It simply holds information which can not be retrieved + otherwise like interrupt routing or a list of devices behind an I2C bus. II - The DT block format ======================== diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index efbae0c7521f..b4c2e9c67623 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -382,6 +382,8 @@ config X86_INTEL_CE depends on X86_32 depends on X86_EXTENDED_PLATFORM select X86_REBOOTFIXUPS + select OF + select OF_EARLY_FLATTREE ---help--- Select for the Intel CE media processor (CE4100) SOC.
This option compiles in support for the CE4100 SOC for settop diff --git a/arch/x86/include/asm/bootparam.h b/arch/x86/include/asm/bootparam.h index c8bfe63a06de..e020d88ec02d 100644 --- a/arch/x86/include/asm/bootparam.h +++ b/arch/x86/include/asm/bootparam.h @@ -12,6 +12,7 @@ /* setup data types */ #define SETUP_NONE 0 #define SETUP_E820_EXT 1 +#define SETUP_DTB 2 /* extensible setup data list node */ struct setup_data { diff --git a/arch/x86/include/asm/irq.h b/arch/x86/include/asm/irq.h index c704b38c57a2..ba870bb6dd8e 100644 --- a/arch/x86/include/asm/irq.h +++ b/arch/x86/include/asm/irq.h @@ -10,9 +10,6 @@ #include #include -/* Even though we don't support this, supply it to appease OF */ -static inline void irq_dispose_mapping(unsigned int virq) { } - static inline int irq_canonicalize(int irq) { return ((irq == 2) ? 9 : irq); diff --git a/arch/x86/include/asm/prom.h b/arch/x86/include/asm/prom.h index b4ec95f07518..e46f2a2b57a1 100644 --- a/arch/x86/include/asm/prom.h +++ b/arch/x86/include/asm/prom.h @@ -1 +1,47 @@ -/* dummy prom.h; here to make linux/of.h's #includes happy */ +/* + * Definitions for Device tree / OpenFirmware handling on X86 + * + * based on arch/powerpc/include/asm/prom.h which is + * Copyright (C) 1996-2005 Paul Mackerras. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#ifndef _ASM_X86_PROM_H +#define _ASM_X86_PROM_H +#ifndef __ASSEMBLY__ + +#include +#include + +#include +#include +#include + +#ifdef CONFIG_OF +extern void add_dtb(u64 data); +#else +static inline void add_dtb(u64 data) { } +#endif + +extern char cmd_line[COMMAND_LINE_SIZE]; + +#define pci_address_to_pio pci_address_to_pio +unsigned long pci_address_to_pio(phys_addr_t addr); + +/** + * irq_dispose_mapping - Unmap an interrupt + * @virq: linux virq number of the interrupt to unmap + * + * FIXME: We really should implement proper virq handling like power, + * but that's going to be major surgery. + */ +static inline void irq_dispose_mapping(unsigned int virq) { } + +#define HAVE_ARCH_DEVTREE_FIXUPS + +#endif /* __ASSEMBLY__ */ +#endif diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index 34244b2cd880..6ac5036adf42 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -109,6 +109,7 @@ obj-$(CONFIG_MICROCODE) += microcode.o obj-$(CONFIG_X86_CHECK_BIOS_CORRUPTION) += check.o obj-$(CONFIG_SWIOTLB) += pci-swiotlb.o +obj-$(CONFIG_OF) += devicetree.o ### # 64 bit specific files diff --git a/arch/x86/kernel/devicetree.c b/arch/x86/kernel/devicetree.c new file mode 100644 index 000000000000..0b8f0daef933 --- /dev/null +++ b/arch/x86/kernel/devicetree.c @@ -0,0 +1,51 @@ +/* + * Architecture specific OF callbacks. 
+ */ +#include +#include +#include +#include +#include +#include +#include + +char __initdata cmd_line[COMMAND_LINE_SIZE]; + +unsigned int irq_create_of_mapping(struct device_node *controller, + const u32 *intspec, unsigned int intsize) +{ + return intspec[0]; + +} +EXPORT_SYMBOL_GPL(irq_create_of_mapping); + +unsigned long pci_address_to_pio(phys_addr_t address) +{ + /* + * The ioport address can be directly used by inX / outX + */ + BUG_ON(address >= (1 << 16)); + return (unsigned long)address; +} +EXPORT_SYMBOL_GPL(pci_address_to_pio); + +void __init early_init_dt_scan_chosen_arch(unsigned long node) +{ + BUG(); +} + +void __init early_init_dt_add_memory_arch(u64 base, u64 size) +{ + BUG(); +} + +void * __init early_init_dt_alloc_memory_arch(u64 size, u64 align) +{ + return __alloc_bootmem(size, align, __pa(MAX_DMA_ADDRESS)); +} + +void __init add_dtb(u64 data) +{ + initial_boot_params = phys_to_virt((u64) (u32) data + + offsetof(struct setup_data, data)); +} diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c index 387b6a0c9e81..753136003af1 100644 --- a/arch/x86/kernel/irq.c +++ b/arch/x86/kernel/irq.c @@ -276,15 +276,6 @@ void smp_x86_platform_ipi(struct pt_regs *regs) EXPORT_SYMBOL_GPL(vector_used_by_percpu_irq); -#ifdef CONFIG_OF -unsigned int irq_create_of_mapping(struct device_node *controller, - const u32 *intspec, unsigned int intsize) -{ - return intspec[0]; -} -EXPORT_SYMBOL_GPL(irq_create_of_mapping); -#endif - #ifdef CONFIG_HOTPLUG_CPU /* A cpu has been removed from cpu_online_mask. Reset irq affinities. */ void fixup_irqs(void) diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 9521483ce58c..33dcbce1ab0f 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -113,6 +113,7 @@ #endif #include #include +#include /* * end_pfn only includes RAM, while max_pfn_mapped includes all e820 entries. @@ -445,6 +446,9 @@ static void __init parse_setup_data(void) case SETUP_E820_EXT: parse_e820_ext(data); break; + case SETUP_DTB: + add_dtb(pa_data); + break; default: break; } From df2634f43f5106947f3735a0b61a6527a4b278cd Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Tue, 22 Feb 2011 21:07:38 +0100 Subject: [PATCH 450/706] x86: dtb: Add a device tree for CE4100 History: v1..v2: - dropped device_type except for cpu & pci. I have the compatible string for pci so I can drop the device_type once it is possible - I lowercased all compatible types. I will need to resend some patches which have upper case intel - The cpu had the same compatible string as the soc node. So I added to the soc node -immr for internel memory mapped registers. - I added generic names for all parts. - I reworked the i2c bars matching the way you suggested. I added a compatible node for the PCI device which only the PCI ids in its compatible string. The bars (each represents a complete i2c controller) have a "intel,ce4100-i2c-controller" compatible node. It is not used by the driver. The driver is probed via PCI ids (by the pci subsystem not OF) and matches the bar address against the ressource in the child node. Once there is a hit the node is attached. - The SPI driver is also probed via pci. However I also attached a compatible property based on PCI ids v2..v3: - intel,ce4100-immr become intel,ce4100-cp. cp stands for core peripherals. The Atom data sheet talks here about ACPI devices. Since we don't have ACPI this does not apply here. - The interrupt map is gone. There are now plenty of device nodes. 
- The "unit address string" got fixed, it uses not DD,V format. v3..v4: - added descriptions for compatible nodes introduced here: - intel,ce4100-ioapic - intel,ce4100-lapic - intel,ce4100-hpet - intel,ce4100 - intel,ce4100-cp - intel,ce4100-pci - added a description about I2C controller magic. - Added gpio-controller and gpio-cells property to gpio devices. Those properties are not (yet) used. Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Dirk Brandewie Acked-by: Grant Likely Cc: sodaville@linutronix.de Cc: devicetree-discuss@lists.ozlabs.org LKML-Reference: <1298405266-1624-4-git-send-email-bigeasy@linutronix.de> Signed-off-by: Thomas Gleixner --- .../devicetree/bindings/i2c/ce4100-i2c.txt | 93 ++++ .../devicetree/bindings/x86/ce4100.txt | 38 ++ .../devicetree/bindings/x86/interrupt.txt | 29 ++ .../devicetree/bindings/x86/timer.txt | 6 + arch/x86/platform/ce4100/falconfalls.dts | 428 ++++++++++++++++++ 5 files changed, 594 insertions(+) create mode 100644 Documentation/devicetree/bindings/i2c/ce4100-i2c.txt create mode 100644 Documentation/devicetree/bindings/x86/ce4100.txt create mode 100644 Documentation/devicetree/bindings/x86/interrupt.txt create mode 100644 Documentation/devicetree/bindings/x86/timer.txt create mode 100644 arch/x86/platform/ce4100/falconfalls.dts diff --git a/Documentation/devicetree/bindings/i2c/ce4100-i2c.txt b/Documentation/devicetree/bindings/i2c/ce4100-i2c.txt new file mode 100644 index 000000000000..569b16248514 --- /dev/null +++ b/Documentation/devicetree/bindings/i2c/ce4100-i2c.txt @@ -0,0 +1,93 @@ +CE4100 I2C +---------- + +CE4100 has one PCI device which is described as the I2C-Controller. This +PCI device has three PCI-bars, each bar contains a complete I2C +controller. So we have a total of three independent I2C-Controllers +which share only an interrupt line. +The driver is probed via the PCI-ID and is gathering the information of +attached devices from the devices tree. +Grant Likely recommended to use the ranges property to map the PCI-Bar +number to its physical address and to use this to find the child nodes +of the specific I2C controller. This were his exact words: + + Here's where the magic happens. Each entry in + ranges describes how the parent pci address space + (middle group of 3) is translated to the local + address space (first group of 2) and the size of + each range (last cell). In this particular case, + the first cell of the local address is chosen to be + 1:1 mapped to the BARs, and the second is the + offset from be base of the BAR (which would be + non-zero if you had 2 or more devices mapped off + the same BAR) + + ranges allows the address mapping to be described + in a way that the OS can interpret without + requiring custom device driver code. + +This is an example which is used on FalconFalls: +------------------------------------------------ + i2c-controller@b,2 { + #address-cells = <2>; + #size-cells = <1>; + compatible = "pci8086,2e68.2", + "pci8086,2e68", + "pciclass,ff0000", + "pciclass,ff00"; + + reg = <0x15a00 0x0 0x0 0x0 0x0>; + interrupts = <16 1>; + + /* as described by Grant, the first number in the group of + * three is the bar number followed by the 64bit bar address + * followed by size of the mapping. The bar address + * requires also a valid translation in parents ranges + * property. 
+ */ + ranges = <0 0 0x02000000 0 0xdffe0500 0x100 + 1 0 0x02000000 0 0xdffe0600 0x100 + 2 0 0x02000000 0 0xdffe0700 0x100>; + + i2c@0 { + #address-cells = <1>; + #size-cells = <0>; + compatible = "intel,ce4100-i2c-controller"; + + /* The first number in the reg property is the + * number of the bar + */ + reg = <0 0 0x100>; + + /* This I2C controller has no devices */ + }; + + i2c@1 { + #address-cells = <1>; + #size-cells = <0>; + compatible = "intel,ce4100-i2c-controller"; + reg = <1 0 0x100>; + + /* This I2C controller has one gpio controller */ + gpio@26 { + #gpio-cells = <2>; + compatible = "ti,pcf8575"; + reg = <0x26>; + gpio-controller; + }; + }; + + i2c@2 { + #address-cells = <1>; + #size-cells = <0>; + compatible = "intel,ce4100-i2c-controller"; + reg = <2 0 0x100>; + + gpio@26 { + #gpio-cells = <2>; + compatible = "ti,pcf8575"; + reg = <0x26>; + gpio-controller; + }; + }; + }; diff --git a/Documentation/devicetree/bindings/x86/ce4100.txt b/Documentation/devicetree/bindings/x86/ce4100.txt new file mode 100644 index 000000000000..b49ae593a60b --- /dev/null +++ b/Documentation/devicetree/bindings/x86/ce4100.txt @@ -0,0 +1,38 @@ +CE4100 Device Tree Bindings +--------------------------- + +The CE4100 SoC uses for in core peripherals the following compatible +format: ,-. +Many of the "generic" devices like HPET or IO APIC have the ce4100 +name in their compatible property because they first appeared in this +SoC. + +The CPU node +------------ + cpu@0 { + device_type = "cpu"; + compatible = "intel,ce4100"; + reg = <0>; + lapic = <&lapic0>; + }; + +The reg property describes the CPU number. The lapic property points to +the local APIC timer. + +The SoC node +------------ + +This node describes the in-core peripherals. Required property: + compatible = "intel,ce4100-cp"; + +The PCI node +------------ +This node describes the PCI bus on the SoC. Its property should be + compatible = "intel,ce4100-pci", "pci"; + +If the OS is using the IO-APIC for interrupt routing then the reported +interrupt numbers for devices is no longer true. In order to obtain the +correct interrupt number, the child node which represents the device has +to contain the interrupt property. Besides the interrupt property it has +to contain at least the reg property containing the PCI bus address and +compatible property according to "PCI Bus Binding Revision 2.1". diff --git a/Documentation/devicetree/bindings/x86/interrupt.txt b/Documentation/devicetree/bindings/x86/interrupt.txt new file mode 100644 index 000000000000..8b0efb097e60 --- /dev/null +++ b/Documentation/devicetree/bindings/x86/interrupt.txt @@ -0,0 +1,29 @@ +Interrupt chips +--------------- + +* Intel I/O Advanced Programmable Interrupt Controller (IO APIC) + + Required properties: + -------------------- + compatible = "intel,ce4100-ioapic"; + #interrupt-cells = <2>; + + Device's interrupt property: + + interrupts =
<P S>
; + + The first number (P) represents the interrupt pin which is wired to the + IO APIC. The second number (S) represents the sense of interrupt which + should be configured and can be one of: + 0 - Edge Rising + 1 - Level Low + 2 - Level High + 3 - Edge Falling + +* Local APIC + Required property: + + compatible = "intel,ce4100-lapic"; + + This node is currently unused by Linux as the address of the local APIC + read from a MSR. diff --git a/Documentation/devicetree/bindings/x86/timer.txt b/Documentation/devicetree/bindings/x86/timer.txt new file mode 100644 index 000000000000..c688af58e3bd --- /dev/null +++ b/Documentation/devicetree/bindings/x86/timer.txt @@ -0,0 +1,6 @@ +Timers +------ + +* High Precision Event Timer (HPET) + Required property: + compatible = "intel,ce4100-hpet"; diff --git a/arch/x86/platform/ce4100/falconfalls.dts b/arch/x86/platform/ce4100/falconfalls.dts new file mode 100644 index 000000000000..dc701ea58546 --- /dev/null +++ b/arch/x86/platform/ce4100/falconfalls.dts @@ -0,0 +1,428 @@ +/* + * CE4100 on Falcon Falls + * + * (c) Copyright 2010 Intel Corporation + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the + * Free Software Foundation; version 2 of the License. + */ +/dts-v1/; +/ { + model = "intel,falconfalls"; + compatible = "intel,falconfalls"; + #address-cells = <1>; + #size-cells = <1>; + + cpus { + #address-cells = <1>; + #size-cells = <0>; + + cpu@0 { + device_type = "cpu"; + compatible = "intel,ce4100"; + reg = <0>; + lapic = <&lapic0>; + }; + }; + + soc@0 { + #address-cells = <1>; + #size-cells = <1>; + compatible = "intel,ce4100-cp"; + ranges; + + ioapic1: interrupt-controller@fec00000 { + #interrupt-cells = <2>; + compatible = "intel,ce4100-ioapic"; + interrupt-controller; + reg = <0xfec00000 0x1000>; + }; + + timer@fed00000 { + compatible = "intel,ce4100-hpet"; + reg = <0xfed00000 0x200>; + }; + + lapic0: interrupt-controller@fee00000 { + compatible = "intel,ce4100-lapic"; + reg = <0xfee00000 0x1000>; + }; + + pci@3fc { + #address-cells = <3>; + #size-cells = <2>; + compatible = "intel,ce4100-pci", "pci"; + device_type = "pci"; + bus-range = <0 0>; + ranges = <0x2000000 0 0xbffff000 0xbffff000 0 0x1000 + 0x2000000 0 0xdffe0000 0xdffe0000 0 0x1000 + 0x0000000 0 0x0 0x0 0 0x100>; + + /* Secondary IO-APIC */ + ioapic2: interrupt-controller@0,1 { + #interrupt-cells = <2>; + compatible = "intel,ce4100-ioapic"; + interrupt-controller; + reg = <0x100 0x0 0x0 0x0 0x0>; + assigned-addresses = <0x02000000 0x0 0xbffff000 0x0 0x1000>; + }; + + pci@1,0 { + #address-cells = <3>; + #size-cells = <2>; + compatible = "intel,ce4100-pci", "pci"; + device_type = "pci"; + bus-range = <1 1>; + ranges = <0x2000000 0 0xdffe0000 0x2000000 0 0xdffe0000 0 0x1000>; + + interrupt-parent = <&ioapic2>; + + display@2,0 { + compatible = "pci8086,2e5b.2", + "pci8086,2e5b", + "pciclass038000", + "pciclass0380"; + + reg = <0x11000 0x0 0x0 0x0 0x0>; + interrupts = <0 1>; + }; + + multimedia@3,0 { + compatible = "pci8086,2e5c.2", + "pci8086,2e5c", + "pciclass048000", + "pciclass0480"; + + reg = <0x11800 0x0 0x0 0x0 0x0>; + interrupts = <2 1>; + }; + + multimedia@4,0 { + compatible = "pci8086,2e5d.2", + "pci8086,2e5d", + "pciclass048000", + "pciclass0480"; + + reg = <0x12000 0x0 0x0 0x0 0x0>; + interrupts = <4 1>; + }; + + multimedia@4,1 { + compatible = "pci8086,2e5e.2", + "pci8086,2e5e", + "pciclass048000", + "pciclass0480"; + + reg = <0x12100 0x0 0x0 0x0 0x0>; + interrupts = <5 1>; + 
}; + + sound@6,0 { + compatible = "pci8086,2e5f.2", + "pci8086,2e5f", + "pciclass040100", + "pciclass0401"; + + reg = <0x13000 0x0 0x0 0x0 0x0>; + interrupts = <6 1>; + }; + + sound@6,1 { + compatible = "pci8086,2e5f.2", + "pci8086,2e5f", + "pciclass040100", + "pciclass0401"; + + reg = <0x13100 0x0 0x0 0x0 0x0>; + interrupts = <7 1>; + }; + + sound@6,2 { + compatible = "pci8086,2e60.2", + "pci8086,2e60", + "pciclass040100", + "pciclass0401"; + + reg = <0x13200 0x0 0x0 0x0 0x0>; + interrupts = <8 1>; + }; + + display@8,0 { + compatible = "pci8086,2e61.2", + "pci8086,2e61", + "pciclass038000", + "pciclass0380"; + + reg = <0x14000 0x0 0x0 0x0 0x0>; + interrupts = <9 1>; + }; + + display@8,1 { + compatible = "pci8086,2e62.2", + "pci8086,2e62", + "pciclass038000", + "pciclass0380"; + + reg = <0x14100 0x0 0x0 0x0 0x0>; + interrupts = <10 1>; + }; + + multimedia@8,2 { + compatible = "pci8086,2e63.2", + "pci8086,2e63", + "pciclass048000", + "pciclass0480"; + + reg = <0x14200 0x0 0x0 0x0 0x0>; + interrupts = <11 1>; + }; + + entertainment-encryption@9,0 { + compatible = "pci8086,2e64.2", + "pci8086,2e64", + "pciclass101000", + "pciclass1010"; + + reg = <0x14800 0x0 0x0 0x0 0x0>; + interrupts = <12 1>; + }; + + localbus@a,0 { + compatible = "pci8086,2e65.2", + "pci8086,2e65", + "pciclassff0000", + "pciclassff00"; + + reg = <0x15000 0x0 0x0 0x0 0x0>; + }; + + serial@b,0 { + compatible = "pci8086,2e66.2", + "pci8086,2e66", + "pciclass070003", + "pciclass0700"; + + reg = <0x15800 0x0 0x0 0x0 0x0>; + interrupts = <14 1>; + }; + + gpio@b,1 { + compatible = "pci8086,2e67.2", + "pci8086,2e67", + "pciclassff0000", + "pciclassff00"; + + #gpio-cells = <2>; + reg = <0x15900 0x0 0x0 0x0 0x0>; + interrupts = <15 1>; + gpio-controller; + }; + + i2c-controller@b,2 { + #address-cells = <2>; + #size-cells = <1>; + compatible = "pci8086,2e68.2", + "pci8086,2e68", + "pciclass,ff0000", + "pciclass,ff00"; + + reg = <0x15a00 0x0 0x0 0x0 0x0>; + interrupts = <16 1>; + ranges = <0 0 0x02000000 0 0xdffe0500 0x100 + 1 0 0x02000000 0 0xdffe0600 0x100 + 2 0 0x02000000 0 0xdffe0700 0x100>; + + i2c@0 { + #address-cells = <1>; + #size-cells = <0>; + compatible = "intel,ce4100-i2c-controller"; + reg = <0 0 0x100>; + }; + + i2c@1 { + #address-cells = <1>; + #size-cells = <0>; + compatible = "intel,ce4100-i2c-controller"; + reg = <1 0 0x100>; + + gpio@26 { + #gpio-cells = <2>; + compatible = "ti,pcf8575"; + reg = <0x26>; + gpio-controller; + }; + }; + + i2c@2 { + #address-cells = <1>; + #size-cells = <0>; + compatible = "intel,ce4100-i2c-controller"; + reg = <2 0 0x100>; + + gpio@26 { + #gpio-cells = <2>; + compatible = "ti,pcf8575"; + reg = <0x26>; + gpio-controller; + }; + }; + }; + + smard-card@b,3 { + compatible = "pci8086,2e69.2", + "pci8086,2e69", + "pciclass070500", + "pciclass0705"; + + reg = <0x15b00 0x0 0x0 0x0 0x0>; + interrupts = <15 1>; + }; + + spi-controller@b,4 { + #address-cells = <1>; + #size-cells = <0>; + compatible = + "pci8086,2e6a.2", + "pci8086,2e6a", + "pciclass,ff0000", + "pciclass,ff00"; + + reg = <0x15c00 0x0 0x0 0x0 0x0>; + interrupts = <15 1>; + + dac@0 { + compatible = "ti,pcm1755"; + reg = <0>; + spi-max-frequency = <115200>; + }; + + dac@1 { + compatible = "ti,pcm1609a"; + reg = <1>; + spi-max-frequency = <115200>; + }; + + eeprom@2 { + compatible = "atmel,at93c46"; + reg = <2>; + spi-max-frequency = <115200>; + }; + }; + + multimedia@b,7 { + compatible = "pci8086,2e6d.2", + "pci8086,2e6d", + "pciclassff0000", + "pciclassff00"; + + reg = <0x15f00 0x0 0x0 0x0 0x0>; + }; + + ethernet@c,0 { + compatible 
= "pci8086,2e6e.2", + "pci8086,2e6e", + "pciclass020000", + "pciclass0200"; + + reg = <0x16000 0x0 0x0 0x0 0x0>; + interrupts = <21 1>; + }; + + clock@c,1 { + compatible = "pci8086,2e6f.2", + "pci8086,2e6f", + "pciclassff0000", + "pciclassff00"; + + reg = <0x16100 0x0 0x0 0x0 0x0>; + interrupts = <3 1>; + }; + + usb@d,0 { + compatible = "pci8086,2e70.2", + "pci8086,2e70", + "pciclass0c0320", + "pciclass0c03"; + + reg = <0x16800 0x0 0x0 0x0 0x0>; + interrupts = <22 3>; + }; + + usb@d,1 { + compatible = "pci8086,2e70.2", + "pci8086,2e70", + "pciclass0c0320", + "pciclass0c03"; + + reg = <0x16900 0x0 0x0 0x0 0x0>; + interrupts = <22 3>; + }; + + sata@e,0 { + compatible = "pci8086,2e71.0", + "pci8086,2e71", + "pciclass010601", + "pciclass0106"; + + reg = <0x17000 0x0 0x0 0x0 0x0>; + interrupts = <23 3>; + }; + + flash@f,0 { + compatible = "pci8086,701.1", + "pci8086,701", + "pciclass050100", + "pciclass0501"; + + reg = <0x17800 0x0 0x0 0x0 0x0>; + interrupts = <13 1>; + }; + + entertainment-encryption@10,0 { + compatible = "pci8086,702.1", + "pci8086,702", + "pciclass101000", + "pciclass1010"; + + reg = <0x18000 0x0 0x0 0x0 0x0>; + }; + + co-processor@11,0 { + compatible = "pci8086,703.1", + "pci8086,703", + "pciclass0b4000", + "pciclass0b40"; + + reg = <0x18800 0x0 0x0 0x0 0x0>; + interrupts = <1 1>; + }; + + multimedia@12,0 { + compatible = "pci8086,704.0", + "pci8086,704", + "pciclass048000", + "pciclass0480"; + + reg = <0x19000 0x0 0x0 0x0 0x0>; + }; + }; + + isa@1f,0 { + #address-cells = <2>; + #size-cells = <1>; + compatible = "isa"; + ranges = <1 0 0 0 0 0x100>; + + rtc@70 { + compatible = "intel,ce4100-rtc", "motorola,mc146818"; + interrupts = <8 3>; + interrupt-parent = <&ioapic1>; + ctrl-reg = <2>; + freq-reg = <0x26>; + reg = <1 0x70 2>; + }; + }; + }; + }; +}; From 19c4f5f7f7e9c5db89a91627af2a426cfb5568de Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Tue, 22 Feb 2011 21:07:39 +0100 Subject: [PATCH 451/706] x86: dtb: Add irq domain abstraction The here introduced irq_domain abstraction represents a generic irq controller. It is a subset of powerpc's irq_host which is going to be renamed to irq_domain and then become generic. This implementation will be removed once it is generic. The xlate callback is resposible to parse irq informations like irq type and number and returns the hardware irq number which is reported by the hardware as active. 
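A hedged sketch of how a controller driver plugs into this abstraction (the structure definition follows in the diff below; my_xlate() and the 1:1 pin-to-hwirq mapping are purely illustrative):

        static int my_xlate(struct irq_domain *d, const u32 *intspec,
                            u32 intsize, u32 *out_hwirq, u32 *out_type)
        {
                if (intsize < 2)
                        return -EINVAL;
                *out_hwirq = intspec[0]; /* pin number from the DT specifier */
                *out_type = intspec[1];  /* a real xlate maps this to IRQ_TYPE_* */
                return 0;
        }

        static struct irq_domain my_domain = {
                .xlate = my_xlate,
        };

        /* At controller discovery time, np being the controller's device_node: */
        my_domain.controller = np;
        add_interrupt_host(&my_domain);

        /* irq_create_of_mapping() later finds the domain via its controller
         * node and calls ->xlate to resolve a DT interrupt specifier. */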
Signed-off-by: Sebastian Andrzej Siewior Tested-by: Dirk Brandewie Acked-by: Grant Likely Cc: sodaville@linutronix.de Cc: devicetree-discuss@lists.ozlabs.org LKML-Reference: <1298405266-1624-5-git-send-email-bigeasy@linutronix.de> Signed-off-by: Thomas Gleixner --- arch/x86/include/asm/irq_controller.h | 12 +++++++ arch/x86/include/asm/prom.h | 2 ++ arch/x86/kernel/devicetree.c | 46 ++++++++++++++++++++++++++- 3 files changed, 59 insertions(+), 1 deletion(-) create mode 100644 arch/x86/include/asm/irq_controller.h diff --git a/arch/x86/include/asm/irq_controller.h b/arch/x86/include/asm/irq_controller.h new file mode 100644 index 000000000000..423bbbddf36d --- /dev/null +++ b/arch/x86/include/asm/irq_controller.h @@ -0,0 +1,12 @@ +#ifndef __IRQ_CONTROLLER__ +#define __IRQ_CONTROLLER__ + +struct irq_domain { + int (*xlate)(struct irq_domain *h, const u32 *intspec, u32 intsize, + u32 *out_hwirq, u32 *out_type); + void *priv; + struct device_node *controller; + struct list_head l; +}; + +#endif diff --git a/arch/x86/include/asm/prom.h b/arch/x86/include/asm/prom.h index e46f2a2b57a1..83833ac33dd1 100644 --- a/arch/x86/include/asm/prom.h +++ b/arch/x86/include/asm/prom.h @@ -20,9 +20,11 @@ #include #include #include +#include #ifdef CONFIG_OF extern void add_dtb(u64 data); +void add_interrupt_host(struct irq_domain *ih); #else static inline void add_dtb(u64 data) { } #endif diff --git a/arch/x86/kernel/devicetree.c b/arch/x86/kernel/devicetree.c index 0b8f0daef933..ef98e4edada1 100644 --- a/arch/x86/kernel/devicetree.c +++ b/arch/x86/kernel/devicetree.c @@ -3,19 +3,63 @@ */ #include #include +#include #include #include #include #include #include +#include + char __initdata cmd_line[COMMAND_LINE_SIZE]; +static LIST_HEAD(irq_domains); +static DEFINE_RAW_SPINLOCK(big_irq_lock); + +void add_interrupt_host(struct irq_domain *ih) +{ + unsigned long flags; + + raw_spin_lock_irqsave(&big_irq_lock, flags); + list_add(&ih->l, &irq_domains); + raw_spin_unlock_irqrestore(&big_irq_lock, flags); +} + +static struct irq_domain *get_ih_from_node(struct device_node *controller) +{ + struct irq_domain *ih, *found = NULL; + unsigned long flags; + + raw_spin_lock_irqsave(&big_irq_lock, flags); + list_for_each_entry(ih, &irq_domains, l) { + if (ih->controller == controller) { + found = ih; + break; + } + } + raw_spin_unlock_irqrestore(&big_irq_lock, flags); + return found; +} unsigned int irq_create_of_mapping(struct device_node *controller, const u32 *intspec, unsigned int intsize) { - return intspec[0]; + struct irq_domain *ih; + u32 virq, type; + int ret; + ih = get_ih_from_node(controller); + if (!ih) + return 0; + ret = ih->xlate(ih, intspec, intsize, &virq, &type); + if (ret) + return ret; + if (type == IRQ_TYPE_NONE) + return virq; + /* set the mask if it is different from current */ + if (type == (irq_to_desc(virq)->status & IRQF_TRIGGER_MASK)) + set_irq_type(virq, type); + return virq; } EXPORT_SYMBOL_GPL(irq_create_of_mapping); From 3879a6f32948330782889cebc4d74c4f2316c676 Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Tue, 22 Feb 2011 21:07:40 +0100 Subject: [PATCH 452/706] x86: dtb: Add early parsing of IO_APIC APIC and IO_APIC have to be added to the system early because native_init_IRQ() requires it. In order to obtain the address of the ioapic the device tree has to be unflattened so of_address_to_resource() works. The device tree is relocated to ensure it is always covered by the kernel mapping. 
That way the boot loader does not have to make any assumptions about kernel's memory layout. Signed-off-by: Sebastian Andrzej Siewior Acked-by: Grant Likely Cc: sodaville@linutronix.de Cc: devicetree-discuss@lists.ozlabs.org Cc: Dirk Brandewie LKML-Reference: <1298405266-1624-6-git-send-email-bigeasy@linutronix.de> Signed-off-by: Thomas Gleixner --- arch/x86/include/asm/prom.h | 7 +++ arch/x86/kernel/devicetree.c | 110 ++++++++++++++++++++++++++++++++++- arch/x86/kernel/irqinit.c | 3 +- 3 files changed, 117 insertions(+), 3 deletions(-) diff --git a/arch/x86/include/asm/prom.h b/arch/x86/include/asm/prom.h index 83833ac33dd1..35ec32b47500 100644 --- a/arch/x86/include/asm/prom.h +++ b/arch/x86/include/asm/prom.h @@ -23,10 +23,17 @@ #include #ifdef CONFIG_OF +extern int of_ioapic; +extern u64 initial_dtb; extern void add_dtb(u64 data); +void x86_dtb_find_config(void); +void x86_dtb_get_config(unsigned int unused); void add_interrupt_host(struct irq_domain *ih); #else static inline void add_dtb(u64 data) { } +#define x86_dtb_find_config x86_init_noop +#define x86_dtb_get_config x86_init_uint_noop +#define of_ioapic 0 #endif extern char cmd_line[COMMAND_LINE_SIZE]; diff --git a/arch/x86/kernel/devicetree.c b/arch/x86/kernel/devicetree.c index ef98e4edada1..2739d5613a38 100644 --- a/arch/x86/kernel/devicetree.c +++ b/arch/x86/kernel/devicetree.c @@ -7,15 +7,20 @@ #include #include #include +#include #include #include #include +#include +__initdata u64 initial_dtb; char __initdata cmd_line[COMMAND_LINE_SIZE]; static LIST_HEAD(irq_domains); static DEFINE_RAW_SPINLOCK(big_irq_lock); +int __initdata of_ioapic; + void add_interrupt_host(struct irq_domain *ih) { unsigned long flags; @@ -90,6 +95,107 @@ void * __init early_init_dt_alloc_memory_arch(u64 size, u64 align) void __init add_dtb(u64 data) { - initial_boot_params = phys_to_virt((u64) (u32) data + - offsetof(struct setup_data, data)); + initial_dtb = data + offsetof(struct setup_data, data); +} + +static void __init dtb_lapic_setup(void) +{ +#ifdef CONFIG_X86_LOCAL_APIC + if (apic_force_enable()) + return; + + smp_found_config = 1; + pic_mode = 1; + /* Required for ioapic registration */ + set_fixmap_nocache(FIX_APIC_BASE, mp_lapic_addr); + if (boot_cpu_physical_apicid == -1U) + boot_cpu_physical_apicid = read_apic_id(); + + generic_processor_info(boot_cpu_physical_apicid, + GET_APIC_VERSION(apic_read(APIC_LVR))); +#endif +} + +#ifdef CONFIG_X86_IO_APIC +static unsigned int ioapic_id; + +static void __init dtb_add_ioapic(struct device_node *dn) +{ + struct resource r; + int ret; + + ret = of_address_to_resource(dn, 0, &r); + if (ret) { + printk(KERN_ERR "Can't obtain address from node %s.\n", + dn->full_name); + return; + } + mp_register_ioapic(++ioapic_id, r.start, gsi_top); +} + +static void __init dtb_ioapic_setup(void) +{ + struct device_node *dn; + + if (!smp_found_config) + return; + + for_each_compatible_node(dn, NULL, "intel,ce4100-ioapic") + dtb_add_ioapic(dn); + + if (nr_ioapics) { + of_ioapic = 1; + return; + } + printk(KERN_ERR "Error: No information about IO-APIC in OF.\n"); + smp_found_config = 0; +} +#else +static void __init dtb_ioapic_setup(void) {} +#endif + +static void __init dtb_apic_setup(void) +{ + dtb_lapic_setup(); + dtb_ioapic_setup(); +} + +void __init x86_dtb_find_config(void) +{ + if (initial_dtb) + smp_found_config = 1; + else + printk(KERN_ERR "Missing device tree!.\n"); +} + +void __init x86_dtb_get_config(unsigned int unused) +{ + u32 size, map_len; + void *new_dtb; + + if (!initial_dtb) + return; + + map_len 
= max(PAGE_SIZE - (initial_dtb & ~PAGE_MASK), + (u64)sizeof(struct boot_param_header)); + + initial_boot_params = early_memremap(initial_dtb, map_len); + size = be32_to_cpu(initial_boot_params->totalsize); + if (map_len < size) { + early_iounmap(initial_boot_params, map_len); + initial_boot_params = early_memremap(initial_dtb, size); + map_len = size; + } + + new_dtb = alloc_bootmem(size); + memcpy(new_dtb, initial_boot_params, size); + early_iounmap(initial_boot_params, map_len); + + initial_boot_params = new_dtb; + + /* root level address cells */ + of_scan_flat_dt(early_init_dt_scan_root, NULL); + + unflatten_device_tree(); + dtb_setup_hpet(); + dtb_apic_setup(); } diff --git a/arch/x86/kernel/irqinit.c b/arch/x86/kernel/irqinit.c index c752e973958d..4cadf8688dbd 100644 --- a/arch/x86/kernel/irqinit.c +++ b/arch/x86/kernel/irqinit.c @@ -25,6 +25,7 @@ #include #include #include +#include /* * ISA PIC or low IO-APIC triggered (INTA-cycle or APIC) interrupts: @@ -243,7 +244,7 @@ void __init native_init_IRQ(void) set_intr_gate(i, interrupt[i-FIRST_EXTERNAL_VECTOR]); } - if (!acpi_ioapic) + if (!acpi_ioapic && !of_ioapic) setup_irq(2, &irq2); #ifdef CONFIG_X86_32 From ffb9fc68dff38f811eeb24c15aba0418b6a8ee53 Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Tue, 22 Feb 2011 21:07:41 +0100 Subject: [PATCH 453/706] x86: dtb: Add device tree support for HPET Set hpet_address based on the information provided by the DTB. Signed-off-by: Sebastian Andrzej Siewior Acked-by: Grant Likely Cc: sodaville@linutronix.de Cc: devicetree-discuss@lists.ozlabs.org Cc: Dirk Brandewie LKML-Reference: <1298405266-1624-7-git-send-email-bigeasy@linutronix.de> Signed-off-by: Thomas Gleixner --- arch/x86/kernel/devicetree.c | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/arch/x86/kernel/devicetree.c b/arch/x86/kernel/devicetree.c index 2739d5613a38..dbb3bda40af9 100644 --- a/arch/x86/kernel/devicetree.c +++ b/arch/x86/kernel/devicetree.c @@ -11,6 +11,7 @@ #include #include +#include #include #include @@ -98,6 +99,23 @@ void __init add_dtb(u64 data) initial_dtb = data + offsetof(struct setup_data, data); } +static void __init dtb_setup_hpet(void) +{ + struct device_node *dn; + struct resource r; + int ret; + + dn = of_find_compatible_node(NULL, NULL, "intel,ce4100-hpet"); + if (!dn) + return; + ret = of_address_to_resource(dn, 0, &r); + if (ret) { + WARN_ON(1); + return; + } + hpet_address = r.start; +} + static void __init dtb_lapic_setup(void) { #ifdef CONFIG_X86_LOCAL_APIC @@ -197,5 +215,6 @@ void __init x86_dtb_get_config(unsigned int unused) of_scan_flat_dt(early_init_dt_scan_root, NULL); unflatten_device_tree(); + dtb_setup_hpet(); dtb_apic_setup(); } From 96e0a0797eba35b5420c710b928f19094b2d5c45 Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Tue, 22 Feb 2011 21:07:42 +0100 Subject: [PATCH 454/706] x86: dtb: Add support for PCI devices backed by dtb nodes x86_of_pci_init() does two things: - it provides a generic irq enable and disable function. enable queries the device tree for the interrupt information, calls ->xlate on the irq host and updates the pci->irq information for the device. - it walks through PCI bus(es) in the device tree and adds its children (device) nodes to the appropriate pci_dev nodes in the kernel. So the dtb node information is available at probe time of the PCI device. Adding a PCI bus based on the information in the device tree is currently not supported. Right now direct access via ioports is used.
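The bus walk below leans on the standard PCI bus binding layout of the first "reg" cell (phys.hi). A sketch of the decode, with the field layout spelled out (busno is shown only for completeness; the code below already knows the bus from the "bus-range" property):

        /*
         * phys.hi of a PCI device's "reg" property:
         *   npt000ss bbbbbbbb dddddfff rrrrrrrr
         * b = bus number, d = device, f = function, r = register offset.
         */
        u32 phys_hi = be32_to_cpup(prop);
        unsigned int busno = (phys_hi >> 16) & 0xff;
        unsigned int devfn = (phys_hi >> 8) & 0xff; /* == PCI_DEVFN(d, f) */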
Signed-off-by: Sebastian Andrzej Siewior Tested-by: Dirk Brandewie Acked-by: Grant Likely Cc: sodaville@linutronix.de Cc: devicetree-discuss@lists.ozlabs.org LKML-Reference: <1298405266-1624-8-git-send-email-bigeasy@linutronix.de> Signed-off-by: Thomas Gleixner --- arch/x86/include/asm/prom.h | 14 ++++++ arch/x86/kernel/devicetree.c | 83 ++++++++++++++++++++++++++++++++++++ drivers/of/Kconfig | 2 +- drivers/of/of_pci.c | 1 + 4 files changed, 99 insertions(+), 1 deletion(-) diff --git a/arch/x86/include/asm/prom.h b/arch/x86/include/asm/prom.h index 35ec32b47500..8fcd519a44dc 100644 --- a/arch/x86/include/asm/prom.h +++ b/arch/x86/include/asm/prom.h @@ -16,6 +16,7 @@ #include #include +#include #include #include @@ -29,8 +30,21 @@ extern void add_dtb(u64 data); void x86_dtb_find_config(void); void x86_dtb_get_config(unsigned int unused); void add_interrupt_host(struct irq_domain *ih); +void __cpuinit x86_of_pci_init(void); + +static inline struct device_node *pci_device_to_OF_node(struct pci_dev *pdev) +{ + return pdev ? pdev->dev.of_node : NULL; +} + +static inline struct device_node *pci_bus_to_OF_node(struct pci_bus *bus) +{ + return pci_device_to_OF_node(bus->self); +} + #else static inline void add_dtb(u64 data) { } +static inline void x86_of_pci_init(void) { } #define x86_dtb_find_config x86_init_noop #define x86_dtb_get_config x86_init_uint_noop #define of_ioapic 0 diff --git a/arch/x86/kernel/devicetree.c b/arch/x86/kernel/devicetree.c index dbb3bda40af9..7b574226e0a8 100644 --- a/arch/x86/kernel/devicetree.c +++ b/arch/x86/kernel/devicetree.c @@ -9,11 +9,15 @@ #include #include #include +#include #include +#include +#include #include #include #include +#include __initdata u64 initial_dtb; char __initdata cmd_line[COMMAND_LINE_SIZE]; @@ -99,6 +103,85 @@ void __init add_dtb(u64 data) initial_dtb = data + offsetof(struct setup_data, data); } +#ifdef CONFIG_PCI +static int x86_of_pci_irq_enable(struct pci_dev *dev) +{ + struct of_irq oirq; + u32 virq; + int ret; + u8 pin; + + ret = pci_read_config_byte(dev, PCI_INTERRUPT_PIN, &pin); + if (ret) + return ret; + if (!pin) + return 0; + + ret = of_irq_map_pci(dev, &oirq); + if (ret) + return ret; + + virq = irq_create_of_mapping(oirq.controller, oirq.specifier, + oirq.size); + if (virq == 0) + return -EINVAL; + dev->irq = virq; + return 0; +} + +static void x86_of_pci_irq_disable(struct pci_dev *dev) +{ +} + +void __cpuinit x86_of_pci_init(void) +{ + struct device_node *np; + + pcibios_enable_irq = x86_of_pci_irq_enable; + pcibios_disable_irq = x86_of_pci_irq_disable; + + for_each_node_by_type(np, "pci") { + const void *prop; + struct pci_bus *bus; + unsigned int bus_min; + struct device_node *child; + + prop = of_get_property(np, "bus-range", NULL); + if (!prop) + continue; + bus_min = be32_to_cpup(prop); + + bus = pci_find_bus(0, bus_min); + if (!bus) { + printk(KERN_ERR "Can't find a node for bus %s.\n", + np->full_name); + continue; + } + + if (bus->self) + bus->self->dev.of_node = np; + else + bus->dev.of_node = np; + + for_each_child_of_node(np, child) { + struct pci_dev *dev; + u32 devfn; + + prop = of_get_property(child, "reg", NULL); + if (!prop) + continue; + + devfn = (be32_to_cpup(prop) >> 8) & 0xff; + dev = pci_get_slot(bus, devfn); + if (!dev) + continue; + dev->dev.of_node = child; + pci_dev_put(dev); + } + } +} +#endif + static void __init dtb_setup_hpet(void) { struct device_node *dn; diff --git a/drivers/of/Kconfig b/drivers/of/Kconfig index efabbf9dd607..d06a6374ed6c 100644 --- a/drivers/of/Kconfig +++ 
b/drivers/of/Kconfig @@ -71,7 +71,7 @@ config OF_MDIO config OF_PCI def_tristate PCI - depends on PCI && (PPC || MICROBLAZE) + depends on PCI && (PPC || MICROBLAZE || X86) help OpenFirmware PCI bus accessors diff --git a/drivers/of/of_pci.c b/drivers/of/of_pci.c index 314535fa32c1..ac1ec54e4fd5 100644 --- a/drivers/of/of_pci.c +++ b/drivers/of/of_pci.c @@ -1,5 +1,6 @@ #include #include +#include #include /** From 9079b35364e75ce6b968a179f861d2f819f33e61 Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Tue, 22 Feb 2011 21:07:43 +0100 Subject: [PATCH 455/706] x86: dtb: Add generic bus probe For now we probe these buses, and we can change this to board-dependent probes once we have to. Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Dirk Brandewie Acked-by: Grant Likely Cc: sodaville@linutronix.de Cc: devicetree-discuss@lists.ozlabs.org LKML-Reference: <1298405266-1624-9-git-send-email-bigeasy@linutronix.de> Signed-off-by: Thomas Gleixner --- arch/x86/kernel/devicetree.c | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/arch/x86/kernel/devicetree.c b/arch/x86/kernel/devicetree.c index 7b574226e0a8..0bc83bfa4ead 100644 --- a/arch/x86/kernel/devicetree.c +++ b/arch/x86/kernel/devicetree.c @@ -103,6 +103,25 @@ void __init add_dtb(u64 data) initial_dtb = data + offsetof(struct setup_data, data); } +/* + * CE4100 ids. Will be moved to machine_device_initcall() once we have it. + */ +static struct of_device_id __initdata ce4100_ids[] = { + { .compatible = "intel,ce4100-cp", }, + { .compatible = "isa", }, + { .compatible = "pci", }, + {}, +}; + +static int __init add_bus_probe(void) +{ + if (!initial_boot_params) + return 0; + + return of_platform_bus_probe(NULL, ce4100_ids, NULL); +} +module_init(add_bus_probe); + #ifdef CONFIG_PCI static int x86_of_pci_irq_enable(struct pci_dev *dev) { From bcc7c1244fcfd852b9f4590935491057e1cab9dd Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Tue, 22 Feb 2011 21:07:44 +0100 Subject: [PATCH 456/706] x86: ioapic: Add OF bindings for IO_APIC ioapic_xlate translates the information in the device tree into ioapic related information. This includes: - obtaining the hw irq, which is the vector number "=> pin number + gsi" - obtaining the type (level/edge/..) - programming this information into the ioapic ioapic_add_ofnode adds an irq_domain based on information from the device tree. This information (irq_domain) is required in order to map a device to its proper interrupt controller.
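As a worked example, the rtc node in the rtc-cmos binding added later in this series uses interrupts = <8 3> with an IO-APIC as interrupt-parent. Fed through ioapic_xlate() in the diff below, and assuming the addressed IO-APIC has gsi_base 0 (typical for the first one, an assumption here), the two cells translate as follows:

	/*
	 * intspec = { 8, 3 }: cell 0 is the IO-APIC pin/line, cell 1 the
	 * index into of_ioapic_type. ioapic_xlate() then yields:
	 *   *out_hwirq = 8 + gsi_base (= 8 for gsi_base 0)
	 *   *out_type  = IRQ_TYPE_EDGE_FALLING (of_ioapic_type[3])
	 * and programs pin 8 edge triggered, polarity 0, via
	 * io_apic_setup_irq_pin().
	 */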
[ tglx: Adapted to the io_apic changes, which let us move that whole code to devicetree.c ] Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Dirk Brandewie Acked-by: Grant Likely Cc: sodaville@linutronix.de Cc: devicetree-discuss@lists.ozlabs.org LKML-Reference: <1298405266-1624-10-git-send-email-bigeasy@linutronix.de> Signed-off-by: Thomas Gleixner --- arch/x86/include/asm/prom.h | 2 + arch/x86/kernel/devicetree.c | 104 +++++++++++++++++++++++++++++++++++ arch/x86/kernel/irqinit.c | 6 ++ 3 files changed, 112 insertions(+) diff --git a/arch/x86/include/asm/prom.h b/arch/x86/include/asm/prom.h index 8fcd519a44dc..ae50131ad766 100644 --- a/arch/x86/include/asm/prom.h +++ b/arch/x86/include/asm/prom.h @@ -27,6 +27,7 @@ extern int of_ioapic; extern u64 initial_dtb; extern void add_dtb(u64 data); +extern void x86_add_irq_domains(void); void x86_dtb_find_config(void); void x86_dtb_get_config(unsigned int unused); void add_interrupt_host(struct irq_domain *ih); @@ -44,6 +45,7 @@ static inline struct device_node *pci_bus_to_OF_node(struct pci_bus *bus) #else static inline void add_dtb(u64 data) { } +static inline void x86_add_irq_domains(void) { } static inline void x86_of_pci_init(void) { } #define x86_dtb_find_config x86_init_noop #define x86_dtb_get_config x86_init_uint_noop diff --git a/arch/x86/kernel/devicetree.c b/arch/x86/kernel/devicetree.c index 0bc83bfa4ead..2d65897a39d0 100644 --- a/arch/x86/kernel/devicetree.c +++ b/arch/x86/kernel/devicetree.c @@ -320,3 +320,107 @@ void __init x86_dtb_get_config(unsigned int unused) dtb_setup_hpet(); dtb_apic_setup(); } + +#ifdef CONFIG_X86_IO_APIC + +struct of_ioapic_type { + u32 out_type; + u32 trigger; + u32 polarity; +}; + +static struct of_ioapic_type of_ioapic_type[] = +{ + { + .out_type = IRQ_TYPE_EDGE_RISING, + .trigger = IOAPIC_EDGE, + .polarity = 1, + }, + { + .out_type = IRQ_TYPE_LEVEL_LOW, + .trigger = IOAPIC_LEVEL, + .polarity = 0, + }, + { + .out_type = IRQ_TYPE_LEVEL_HIGH, + .trigger = IOAPIC_LEVEL, + .polarity = 1, + }, + { + .out_type = IRQ_TYPE_EDGE_FALLING, + .trigger = IOAPIC_EDGE, + .polarity = 0, + }, +}; + +static int ioapic_xlate(struct irq_domain *id, const u32 *intspec, u32 intsize, + u32 *out_hwirq, u32 *out_type) +{ + struct io_apic_irq_attr attr; + struct of_ioapic_type *it; + u32 line, idx, type; + + if (intsize < 2) + return -EINVAL; + + line = *intspec; + idx = (u32) id->priv; + *out_hwirq = line + mp_gsi_routing[idx].gsi_base; + + intspec++; + type = *intspec; + + if (type >= ARRAY_SIZE(of_ioapic_type)) + return -EINVAL; + + it = of_ioapic_type + type; + *out_type = it->out_type; + + set_io_apic_irq_attr(&attr, idx, line, it->trigger, it->polarity); + + return io_apic_setup_irq_pin(*out_hwirq, cpu_to_node(0), &attr); +} + +static void __init ioapic_add_ofnode(struct device_node *np) +{ + struct resource r; + int i, ret; + + ret = of_address_to_resource(np, 0, &r); + if (ret) { + printk(KERN_ERR "Failed to obtain address for %s\n", + np->full_name); + return; + } + + for (i = 0; i < nr_ioapics; i++) { + if (r.start == mp_ioapics[i].apicaddr) { + struct irq_domain *id; + + id = kzalloc(sizeof(*id), GFP_KERNEL); + BUG_ON(!id); + id->controller = np; + id->xlate = ioapic_xlate; + id->priv = (void *)i; + add_interrupt_host(id); + return; + } + } + printk(KERN_ERR "IOxAPIC at %s is not registered.\n", np->full_name); +} + +void __init x86_add_irq_domains(void) +{ + struct device_node *dp; + + if (!initial_boot_params) + return; + + for_each_node_with_property(dp, "interrupt-controller") { + if 
(of_device_is_compatible(dp, "intel,ce4100-ioapic")) + ioapic_add_ofnode(dp); + } +} +#else +void __init x86_add_irq_domains(void) { } +#endif diff --git a/arch/x86/kernel/irqinit.c b/arch/x86/kernel/irqinit.c index 4cadf8688dbd..9f76f89f43a4 100644 --- a/arch/x86/kernel/irqinit.c +++ b/arch/x86/kernel/irqinit.c @@ -118,6 +118,12 @@ void __init init_IRQ(void) { int i; + /* + * We probably need a better place for this, but it works for + * now ... + */ + x86_add_irq_domains(); + /* * On cpu 0, Assign IRQ0_VECTOR..IRQ15_VECTOR's to IRQ 0..15. * If these IRQ's are handled by legacy interrupt-controllers like PIC, From 1fa4163bdc199a0b80f9e333d718b3f65e901593 Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Tue, 22 Feb 2011 21:07:45 +0100 Subject: [PATCH 457/706] x86: ce4100: Use OF to setup devices Use device tree information to set up the IO_APIC configuration, interrupt routing, HPET and everything else which cannot be enumerated by other means. Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Dirk Brandewie Acked-by: Grant Likely Cc: sodaville@linutronix.de Cc: devicetree-discuss@lists.ozlabs.org LKML-Reference: <1298405266-1624-11-git-send-email-bigeasy@linutronix.de> Signed-off-by: Thomas Gleixner --- arch/x86/platform/ce4100/ce4100.c | 26 +++++++++++++++++++------- 1 file changed, 19 insertions(+), 7 deletions(-) diff --git a/arch/x86/platform/ce4100/ce4100.c b/arch/x86/platform/ce4100/ce4100.c index d2c0d51a7178..b3436d344b41 100644 --- a/arch/x86/platform/ce4100/ce4100.c +++ b/arch/x86/platform/ce4100/ce4100.c @@ -15,21 +15,19 @@ #include #include +#include #include +#include #include +#include static int ce4100_i8042_detect(void) { return 0; } -static void __init sdv_find_smp_config(void) -{ -} - #ifdef CONFIG_SERIAL_8250 - static unsigned int mem_serial_in(struct uart_port *p, int offset) { offset = offset << p->regshift; @@ -118,6 +116,15 @@ static void __init sdv_arch_setup(void) sdv_serial_fixup(); } +#ifdef CONFIG_X86_IO_APIC +static void __cpuinit sdv_pci_init(void) +{ + x86_of_pci_init(); + /* We can't set this earlier, because we need to calibrate the timer */ + legacy_pic = &null_legacy_pic; +} +#endif + /* * CE4100 specific x86_init function overrides and early setup * calls. */ @@ -127,6 +134,11 @@ void __init x86_ce4100_early_setup(void) x86_init.oem.arch_setup = sdv_arch_setup; x86_platform.i8042_detect = ce4100_i8042_detect; x86_init.resources.probe_roms = x86_init_noop; - x86_init.mpparse.get_smp_config = x86_init_uint_noop; - x86_init.mpparse.find_smp_config = sdv_find_smp_config; + x86_init.mpparse.get_smp_config = x86_dtb_get_config; + x86_init.mpparse.find_smp_config = x86_dtb_find_config; + +#ifdef CONFIG_X86_IO_APIC + x86_init.pci.init_irq = sdv_pci_init; + x86_init.mpparse.setup_ioapic_ids = setup_ioapic_ids_from_mpc_nocheck; +#endif } From 3bcbaf6e08d8d82cde781997bd2c56dda87049b5 Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Tue, 22 Feb 2011 21:07:46 +0100 Subject: [PATCH 458/706] rtc: cmos: Add OF bindings This allows the driver to load OF based information from the device tree. Systems without a BIOS may need to perform some initialization. PowerPC creates a PNP device from the OF information and performs this kind of initialization in its private PCI quirk. This looks more generic. This patch also avoids registering the platform RTC driver on X86 if we have a device tree blob. Otherwise we would set up the device based on the hardcoded information in arch/x86 rather than the device tree based one.
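To make the binding values concrete: interpreted against the standard mc146818 register layout (an assumption here; the patch does not spell it out), ctrl-reg = <2> sets the 24-hour-mode bit in "Register B" and freq-reg = <0x26> selects the 32.768 kHz time base with the default 1.024 kHz periodic rate in "Register A". The driver simply writes the cells through, roughly:

	/* sketch of what cmos_of_init() below does with the example node */
	CMOS_WRITE(0x02, RTC_CONTROL);		/* ctrl-reg -> Register B */
	CMOS_WRITE(0x26, RTC_FREQ_SELECT);	/* freq-reg -> Register A */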
[ tglx: Changed "int of_have_populated_dt()" to bool as recommended by Grant ] Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Dirk Brandewie Acked-by: Grant Likely Cc: sodaville@linutronix.de Cc: devicetree-discuss@lists.ozlabs.org Cc: rtc-linux@googlegroups.com Cc: Alessandro Zummo LKML-Reference: <1298405266-1624-12-git-send-email-bigeasy@linutronix.de> Signed-off-by: Thomas Gleixner --- .../devicetree/bindings/rtc/rtc-cmos.txt | 28 ++++++++++++ arch/x86/kernel/rtc.c | 3 ++ drivers/rtc/rtc-cmos.c | 45 +++++++++++++++++++ include/linux/of.h | 12 +++++ 4 files changed, 88 insertions(+) create mode 100644 Documentation/devicetree/bindings/rtc/rtc-cmos.txt diff --git a/Documentation/devicetree/bindings/rtc/rtc-cmos.txt b/Documentation/devicetree/bindings/rtc/rtc-cmos.txt new file mode 100644 index 000000000000..7382989b3052 --- /dev/null +++ b/Documentation/devicetree/bindings/rtc/rtc-cmos.txt @@ -0,0 +1,28 @@ + Motorola mc146818 compatible RTC +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Required properties: + - compatible : "motorola,mc146818" + - reg : should contain registers location and length. + +Optional properties: + - interrupts : should contain interrupt. + - interrupt-parent : interrupt source phandle. + - ctrl-reg : Contains the initial value of the control register also + called "Register B". + - freq-reg : Contains the initial value of the frequency register also + called "Register A". + +"Register A" and "B" are usually initialized by the firmware (BIOS for +instance). If this is not done, it can be performed by the driver. + +ISA Example: + + rtc@70 { + compatible = "motorola,mc146818"; + interrupts = <8 3>; + interrupt-parent = <&ioapic1>; + ctrl-reg = <2>; + freq-reg = <0x26>; + reg = <1 0x70 2>; + }; diff --git a/arch/x86/kernel/rtc.c b/arch/x86/kernel/rtc.c index 6f39cab052d5..3f2ad2640d85 100644 --- a/arch/x86/kernel/rtc.c +++ b/arch/x86/kernel/rtc.c @@ -6,6 +6,7 @@ #include #include #include +#include #include #include @@ -236,6 +237,8 @@ static __init int add_rtc_cmos(void) } } #endif + if (of_have_populated_dt()) + return 0; platform_device_register(&rtc_device); dev_info(&rtc_device.dev, diff --git a/drivers/rtc/rtc-cmos.c b/drivers/rtc/rtc-cmos.c index c7ff8df347e7..159b95e4b420 100644 --- a/drivers/rtc/rtc-cmos.c +++ b/drivers/rtc/rtc-cmos.c @@ -37,6 +37,8 @@ #include #include #include +#include +#include /* this is for "generic access to PC-style RTC" using CMOS_READ/CMOS_WRITE */ #include @@ -1123,6 +1125,47 @@ static struct pnp_driver cmos_pnp_driver = { #endif /* CONFIG_PNP */ +#ifdef CONFIG_OF +static const struct of_device_id of_cmos_match[] = { + { + .compatible = "motorola,mc146818", + }, + { }, +}; +MODULE_DEVICE_TABLE(of, of_cmos_match); + +static __init void cmos_of_init(struct platform_device *pdev) +{ + struct device_node *node = pdev->dev.of_node; + struct rtc_time time; + int ret; + const __be32 *val; + + if (!node) + return; + + val = of_get_property(node, "ctrl-reg", NULL); + if (val) + CMOS_WRITE(be32_to_cpup(val), RTC_CONTROL); + + val = of_get_property(node, "freq-reg", NULL); + if (val) + CMOS_WRITE(be32_to_cpup(val), RTC_FREQ_SELECT); + + get_rtc_time(&time); + ret = rtc_valid_tm(&time); + if (ret) { + struct rtc_time def_time = { + .tm_year = 1, + .tm_mday = 1, + }; + set_rtc_time(&def_time); + } +} +#else +static inline void cmos_of_init(struct platform_device *pdev) {} +#define of_cmos_match NULL +#endif /*----------------------------------------------------------------*/ /* Platform setup should have set up an RTC device, when PNP is
@@ -1131,6 +1174,7 @@ static struct pnp_driver cmos_pnp_driver = { static int __init cmos_platform_probe(struct platform_device *pdev) { + cmos_of_init(pdev); cmos_wake_setup(&pdev->dev); return cmos_do_probe(&pdev->dev, platform_get_resource(pdev, IORESOURCE_IO, 0), @@ -1162,6 +1206,7 @@ static struct platform_driver cmos_platform_driver = { #ifdef CONFIG_PM .pm = &cmos_pm_ops, #endif + .of_match_table = of_cmos_match, } }; diff --git a/include/linux/of.h b/include/linux/of.h index d9dd664a6a9c..266db1d0baa9 100644 --- a/include/linux/of.h +++ b/include/linux/of.h @@ -70,6 +70,11 @@ extern struct device_node *allnodes; extern struct device_node *of_chosen; extern rwlock_t devtree_lock; +static inline bool of_have_populated_dt(void) +{ + return allnodes != NULL; +} + static inline bool of_node_is_root(const struct device_node *node) { return node && (node->parent == NULL); @@ -222,5 +227,12 @@ extern void of_attach_node(struct device_node *); extern void of_detach_node(struct device_node *); #endif +#else + +static inline bool of_have_populated_dt(void) +{ + return false; +} + #endif /* CONFIG_OF */ #endif /* _LINUX_OF_H */ From 4a66b1d95ad8baf6ab884a1c64461449b463eb78 Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Thu, 24 Feb 2011 09:52:42 +0100 Subject: [PATCH 459/706] x86: dt: Fix OLPC=y/INTEL_CE=n build Both OLPC and CE4100 activate CONFIG_OF. OLPC uses PROMTREE while CE uses FLATTREE. Compiling for OLPC only breaks due to missing flat tree functions and variables. Use proper wrappers and provide an empty x86_flattree_get_config() inline so OF=y FLATTREE=n builds and works. [ tglx: Make it work with HPET_TIMER=n and make a function static ] Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Thomas Gleixner --- arch/x86/include/asm/prom.h | 1 - arch/x86/kernel/devicetree.c | 23 +++++++++++++++++++---- 2 files changed, 19 insertions(+), 5 deletions(-) diff --git a/arch/x86/include/asm/prom.h b/arch/x86/include/asm/prom.h index ae50131ad766..f58361ba7357 100644 --- a/arch/x86/include/asm/prom.h +++ b/arch/x86/include/asm/prom.h @@ -30,7 +30,6 @@ extern void add_dtb(u64 data); extern void x86_add_irq_domains(void); void x86_dtb_find_config(void); void x86_dtb_get_config(unsigned int unused); -void add_interrupt_host(struct irq_domain *ih); void __cpuinit x86_of_pci_init(void); static inline struct device_node *pci_device_to_OF_node(struct pci_dev *pdev) diff --git a/arch/x86/kernel/devicetree.c b/arch/x86/kernel/devicetree.c index 2d65897a39d0..06e5e91939c5 100644 --- a/arch/x86/kernel/devicetree.c +++ b/arch/x86/kernel/devicetree.c @@ -26,7 +26,7 @@ static DEFINE_RAW_SPINLOCK(big_irq_lock); int __initdata of_ioapic; -void add_interrupt_host(struct irq_domain *ih) +static void add_interrupt_host(struct irq_domain *ih) { unsigned long flags; @@ -115,7 +115,7 @@ static struct of_device_id __initdata ce4100_ids[] = { static int __init add_bus_probe(void) { - if (!initial_boot_params) + if (!of_have_populated_dt()) return 0; return of_platform_bus_probe(NULL, ce4100_ids, NULL); @@ -203,6 +203,7 @@ void __cpuinit x86_of_pci_init(void) static void __init dtb_setup_hpet(void) { +#ifdef CONFIG_HPET_TIMER struct device_node *dn; struct resource r; int ret; @@ -216,6 +217,7 @@ static void __init dtb_setup_hpet(void) return; } hpet_address = r.start; +#endif } static void __init dtb_lapic_setup(void) @@ -288,7 +290,8 @@ void __init x86_dtb_find_config(void) printk(KERN_ERR "Missing device tree!.\n"); } -void __init x86_dtb_get_config(unsigned int unused) +#ifdef 
CONFIG_OF_FLATTREE +static void __init x86_flattree_get_config(void) { u32 size, map_len; void *new_dtb; @@ -317,6 +320,18 @@ void __init x86_dtb_get_config(unsigned int unused) of_scan_flat_dt(early_init_dt_scan_root, NULL); unflatten_device_tree(); +} +#else +static inline void x86_flattree_get_config(void) { } +#endif + +void __init x86_dtb_get_config(unsigned int unused) +{ + x86_flattree_get_config(); + + if (!of_have_populated_dt()) + return; + dtb_setup_hpet(); dtb_apic_setup(); } @@ -413,7 +428,7 @@ void __init x86_add_irq_domains(void) { struct device_node *dp; - if (!initial_boot_params) + if (!of_have_populated_dt()) return; for_each_node_with_property(dp, "interrupt-controller") { From 0932587328d9bd5b500a640fbaff3290c8d4cabf Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Thu, 24 Feb 2011 14:43:05 +0100 Subject: [PATCH 460/706] bootmem: Separate out CONFIG_NO_BOOTMEM code into nobootmem.c mm/bootmem.c contained code paths for both bootmem and no bootmem configurations. They implement about the same set of APIs in different ways and as a result bootmem.c contains a massive amount of #ifdef CONFIG_NO_BOOTMEM. Separate out CONFIG_NO_BOOTMEM code into mm/nobootmem.c. As the common part is relatively small, duplicate it in nobootmem.c instead of creating a common file or ifdef'ing in bootmem.c. The following are duplicated: * {min|max}_low_pfn, max_pfn, saved_max_pfn * free_bootmem_late() * ___alloc_bootmem() * __alloc_bootmem_low() The following are applicable only to nobootmem and moved verbatim: * __free_pages_memory() * free_all_memory_core_early() The following are not applicable to nobootmem and omitted in nobootmem.c: * reserve_bootmem_node() * reserve_bootmem() The rest have their function bodies split according to CONFIG_NO_BOOTMEM. The Makefile is updated so that only either bootmem.c or nobootmem.c is built according to CONFIG_NO_BOOTMEM. This patch doesn't introduce any behavior change. -tj: Rewrote commit description.
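Since both files keep the same entry points, early-boot callers need no changes; a caller like this hypothetical one compiles and behaves identically whichever file was built:

	/* sketch only; the function name is made up for illustration */
	static void __init example_early_buffer(void)
	{
		/* served by bootmem.c or nobootmem.c depending on
		   CONFIG_NO_BOOTMEM; panics if it cannot be satisfied */
		void *buf = __alloc_bootmem(PAGE_SIZE, PAGE_SIZE, 0);
		/* ... use buf during early boot ... */
	}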
Suggested-by: Ingo Molnar Signed-off-by: Yinghai Lu Acked-by: Andrew Morton Signed-off-by: Tejun Heo --- mm/Makefile | 8 +- mm/bootmem.c | 173 +-------------------- mm/nobootmem.c | 405 +++++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 415 insertions(+), 171 deletions(-) create mode 100644 mm/nobootmem.c diff --git a/mm/Makefile b/mm/Makefile index 2b1b575ae712..42a8326c3e3d 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -7,7 +7,7 @@ mmu-$(CONFIG_MMU) := fremap.o highmem.o madvise.o memory.o mincore.o \ mlock.o mmap.o mprotect.o mremap.o msync.o rmap.o \ vmalloc.o pagewalk.o pgtable-generic.o -obj-y := bootmem.o filemap.o mempool.o oom_kill.o fadvise.o \ +obj-y := filemap.o mempool.o oom_kill.o fadvise.o \ maccess.o page_alloc.o page-writeback.o \ readahead.o swap.o truncate.o vmscan.o shmem.o \ prio_tree.o util.o mmzone.o vmstat.o backing-dev.o \ @@ -15,6 +15,12 @@ obj-y := bootmem.o filemap.o mempool.o oom_kill.o fadvise.o \ $(mmu-y) obj-y += init-mm.o +ifdef CONFIG_NO_BOOTMEM + obj-y += nobootmem.o +else + obj-y += bootmem.o +endif + obj-$(CONFIG_HAVE_MEMBLOCK) += memblock.o obj-$(CONFIG_BOUNCE) += bounce.o diff --git a/mm/bootmem.c b/mm/bootmem.c index 13b0caa9793c..4403e2fbc13d 100644 --- a/mm/bootmem.c +++ b/mm/bootmem.c @@ -35,7 +35,6 @@ unsigned long max_pfn; unsigned long saved_max_pfn; #endif -#ifndef CONFIG_NO_BOOTMEM bootmem_data_t bootmem_node_data[MAX_NUMNODES] __initdata; static struct list_head bdata_list __initdata = LIST_HEAD_INIT(bdata_list); @@ -146,7 +145,7 @@ unsigned long __init init_bootmem(unsigned long start, unsigned long pages) min_low_pfn = start; return init_bootmem_core(NODE_DATA(0)->bdata, start, 0, pages); } -#endif + /* * free_bootmem_late - free bootmem pages directly to page allocator * @addr: starting address of the range @@ -171,53 +170,6 @@ void __init free_bootmem_late(unsigned long addr, unsigned long size) } } -#ifdef CONFIG_NO_BOOTMEM -static void __init __free_pages_memory(unsigned long start, unsigned long end) -{ - int i; - unsigned long start_aligned, end_aligned; - int order = ilog2(BITS_PER_LONG); - - start_aligned = (start + (BITS_PER_LONG - 1)) & ~(BITS_PER_LONG - 1); - end_aligned = end & ~(BITS_PER_LONG - 1); - - if (end_aligned <= start_aligned) { - for (i = start; i < end; i++) - __free_pages_bootmem(pfn_to_page(i), 0); - - return; - } - - for (i = start; i < start_aligned; i++) - __free_pages_bootmem(pfn_to_page(i), 0); - - for (i = start_aligned; i < end_aligned; i += BITS_PER_LONG) - __free_pages_bootmem(pfn_to_page(i), order); - - for (i = end_aligned; i < end; i++) - __free_pages_bootmem(pfn_to_page(i), 0); -} - -unsigned long __init free_all_memory_core_early(int nodeid) -{ - int i; - u64 start, end; - unsigned long count = 0; - struct range *range = NULL; - int nr_range; - - nr_range = get_free_all_memory_range(&range, nodeid); - - for (i = 0; i < nr_range; i++) { - start = range[i].start; - end = range[i].end; - count += end - start; - __free_pages_memory(start, end); - } - - return count; -} -#else static unsigned long __init free_all_bootmem_core(bootmem_data_t *bdata) { int aligned; @@ -278,7 +230,6 @@ static unsigned long __init free_all_bootmem_core(bootmem_data_t *bdata) return count; } -#endif /** * free_all_bootmem_node - release a node's free pages to the buddy allocator @@ -289,12 +240,7 @@ static unsigned long __init free_all_bootmem_core(bootmem_data_t *bdata) unsigned long __init free_all_bootmem_node(pg_data_t *pgdat) { register_page_bootmem_info_node(pgdat); -#ifdef CONFIG_NO_BOOTMEM - /* 
free_all_memory_core_early(MAX_NUMNODES) will be called later */ - return 0; -#else return free_all_bootmem_core(pgdat->bdata); -#endif } /** @@ -304,16 +250,6 @@ unsigned long __init free_all_bootmem_node(pg_data_t *pgdat) */ unsigned long __init free_all_bootmem(void) { -#ifdef CONFIG_NO_BOOTMEM - /* - * We need to use MAX_NUMNODES instead of NODE_DATA(0)->node_id - * because in some case like Node0 doesnt have RAM installed - * low ram will be on Node1 - * Use MAX_NUMNODES will make sure all ranges in early_node_map[] - * will be used instead of only Node0 related - */ - return free_all_memory_core_early(MAX_NUMNODES); -#else unsigned long total_pages = 0; bootmem_data_t *bdata; @@ -321,10 +257,8 @@ unsigned long __init free_all_bootmem(void) total_pages += free_all_bootmem_core(bdata); return total_pages; -#endif } -#ifndef CONFIG_NO_BOOTMEM static void __init __free(bootmem_data_t *bdata, unsigned long sidx, unsigned long eidx) { @@ -419,7 +353,6 @@ static int __init mark_bootmem(unsigned long start, unsigned long end, } BUG(); } -#endif /** * free_bootmem_node - mark a page range as usable @@ -434,10 +367,6 @@ static int __init mark_bootmem(unsigned long start, unsigned long end, void __init free_bootmem_node(pg_data_t *pgdat, unsigned long physaddr, unsigned long size) { -#ifdef CONFIG_NO_BOOTMEM - kmemleak_free_part(__va(physaddr), size); - memblock_x86_free_range(physaddr, physaddr + size); -#else unsigned long start, end; kmemleak_free_part(__va(physaddr), size); @@ -446,7 +375,6 @@ void __init free_bootmem_node(pg_data_t *pgdat, unsigned long physaddr, end = PFN_DOWN(physaddr + size); mark_bootmem_node(pgdat->bdata, start, end, 0, 0); -#endif } /** @@ -460,10 +388,6 @@ void __init free_bootmem_node(pg_data_t *pgdat, unsigned long physaddr, */ void __init free_bootmem(unsigned long addr, unsigned long size) { -#ifdef CONFIG_NO_BOOTMEM - kmemleak_free_part(__va(addr), size); - memblock_x86_free_range(addr, addr + size); -#else unsigned long start, end; kmemleak_free_part(__va(addr), size); @@ -472,7 +396,6 @@ void __init free_bootmem(unsigned long addr, unsigned long size) end = PFN_DOWN(addr + size); mark_bootmem(start, end, 0, 0); -#endif } /** @@ -489,17 +412,12 @@ void __init free_bootmem(unsigned long addr, unsigned long size) int __init reserve_bootmem_node(pg_data_t *pgdat, unsigned long physaddr, unsigned long size, int flags) { -#ifdef CONFIG_NO_BOOTMEM - panic("no bootmem"); - return 0; -#else unsigned long start, end; start = PFN_DOWN(physaddr); end = PFN_UP(physaddr + size); return mark_bootmem_node(pgdat->bdata, start, end, 1, flags); -#endif } /** @@ -515,20 +433,14 @@ int __init reserve_bootmem_node(pg_data_t *pgdat, unsigned long physaddr, int __init reserve_bootmem(unsigned long addr, unsigned long size, int flags) { -#ifdef CONFIG_NO_BOOTMEM - panic("no bootmem"); - return 0; -#else unsigned long start, end; start = PFN_DOWN(addr); end = PFN_UP(addr + size); return mark_bootmem(start, end, 1, flags); -#endif } -#ifndef CONFIG_NO_BOOTMEM int __weak __init reserve_bootmem_generic(unsigned long phys, unsigned long len, int flags) { @@ -685,33 +597,12 @@ static void * __init alloc_arch_preferred_bootmem(bootmem_data_t *bdata, #endif return NULL; } -#endif static void * __init ___alloc_bootmem_nopanic(unsigned long size, unsigned long align, unsigned long goal, unsigned long limit) { -#ifdef CONFIG_NO_BOOTMEM - void *ptr; - - if (WARN_ON_ONCE(slab_is_available())) - return kzalloc(size, GFP_NOWAIT); - -restart: - - ptr = __alloc_memory_core_early(MAX_NUMNODES, size, 
align, goal, limit); - - if (ptr) - return ptr; - - if (goal != 0) { - goal = 0; - goto restart; - } - - return NULL; -#else bootmem_data_t *bdata; void *region; @@ -737,7 +628,6 @@ restart: } return NULL; -#endif } /** @@ -758,10 +648,6 @@ void * __init __alloc_bootmem_nopanic(unsigned long size, unsigned long align, { unsigned long limit = 0; -#ifdef CONFIG_NO_BOOTMEM - limit = -1UL; -#endif - return ___alloc_bootmem_nopanic(size, align, goal, limit); } @@ -798,14 +684,9 @@ void * __init __alloc_bootmem(unsigned long size, unsigned long align, { unsigned long limit = 0; -#ifdef CONFIG_NO_BOOTMEM - limit = -1UL; -#endif - return ___alloc_bootmem(size, align, goal, limit); } -#ifndef CONFIG_NO_BOOTMEM static void * __init ___alloc_bootmem_node(bootmem_data_t *bdata, unsigned long size, unsigned long align, unsigned long goal, unsigned long limit) @@ -822,7 +703,6 @@ static void * __init ___alloc_bootmem_node(bootmem_data_t *bdata, return ___alloc_bootmem(size, align, goal, limit); } -#endif /** * __alloc_bootmem_node - allocate boot memory from a specific node @@ -842,24 +722,10 @@ static void * __init ___alloc_bootmem_node(bootmem_data_t *bdata, void * __init __alloc_bootmem_node(pg_data_t *pgdat, unsigned long size, unsigned long align, unsigned long goal) { - void *ptr; - if (WARN_ON_ONCE(slab_is_available())) return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id); -#ifdef CONFIG_NO_BOOTMEM - ptr = __alloc_memory_core_early(pgdat->node_id, size, align, - goal, -1ULL); - if (ptr) - return ptr; - - ptr = __alloc_memory_core_early(MAX_NUMNODES, size, align, - goal, -1ULL); -#else - ptr = ___alloc_bootmem_node(pgdat->bdata, size, align, goal, 0); -#endif - - return ptr; + return ___alloc_bootmem_node(pgdat->bdata, size, align, goal, 0); } void * __init __alloc_bootmem_node_high(pg_data_t *pgdat, unsigned long size, @@ -880,13 +746,8 @@ void * __init __alloc_bootmem_node_high(pg_data_t *pgdat, unsigned long size, unsigned long new_goal; new_goal = MAX_DMA32_PFN << PAGE_SHIFT; -#ifdef CONFIG_NO_BOOTMEM - ptr = __alloc_memory_core_early(pgdat->node_id, size, align, - new_goal, -1ULL); -#else ptr = alloc_bootmem_core(pgdat->bdata, size, align, new_goal, 0); -#endif if (ptr) return ptr; } @@ -907,16 +768,6 @@ void * __init __alloc_bootmem_node_high(pg_data_t *pgdat, unsigned long size, void * __init alloc_bootmem_section(unsigned long size, unsigned long section_nr) { -#ifdef CONFIG_NO_BOOTMEM - unsigned long pfn, goal, limit; - - pfn = section_nr_to_pfn(section_nr); - goal = pfn << PAGE_SHIFT; - limit = section_nr_to_pfn(section_nr + 1) << PAGE_SHIFT; - - return __alloc_memory_core_early(early_pfn_to_nid(pfn), size, - SMP_CACHE_BYTES, goal, limit); -#else bootmem_data_t *bdata; unsigned long pfn, goal, limit; @@ -926,7 +777,6 @@ void * __init alloc_bootmem_section(unsigned long size, bdata = &bootmem_node_data[early_pfn_to_nid(pfn)]; return alloc_bootmem_core(bdata, size, SMP_CACHE_BYTES, goal, limit); -#endif } #endif @@ -938,16 +788,11 @@ void * __init __alloc_bootmem_node_nopanic(pg_data_t *pgdat, unsigned long size, if (WARN_ON_ONCE(slab_is_available())) return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id); -#ifdef CONFIG_NO_BOOTMEM - ptr = __alloc_memory_core_early(pgdat->node_id, size, align, - goal, -1ULL); -#else ptr = alloc_arch_preferred_bootmem(pgdat->bdata, size, align, goal, 0); if (ptr) return ptr; ptr = alloc_bootmem_core(pgdat->bdata, size, align, goal, 0); -#endif if (ptr) return ptr; @@ -995,21 +840,9 @@ void * __init __alloc_bootmem_low(unsigned long size, unsigned long 
align, void * __init __alloc_bootmem_low_node(pg_data_t *pgdat, unsigned long size, unsigned long align, unsigned long goal) { - void *ptr; - if (WARN_ON_ONCE(slab_is_available())) return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id); -#ifdef CONFIG_NO_BOOTMEM - ptr = __alloc_memory_core_early(pgdat->node_id, size, align, + return ___alloc_bootmem_node(pgdat->bdata, size, align, goal, ARCH_LOW_ADDRESS_LIMIT); - if (ptr) - return ptr; - ptr = __alloc_memory_core_early(MAX_NUMNODES, size, align, - goal, ARCH_LOW_ADDRESS_LIMIT); -#else - ptr = ___alloc_bootmem_node(pgdat->bdata, size, align, - goal, ARCH_LOW_ADDRESS_LIMIT); -#endif - return ptr; } diff --git a/mm/nobootmem.c b/mm/nobootmem.c new file mode 100644 index 000000000000..f220b8d0a97d --- /dev/null +++ b/mm/nobootmem.c @@ -0,0 +1,405 @@ +/* + * bootmem - A boot-time physical memory allocator and configurator + * + * Copyright (C) 1999 Ingo Molnar + * 1999 Kanoj Sarcar, SGI + * 2008 Johannes Weiner + * + * Access to this subsystem has to be serialized externally (which is true + * for the boot process anyway). + */ +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#include "internal.h" + +unsigned long max_low_pfn; +unsigned long min_low_pfn; +unsigned long max_pfn; + +#ifdef CONFIG_CRASH_DUMP +/* + * If we have booted due to a crash, max_pfn will be a very low value. We need + * to know the amount of memory that the previous kernel used. + */ +unsigned long saved_max_pfn; +#endif + +/* + * free_bootmem_late - free bootmem pages directly to page allocator + * @addr: starting address of the range + * @size: size of the range in bytes + * + * This is only useful when the bootmem allocator has already been torn + * down, but we are still initializing the system. Pages are given directly + * to the page allocator, no bootmem metadata is updated because it is gone. + */ +void __init free_bootmem_late(unsigned long addr, unsigned long size) +{ + unsigned long cursor, end; + + kmemleak_free_part(__va(addr), size); + + cursor = PFN_UP(addr); + end = PFN_DOWN(addr + size); + + for (; cursor < end; cursor++) { + __free_pages_bootmem(pfn_to_page(cursor), 0); + totalram_pages++; + } +} + +static void __init __free_pages_memory(unsigned long start, unsigned long end) +{ + int i; + unsigned long start_aligned, end_aligned; + int order = ilog2(BITS_PER_LONG); + + start_aligned = (start + (BITS_PER_LONG - 1)) & ~(BITS_PER_LONG - 1); + end_aligned = end & ~(BITS_PER_LONG - 1); + + if (end_aligned <= start_aligned) { + for (i = start; i < end; i++) + __free_pages_bootmem(pfn_to_page(i), 0); + + return; + } + + for (i = start; i < start_aligned; i++) + __free_pages_bootmem(pfn_to_page(i), 0); + + for (i = start_aligned; i < end_aligned; i += BITS_PER_LONG) + __free_pages_bootmem(pfn_to_page(i), order); + + for (i = end_aligned; i < end; i++) + __free_pages_bootmem(pfn_to_page(i), 0); +} + +unsigned long __init free_all_memory_core_early(int nodeid) +{ + int i; + u64 start, end; + unsigned long count = 0; + struct range *range = NULL; + int nr_range; + + nr_range = get_free_all_memory_range(&range, nodeid); + + for (i = 0; i < nr_range; i++) { + start = range[i].start; + end = range[i].end; + count += end - start; + __free_pages_memory(start, end); + } + + return count; +} + +/** + * free_all_bootmem_node - release a node's free pages to the buddy allocator + * @pgdat: node to be released + * + * Returns the number of pages actually released. 
+ */ +unsigned long __init free_all_bootmem_node(pg_data_t *pgdat) +{ + register_page_bootmem_info_node(pgdat); + + /* free_all_memory_core_early(MAX_NUMNODES) will be called later */ + return 0; +} + +/** + * free_all_bootmem - release free pages to the buddy allocator + * + * Returns the number of pages actually released. + */ +unsigned long __init free_all_bootmem(void) +{ + /* + * We need to use MAX_NUMNODES instead of NODE_DATA(0)->node_id + * because in some case like Node0 doesnt have RAM installed + * low ram will be on Node1 + * Use MAX_NUMNODES will make sure all ranges in early_node_map[] + * will be used instead of only Node0 related + */ + return free_all_memory_core_early(MAX_NUMNODES); +} + +/** + * free_bootmem_node - mark a page range as usable + * @pgdat: node the range resides on + * @physaddr: starting address of the range + * @size: size of the range in bytes + * + * Partial pages will be considered reserved and left as they are. + * + * The range must reside completely on the specified node. + */ +void __init free_bootmem_node(pg_data_t *pgdat, unsigned long physaddr, + unsigned long size) +{ + kmemleak_free_part(__va(physaddr), size); + memblock_x86_free_range(physaddr, physaddr + size); +} + +/** + * free_bootmem - mark a page range as usable + * @addr: starting address of the range + * @size: size of the range in bytes + * + * Partial pages will be considered reserved and left as they are. + * + * The range must be contiguous but may span node boundaries. + */ +void __init free_bootmem(unsigned long addr, unsigned long size) +{ + kmemleak_free_part(__va(addr), size); + memblock_x86_free_range(addr, addr + size); +} + +static void * __init ___alloc_bootmem_nopanic(unsigned long size, + unsigned long align, + unsigned long goal, + unsigned long limit) +{ + void *ptr; + + if (WARN_ON_ONCE(slab_is_available())) + return kzalloc(size, GFP_NOWAIT); + +restart: + + ptr = __alloc_memory_core_early(MAX_NUMNODES, size, align, goal, limit); + + if (ptr) + return ptr; + + if (goal != 0) { + goal = 0; + goto restart; + } + + return NULL; +} + +/** + * __alloc_bootmem_nopanic - allocate boot memory without panicking + * @size: size of the request in bytes + * @align: alignment of the region + * @goal: preferred starting address of the region + * + * The goal is dropped if it can not be satisfied and the allocation will + * fall back to memory below @goal. + * + * Allocation may happen on any node in the system. + * + * Returns NULL on failure. + */ +void * __init __alloc_bootmem_nopanic(unsigned long size, unsigned long align, + unsigned long goal) +{ + unsigned long limit = -1UL; + + return ___alloc_bootmem_nopanic(size, align, goal, limit); +} + +static void * __init ___alloc_bootmem(unsigned long size, unsigned long align, + unsigned long goal, unsigned long limit) +{ + void *mem = ___alloc_bootmem_nopanic(size, align, goal, limit); + + if (mem) + return mem; + /* + * Whoops, we cannot satisfy the allocation request. + */ + printk(KERN_ALERT "bootmem alloc of %lu bytes failed!\n", size); + panic("Out of memory"); + return NULL; +} + +/** + * __alloc_bootmem - allocate boot memory + * @size: size of the request in bytes + * @align: alignment of the region + * @goal: preferred starting address of the region + * + * The goal is dropped if it can not be satisfied and the allocation will + * fall back to memory below @goal. + * + * Allocation may happen on any node in the system. + * + * The function panics if the request can not be satisfied. 
+ */ +void * __init __alloc_bootmem(unsigned long size, unsigned long align, + unsigned long goal) +{ + unsigned long limit = -1UL; + + return ___alloc_bootmem(size, align, goal, limit); +} + +/** + * __alloc_bootmem_node - allocate boot memory from a specific node + * @pgdat: node to allocate from + * @size: size of the request in bytes + * @align: alignment of the region + * @goal: preferred starting address of the region + * + * The goal is dropped if it can not be satisfied and the allocation will + * fall back to memory below @goal. + * + * Allocation may fall back to any node in the system if the specified node + * can not hold the requested memory. + * + * The function panics if the request can not be satisfied. + */ +void * __init __alloc_bootmem_node(pg_data_t *pgdat, unsigned long size, + unsigned long align, unsigned long goal) +{ + void *ptr; + + if (WARN_ON_ONCE(slab_is_available())) + return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id); + + ptr = __alloc_memory_core_early(pgdat->node_id, size, align, + goal, -1ULL); + if (ptr) + return ptr; + + return __alloc_memory_core_early(MAX_NUMNODES, size, align, + goal, -1ULL); +} + +void * __init __alloc_bootmem_node_high(pg_data_t *pgdat, unsigned long size, + unsigned long align, unsigned long goal) +{ +#ifdef MAX_DMA32_PFN + unsigned long end_pfn; + + if (WARN_ON_ONCE(slab_is_available())) + return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id); + + /* update goal according ...MAX_DMA32_PFN */ + end_pfn = pgdat->node_start_pfn + pgdat->node_spanned_pages; + + if (end_pfn > MAX_DMA32_PFN + (128 >> (20 - PAGE_SHIFT)) && + (goal >> PAGE_SHIFT) < MAX_DMA32_PFN) { + void *ptr; + unsigned long new_goal; + + new_goal = MAX_DMA32_PFN << PAGE_SHIFT; + ptr = __alloc_memory_core_early(pgdat->node_id, size, align, + new_goal, -1ULL); + if (ptr) + return ptr; + } +#endif + + return __alloc_bootmem_node(pgdat, size, align, goal); + +} + +#ifdef CONFIG_SPARSEMEM +/** + * alloc_bootmem_section - allocate boot memory from a specific section + * @size: size of the request in bytes + * @section_nr: sparse map section to allocate from + * + * Return NULL on failure. + */ +void * __init alloc_bootmem_section(unsigned long size, + unsigned long section_nr) +{ + unsigned long pfn, goal, limit; + + pfn = section_nr_to_pfn(section_nr); + goal = pfn << PAGE_SHIFT; + limit = section_nr_to_pfn(section_nr + 1) << PAGE_SHIFT; + + return __alloc_memory_core_early(early_pfn_to_nid(pfn), size, + SMP_CACHE_BYTES, goal, limit); +} +#endif + +void * __init __alloc_bootmem_node_nopanic(pg_data_t *pgdat, unsigned long size, + unsigned long align, unsigned long goal) +{ + void *ptr; + + if (WARN_ON_ONCE(slab_is_available())) + return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id); + + ptr = __alloc_memory_core_early(pgdat->node_id, size, align, + goal, -1ULL); + if (ptr) + return ptr; + + return __alloc_bootmem_nopanic(size, align, goal); +} + +#ifndef ARCH_LOW_ADDRESS_LIMIT +#define ARCH_LOW_ADDRESS_LIMIT 0xffffffffUL +#endif + +/** + * __alloc_bootmem_low - allocate low boot memory + * @size: size of the request in bytes + * @align: alignment of the region + * @goal: preferred starting address of the region + * + * The goal is dropped if it can not be satisfied and the allocation will + * fall back to memory below @goal. + * + * Allocation may happen on any node in the system. + * + * The function panics if the request can not be satisfied. 
+ */ +void * __init __alloc_bootmem_low(unsigned long size, unsigned long align, + unsigned long goal) +{ + return ___alloc_bootmem(size, align, goal, ARCH_LOW_ADDRESS_LIMIT); +} + +/** + * __alloc_bootmem_low_node - allocate low boot memory from a specific node + * @pgdat: node to allocate from + * @size: size of the request in bytes + * @align: alignment of the region + * @goal: preferred starting address of the region + * + * The goal is dropped if it can not be satisfied and the allocation will + * fall back to memory below @goal. + * + * Allocation may fall back to any node in the system if the specified node + * can not hold the requested memory. + * + * The function panics if the request can not be satisfied. + */ +void * __init __alloc_bootmem_low_node(pg_data_t *pgdat, unsigned long size, + unsigned long align, unsigned long goal) +{ + void *ptr; + + if (WARN_ON_ONCE(slab_is_available())) + return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id); + + ptr = __alloc_memory_core_early(pgdat->node_id, size, align, + goal, ARCH_LOW_ADDRESS_LIMIT); + if (ptr) + return ptr; + + return __alloc_memory_core_early(MAX_NUMNODES, size, align, + goal, ARCH_LOW_ADDRESS_LIMIT); +} From e782ab421bbba1912c87934bd0e8998630736418 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Thu, 24 Feb 2011 14:43:06 +0100 Subject: [PATCH 461/706] bootmem: Move contig_page_data definition to bootmem.c/nobootmem.c Now that bootmem.c and nobootmem.c are separate, it's cleaner to define contig_page_data in each file than in page_alloc.c with #ifdef. Move it. This patch doesn't introduce any behavior change. -v2: According to Andrew, fixed the struct layout. -tj: Updated commit description. Signed-off-by: Yinghai Lu Acked-by: Andrew Morton Signed-off-by: Tejun Heo --- mm/bootmem.c | 7 +++++++ mm/nobootmem.c | 5 +++++ mm/page_alloc.c | 9 --------- 3 files changed, 12 insertions(+), 9 deletions(-) diff --git a/mm/bootmem.c b/mm/bootmem.c index 4403e2fbc13d..07aeb89e396e 100644 --- a/mm/bootmem.c +++ b/mm/bootmem.c @@ -23,6 +23,13 @@ #include "internal.h" +#ifndef CONFIG_NEED_MULTIPLE_NODES +struct pglist_data __refdata contig_page_data = { + .bdata = &bootmem_node_data[0] +}; +EXPORT_SYMBOL(contig_page_data); +#endif + unsigned long max_low_pfn; unsigned long min_low_pfn; unsigned long max_pfn; diff --git a/mm/nobootmem.c b/mm/nobootmem.c index f220b8d0a97d..6a018e49b7be 100644 --- a/mm/nobootmem.c +++ b/mm/nobootmem.c @@ -23,6 +23,11 @@ #include "internal.h" +#ifndef CONFIG_NEED_MULTIPLE_NODES +struct pglist_data __refdata contig_page_data; +EXPORT_SYMBOL(contig_page_data); +#endif + unsigned long max_low_pfn; unsigned long min_low_pfn; unsigned long max_pfn; diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 887ce3bd823d..a243a7fd6922 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -4841,15 +4841,6 @@ void __init set_dma_reserve(unsigned long new_dma_reserve) dma_reserve = new_dma_reserve; } -#ifndef CONFIG_NEED_MULTIPLE_NODES -struct pglist_data __refdata contig_page_data = { -#ifndef CONFIG_NO_BOOTMEM - .bdata = &bootmem_node_data[0] -#endif - }; -EXPORT_SYMBOL(contig_page_data); -#endif - void __init free_area_init(unsigned long *zones_size) { free_area_init_node(0, zones_size, From 8bc1f91e1f0e977fb95b11d8fa686f5091888110 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Thu, 24 Feb 2011 14:43:06 +0100 Subject: [PATCH 462/706] bootmem: Move __alloc_memory_core_early() to nobootmem.c Now that bootmem.c and nobootmem.c are separate, there's no reason to define __alloc_memory_core_early(), which is used only by 
nobootmem, inside #ifdef in page_alloc.c. Move it to nobootmem.c and make it static. This patch doesn't introduce any behavior change. -tj: Updated commit description. Signed-off-by: Yinghai Lu Acked-by: Andrew Morton Signed-off-by: Tejun Heo --- include/linux/mm.h | 2 -- mm/nobootmem.c | 25 +++++++++++++++++++++++++ mm/page_alloc.c | 28 ---------------------------- 3 files changed, 25 insertions(+), 30 deletions(-) diff --git a/include/linux/mm.h b/include/linux/mm.h index f6385fc17ad4..679300c050f5 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1309,8 +1309,6 @@ int add_from_early_node_map(struct range *range, int az, int nr_range, int nid); u64 __init find_memory_core_early(int nid, u64 size, u64 align, u64 goal, u64 limit); -void *__alloc_memory_core_early(int nodeid, u64 size, u64 align, - u64 goal, u64 limit); typedef int (*work_fn_t)(unsigned long, unsigned long, void *); extern void work_with_active_regions(int nid, work_fn_t work_fn, void *data); extern void sparse_memory_present_with_active_regions(int nid); diff --git a/mm/nobootmem.c b/mm/nobootmem.c index 6a018e49b7be..e2bdb07079ce 100644 --- a/mm/nobootmem.c +++ b/mm/nobootmem.c @@ -40,6 +40,31 @@ unsigned long max_pfn; unsigned long saved_max_pfn; #endif +static void * __init __alloc_memory_core_early(int nid, u64 size, u64 align, + u64 goal, u64 limit) +{ + void *ptr; + u64 addr; + + if (limit > memblock.current_limit) + limit = memblock.current_limit; + + addr = find_memory_core_early(nid, size, align, goal, limit); + + if (addr == MEMBLOCK_ERROR) + return NULL; + + ptr = phys_to_virt(addr); + memset(ptr, 0, size); + memblock_x86_reserve_range(addr, addr + size, "BOOTMEM"); + /* + * The min_count is set to 0 so that bootmem allocated blocks + * are never reported as leaks. + */ + kmemleak_alloc(ptr, size, 0, 0); + return ptr; +} + /* * free_bootmem_late - free bootmem pages directly to page allocator * @addr: starting address of the range diff --git a/mm/page_alloc.c b/mm/page_alloc.c index a243a7fd6922..603513609ba5 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -3780,34 +3780,6 @@ int __init add_from_early_node_map(struct range *range, int az, return nr_range; } -#ifdef CONFIG_NO_BOOTMEM -void * __init __alloc_memory_core_early(int nid, u64 size, u64 align, - u64 goal, u64 limit) -{ - void *ptr; - u64 addr; - - if (limit > memblock.current_limit) - limit = memblock.current_limit; - - addr = find_memory_core_early(nid, size, align, goal, limit); - - if (addr == MEMBLOCK_ERROR) - return NULL; - - ptr = phys_to_virt(addr); - memset(ptr, 0, size); - memblock_x86_reserve_range(addr, addr + size, "BOOTMEM"); - /* - * The min_count is set to 0 so that bootmem allocated blocks - * are never reported as leaks. - */ - kmemleak_alloc(ptr, size, 0, 0); - return ptr; -} -#endif - - void __init work_with_active_regions(int nid, work_fn_t work_fn, void *data) { int i; From d1b19426b04787e48f2689923e28d37b488969b0 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Thu, 24 Feb 2011 14:46:24 +0100 Subject: [PATCH 463/706] x86: Rename e820_table_* to pgt_buf_* e820_table_{start|end|top}, which are used to buffer page table allocation during early boot, are now derived from memblock and don't have much to do with e820. Change the names so that they reflect what they're used for. This patch doesn't introduce any behavior change. -v2: Ingo found that earlier patch "x86: Use early pre-allocated page table buffer top-down" caused crash on 32bit and needed to be dropped. This patch was updated to reflect the change. 
-tj: Updated commit description. Signed-off-by: Yinghai Lu Signed-off-by: Tejun Heo --- arch/x86/include/asm/init.h | 6 +++--- arch/x86/mm/init.c | 20 ++++++++++---------- arch/x86/mm/init_32.c | 8 ++++---- arch/x86/mm/init_64.c | 4 ++-- arch/x86/xen/mmu.c | 2 +- 5 files changed, 20 insertions(+), 20 deletions(-) diff --git a/arch/x86/include/asm/init.h b/arch/x86/include/asm/init.h index 36fb1a6a5109..8dbe353e41e1 100644 --- a/arch/x86/include/asm/init.h +++ b/arch/x86/include/asm/init.h @@ -11,8 +11,8 @@ kernel_physical_mapping_init(unsigned long start, unsigned long page_size_mask); -extern unsigned long __initdata e820_table_start; -extern unsigned long __meminitdata e820_table_end; -extern unsigned long __meminitdata e820_table_top; +extern unsigned long __initdata pgt_buf_start; +extern unsigned long __meminitdata pgt_buf_end; +extern unsigned long __meminitdata pgt_buf_top; #endif /* _ASM_X86_INIT_32_H */ diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c index b8054e087ead..286d289b039b 100644 --- a/arch/x86/mm/init.c +++ b/arch/x86/mm/init.c @@ -18,9 +18,9 @@ DEFINE_PER_CPU(struct mmu_gather, mmu_gathers); -unsigned long __initdata e820_table_start; -unsigned long __meminitdata e820_table_end; -unsigned long __meminitdata e820_table_top; +unsigned long __initdata pgt_buf_start; +unsigned long __meminitdata pgt_buf_end; +unsigned long __meminitdata pgt_buf_top; int after_bootmem; @@ -73,12 +73,12 @@ static void __init find_early_table_space(unsigned long end, int use_pse, if (base == MEMBLOCK_ERROR) panic("Cannot find space for the kernel page tables"); - e820_table_start = base >> PAGE_SHIFT; - e820_table_end = e820_table_start; - e820_table_top = e820_table_start + (tables >> PAGE_SHIFT); + pgt_buf_start = base >> PAGE_SHIFT; + pgt_buf_end = pgt_buf_start; + pgt_buf_top = pgt_buf_start + (tables >> PAGE_SHIFT); printk(KERN_DEBUG "kernel direct mapping tables up to %lx @ %lx-%lx\n", - end, e820_table_start << PAGE_SHIFT, e820_table_top << PAGE_SHIFT); + end, pgt_buf_start << PAGE_SHIFT, pgt_buf_top << PAGE_SHIFT); } struct map_range { @@ -272,9 +272,9 @@ unsigned long __init_refok init_memory_mapping(unsigned long start, __flush_tlb_all(); - if (!after_bootmem && e820_table_end > e820_table_start) - memblock_x86_reserve_range(e820_table_start << PAGE_SHIFT, - e820_table_end << PAGE_SHIFT, "PGTABLE"); + if (!after_bootmem && pgt_buf_end > pgt_buf_start) + memblock_x86_reserve_range(pgt_buf_start << PAGE_SHIFT, + pgt_buf_end << PAGE_SHIFT, "PGTABLE"); if (!after_bootmem) early_memtest(start, end); diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index 5d43fa5141c6..73ad7ebd6e9c 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c @@ -62,10 +62,10 @@ bool __read_mostly __vmalloc_start_set = false; static __init void *alloc_low_page(void) { - unsigned long pfn = e820_table_end++; + unsigned long pfn = pgt_buf_end++; void *adr; - if (pfn >= e820_table_top) + if (pfn >= pgt_buf_top) panic("alloc_low_page: ran out of memory"); adr = __va(pfn * PAGE_SIZE); @@ -163,8 +163,8 @@ static pte_t *__init page_table_kmap_check(pte_t *pte, pmd_t *pmd, if (pmd_idx_kmap_begin != pmd_idx_kmap_end && (vaddr >> PMD_SHIFT) >= pmd_idx_kmap_begin && (vaddr >> PMD_SHIFT) <= pmd_idx_kmap_end - && ((__pa(pte) >> PAGE_SHIFT) < e820_table_start - || (__pa(pte) >> PAGE_SHIFT) >= e820_table_end)) { + && ((__pa(pte) >> PAGE_SHIFT) < pgt_buf_start + || (__pa(pte) >> PAGE_SHIFT) >= pgt_buf_end)) { pte_t *newpte; int i; diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index 
4f1f461fc1e9..470cc4704a9a 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -314,7 +314,7 @@ void __init cleanup_highmap(void) static __ref void *alloc_low_page(unsigned long *phys) { - unsigned long pfn = e820_table_end++; + unsigned long pfn = pgt_buf_end++; void *adr; if (after_bootmem) { @@ -324,7 +324,7 @@ static __ref void *alloc_low_page(unsigned long *phys) return adr; } - if (pfn >= e820_table_top) + if (pfn >= pgt_buf_top) panic("alloc_low_page: ran out of memory"); adr = early_memremap(pfn * PAGE_SIZE, PAGE_SIZE); diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c index 5e92b61ad574..13783a18c6f1 100644 --- a/arch/x86/xen/mmu.c +++ b/arch/x86/xen/mmu.c @@ -1443,7 +1443,7 @@ static __init pte_t mask_rw_pte(pte_t *ptep, pte_t pte) * early_ioremap fixmap slot, make sure it is RO. */ if (!is_early_ioremap_ptep(ptep) && - pfn >= e820_table_start && pfn < e820_table_end) + pfn >= pgt_buf_start && pfn < pgt_buf_end) pte = pte_wrprotect(pte); return pte; From 1f565a896ee139a70e1a16f74a4ec29707691b0b Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Fri, 25 Feb 2011 10:06:39 +0100 Subject: [PATCH 464/706] x86-64, NUMA: Fix size of numa_distance array numa_distance should be sized like the SLIT, an NxN matrix where N is the highest node id + 1. This patch fixes the calculation to avoid overflowing the array on the subsequent iteration. -tj: The original patch used last index to calculate size. Yinghai pointed out it should be incremented so it is the number of elements instead of the last index to calculate the size of the table. Updated accordingly. Signed-off-by: David Rientjes Cc: Yinghai Lu Signed-off-by: Tejun Heo --- arch/x86/mm/numa_64.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/arch/x86/mm/numa_64.c b/arch/x86/mm/numa_64.c index cccc01d8415c..7757d2214fab 100644 --- a/arch/x86/mm/numa_64.c +++ b/arch/x86/mm/numa_64.c @@ -414,7 +414,8 @@ static int __init numa_alloc_distance(void) for_each_node_mask(i, nodes_parsed) cnt = i; - size = ++cnt * sizeof(numa_distance[0]); + cnt++; + size = cnt * cnt * sizeof(numa_distance[0]); phys = memblock_find_in_range(0, (u64)max_pfn_mapped << PAGE_SHIFT, size, PAGE_SIZE); From c16bfe9ac389b13a37ff617a09682ecc0685960f Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Fri, 25 Feb 2011 09:30:29 -0300 Subject: [PATCH 465/706] perf top browser: Fix up exit keys The left key was exiting 'perf top --tui' when it really shouldn't, it was too easy to leave the live annotation window and then press one too many <- and get out of the tool altogether. Do just like the report TUI does, ignore the left key for exit and also ask the user when pressing ESC if that is really what is wanted. 
Reported-by: Mike Galbraith Suggested-by: Ingo Molnar Cc: Frederic Weisbecker Cc: Ingo Molnar Cc: Mike Galbraith Cc: Paul Mackerras Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Tom Zanussi LKML-Reference: Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/ui/browsers/top.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/tools/perf/util/ui/browsers/top.c b/tools/perf/util/ui/browsers/top.c index 2f47224426b6..e9381ec9bfb1 100644 --- a/tools/perf/util/ui/browsers/top.c +++ b/tools/perf/util/ui/browsers/top.c @@ -10,6 +10,7 @@ #include "../../annotate.h" #include "../helpline.h" #include "../libslang.h" +#include "../util.h" #include "../../evlist.h" #include "../../hist.h" #include "../../sort.h" @@ -174,6 +175,12 @@ static int perf_top_browser__run(struct perf_top_browser *browser) if (browser->selection) perf_top_browser__annotate(browser); break; + case NEWT_KEY_LEFT: + continue; + case NEWT_KEY_ESCAPE: + if (!ui__dialog_yesno("Do you really want to exit?")) + continue; + /* Fall thru */ default: goto out; } From 7e9498705e810404ecf29bb2d6fa632b9484c609 Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Fri, 25 Feb 2011 14:32:28 +0100 Subject: [PATCH 466/706] sched: Add #ifdef around irq time accounting functions Get rid of this: kernel/sched.c:3731:13: warning: 'irqtime_account_idle_ticks' defined but not used kernel/sched.c:3732:13: warning: 'irqtime_account_process_tick' defined but not used Signed-off-by: Heiko Carstens Cc: Venkatesh Pallipadi Cc: Peter Zijlstra LKML-Reference: <20110225133228.GD7469@osiris.boeblingen.de.ibm.com> Signed-off-by: Ingo Molnar --- kernel/sched.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/kernel/sched.c b/kernel/sched.c index 2effcb71a478..79bca166bed7 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -3687,6 +3687,7 @@ void account_system_time(struct task_struct *p, int hardirq_offset, __account_system_time(p, cputime, cputime_scaled, target_cputime64); } +#ifndef CONFIG_VIRT_CPU_ACCOUNTING #ifdef CONFIG_IRQ_TIME_ACCOUNTING /* * Account a tick to a process and cpustat @@ -3753,6 +3754,7 @@ static void irqtime_account_idle_ticks(int ticks) {} static void irqtime_account_process_tick(struct task_struct *p, int user_tick, struct rq *rq) {} #endif +#endif /* !CONFIG_VIRT_CPU_ACCOUNTING */ /* * Account for involuntary wait time. From b210b3bb1b002f27165325a5edb6ebce3c168e92 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Fri, 25 Feb 2011 11:33:31 -0300 Subject: [PATCH 467/706] perf ui browser: Introduce ui_browser__show_title Needed because we were only showing the title in ui_browser__show, not in ui_browser__run; in the run loop we may be calling other browsers that would then change the title, so when we go back to the previous browser we need to redraw the title. We could have done this as the Newt help line, with pop, etc., but I don't think it's worth it; doing it explicitly, when needed (some browsers may not use the title area at all), seems enough and more flexible.
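The intended call pattern is a run loop that redraws its own title after a nested browser returns, along these lines (a hypothetical caller for illustration, not code from this patch):

	static int example_browser__run(struct ui_browser *browser)
	{
		while (1) {
			int key = ui_browser__run(browser);

			if (key == NEWT_KEY_ENTER) {
				/* hypothetical nested browser; may overwrite row 0 */
				run_nested_browser();
				/* restore our title once it returns */
				ui_browser__show_title(browser, "example");
				continue;
			}
			return key;
		}
	}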
Cc: Frederic Weisbecker Cc: Ingo Molnar Cc: Mike Galbraith Cc: Paul Mackerras Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Tom Zanussi LKML-Reference: Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/ui/browser.c | 18 +++++++++++++++--- tools/perf/util/ui/browser.h | 3 ++- 2 files changed, 17 insertions(+), 4 deletions(-) diff --git a/tools/perf/util/ui/browser.c b/tools/perf/util/ui/browser.c index 60d6c815e1db..611219f80680 100644 --- a/tools/perf/util/ui/browser.c +++ b/tools/perf/util/ui/browser.c @@ -157,6 +157,20 @@ void ui_browser__add_exit_keys(struct ui_browser *self, int keys[]) } } +void __ui_browser__show_title(struct ui_browser *browser, const char *title) +{ + SLsmg_gotorc(0, 0); + ui_browser__set_color(browser, NEWT_COLORSET_ROOT); + slsmg_write_nstring(title, browser->width); +} + +void ui_browser__show_title(struct ui_browser *browser, const char *title) +{ + pthread_mutex_lock(&ui__lock); + __ui_browser__show_title(browser, title); + pthread_mutex_unlock(&ui__lock); +} + int ui_browser__show(struct ui_browser *self, const char *title, const char *helpline, ...) { @@ -180,9 +194,7 @@ int ui_browser__show(struct ui_browser *self, const char *title, return -1; pthread_mutex_lock(&ui__lock); - SLsmg_gotorc(0, 0); - ui_browser__set_color(self, NEWT_COLORSET_ROOT); - slsmg_write_nstring(title, self->width); + __ui_browser__show_title(self, title); ui_browser__add_exit_keys(self, keys); newtFormAddComponent(self->form, self->sb); diff --git a/tools/perf/util/ui/browser.h b/tools/perf/util/ui/browser.h index 0dc7e4da36f5..fc63dda10910 100644 --- a/tools/perf/util/ui/browser.h +++ b/tools/perf/util/ui/browser.h @@ -24,7 +24,6 @@ struct ui_browser { u32 nr_entries; }; - void ui_browser__set_color(struct ui_browser *self, int color); void ui_browser__set_percent_color(struct ui_browser *self, double percent, bool current); @@ -35,6 +34,8 @@ void ui_browser__reset_index(struct ui_browser *self); void ui_browser__gotorc(struct ui_browser *self, int y, int x); void ui_browser__add_exit_key(struct ui_browser *self, int key); void ui_browser__add_exit_keys(struct ui_browser *self, int keys[]); +void __ui_browser__show_title(struct ui_browser *browser, const char *title); +void ui_browser__show_title(struct ui_browser *browser, const char *title); int ui_browser__show(struct ui_browser *self, const char *title, const char *helpline, ...); void ui_browser__hide(struct ui_browser *self); From a906fdaacca49917d83e5032dfc31f694249ad10 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 25 Feb 2011 16:09:31 +0100 Subject: [PATCH 468/706] x86: dt: Cleanup local apic setup Up to now we force enable the local apic in the devicetree setup unconditionally and set smp_found_config unconditionally to 1 when a devicetree blob is available. This breaks when the local apic is disabled in the Kconfig. Make it consistent by initializing the device tree explicitly before smp_get_config() so a non-lapic configuration could be used as well. To be functional, that would require implementing the PIT as an interrupt host, but the only user of this code until now is ce4100, which requires apics to be available. So we leave this up to those who need it.
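For reference, dtb_lapic_setup() in the diff below looks for a node of roughly this shape (a sketch; 0xfee00000 is the conventional x86 default base, not an address quoted in this patch):

	lapic: interrupt-controller@fee00000 {
		compatible = "intel,ce4100-lapic";
		reg = <0xfee00000 0x1000>;	/* r.start -> register_lapic_address() */
	};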
Tested-by: Sebastian Siewior Signed-off-by: Thomas Gleixner --- arch/x86/include/asm/apic.h | 2 +- arch/x86/include/asm/prom.h | 6 ++--- arch/x86/kernel/apic/apic.c | 6 ++--- arch/x86/kernel/devicetree.c | 40 +++++++++++++++---------------- arch/x86/kernel/setup.c | 2 +- arch/x86/platform/ce4100/ce4100.c | 4 ++-- 6 files changed, 29 insertions(+), 31 deletions(-) diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h index 3c896946f4cc..4afe512120a9 100644 --- a/arch/x86/include/asm/apic.h +++ b/arch/x86/include/asm/apic.h @@ -240,7 +240,7 @@ extern void setup_boot_APIC_clock(void); extern void setup_secondary_APIC_clock(void); extern int APIC_init_uniprocessor(void); extern void enable_NMI_through_LVT0(void); -extern int apic_force_enable(void); +extern int apic_force_enable(unsigned long addr); /* * On 32bit this is mach-xxx local diff --git a/arch/x86/include/asm/prom.h b/arch/x86/include/asm/prom.h index f58361ba7357..971e0b46446e 100644 --- a/arch/x86/include/asm/prom.h +++ b/arch/x86/include/asm/prom.h @@ -28,9 +28,8 @@ extern int of_ioapic; extern u64 initial_dtb; extern void add_dtb(u64 data); extern void x86_add_irq_domains(void); -void x86_dtb_find_config(void); -void x86_dtb_get_config(unsigned int unused); void __cpuinit x86_of_pci_init(void); +void x86_dtb_init(void); static inline struct device_node *pci_device_to_OF_node(struct pci_dev *pdev) { @@ -46,8 +45,7 @@ static inline struct device_node *pci_bus_to_OF_node(struct pci_bus *bus) static inline void add_dtb(u64 data) { } static inline void x86_add_irq_domains(void) { } static inline void x86_of_pci_init(void) { } -#define x86_dtb_find_config x86_init_noop -#define x86_dtb_get_config x86_init_uint_noop +static inline void x86_dtb_init(void) { } #define of_ioapic 0 #endif diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index f0e079823c43..4f43312cfbf8 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -1563,7 +1563,7 @@ static int apic_verify(void) return 0; } -int apic_force_enable(void) +int apic_force_enable(unsigned long addr) { u32 h, l; @@ -1579,7 +1579,7 @@ int apic_force_enable(void) if (!(l & MSR_IA32_APICBASE_ENABLE)) { pr_info("Local APIC disabled by BIOS -- reenabling.\n"); l &= ~MSR_IA32_APICBASE_BASE; - l |= MSR_IA32_APICBASE_ENABLE | APIC_DEFAULT_PHYS_BASE; + l |= MSR_IA32_APICBASE_ENABLE | addr; wrmsr(MSR_IA32_APICBASE, l, h); enabled_via_apicbase = 1; } @@ -1620,7 +1620,7 @@ static int __init detect_init_APIC(void) "you can enable it with \"lapic\"\n"); return -1; } - if (apic_force_enable()) + if (apic_force_enable(APIC_DEFAULT_PHYS_BASE)) return -1; } else { if (apic_verify()) diff --git a/arch/x86/kernel/devicetree.c b/arch/x86/kernel/devicetree.c index 06e5e91939c5..7a8cebc9ff29 100644 --- a/arch/x86/kernel/devicetree.c +++ b/arch/x86/kernel/devicetree.c @@ -26,6 +26,7 @@ static DEFINE_RAW_SPINLOCK(big_irq_lock); int __initdata of_ioapic; +#ifdef CONFIG_X86_IO_APIC static void add_interrupt_host(struct irq_domain *ih) { unsigned long flags; @@ -34,6 +35,7 @@ static void add_interrupt_host(struct irq_domain *ih) list_add(&ih->l, &irq_domains); raw_spin_unlock_irqrestore(&big_irq_lock, flags); } +#endif static struct irq_domain *get_ih_from_node(struct device_node *controller) { @@ -223,18 +225,28 @@ static void __init dtb_setup_hpet(void) static void __init dtb_lapic_setup(void) { #ifdef CONFIG_X86_LOCAL_APIC - if (apic_force_enable()) + struct device_node *dn; + struct resource r; + int ret; + + dn = of_find_compatible_node(NULL, NULL, 
"intel,ce4100-lapic"); + if (!dn) return; + ret = of_address_to_resource(dn, 0, &r); + if (WARN_ON(ret)) + return; + + /* Did the boot loader setup the local APIC ? */ + if (!cpu_has_apic) { + if (apic_force_enable(r.start)) + return; + } smp_found_config = 1; pic_mode = 1; - /* Required for ioapic registration */ - set_fixmap_nocache(FIX_APIC_BASE, mp_lapic_addr); - if (boot_cpu_physical_apicid == -1U) - boot_cpu_physical_apicid = read_apic_id(); - + register_lapic_address(r.start); generic_processor_info(boot_cpu_physical_apicid, - GET_APIC_VERSION(apic_read(APIC_LVR))); + GET_APIC_VERSION(apic_read(APIC_LVR))); #endif } @@ -259,9 +271,6 @@ static void __init dtb_ioapic_setup(void) { struct device_node *dn; - if (!smp_found_config) - return; - for_each_compatible_node(dn, NULL, "intel,ce4100-ioapic") dtb_add_ioapic(dn); @@ -270,7 +279,6 @@ static void __init dtb_ioapic_setup(void) return; } printk(KERN_ERR "Error: No information about IO-APIC in OF.\n"); - smp_found_config = 0; } #else static void __init dtb_ioapic_setup(void) {} @@ -282,14 +290,6 @@ static void __init dtb_apic_setup(void) dtb_ioapic_setup(); } -void __init x86_dtb_find_config(void) -{ - if (initial_dtb) - smp_found_config = 1; - else - printk(KERN_ERR "Missing device tree!.\n"); -} - #ifdef CONFIG_OF_FLATTREE static void __init x86_flattree_get_config(void) { @@ -325,7 +325,7 @@ static void __init x86_flattree_get_config(void) static inline void x86_flattree_get_config(void) { } #endif -void __init x86_dtb_get_config(unsigned int unused) +void __init x86_dtb_init(void) { x86_flattree_get_config(); diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 33dcbce1ab0f..b3143bc74e6c 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -1044,8 +1044,8 @@ void __init setup_arch(char **cmdline_p) * Read APIC and some other early information from ACPI tables. */ acpi_boot_init(); - sfi_init(); + x86_dtb_init(); /* * get boot-time SMP configuration: diff --git a/arch/x86/platform/ce4100/ce4100.c b/arch/x86/platform/ce4100/ce4100.c index b3436d344b41..68c0dbcc95be 100644 --- a/arch/x86/platform/ce4100/ce4100.c +++ b/arch/x86/platform/ce4100/ce4100.c @@ -134,8 +134,8 @@ void __init x86_ce4100_early_setup(void) x86_init.oem.arch_setup = sdv_arch_setup; x86_platform.i8042_detect = ce4100_i8042_detect; x86_init.resources.probe_roms = x86_init_noop; - x86_init.mpparse.get_smp_config = x86_dtb_get_config; - x86_init.mpparse.find_smp_config = x86_dtb_find_config; + x86_init.mpparse.get_smp_config = x86_init_uint_noop; + x86_init.mpparse.find_smp_config = x86_init_noop; #ifdef CONFIG_X86_IO_APIC x86_init.pci.init_irq = sdv_pci_init; From 1204e95689f9fbd245a4ce5c1b0cd0a9b77f8d25 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 25 Feb 2011 17:17:18 +0100 Subject: [PATCH 469/706] genirq: Make warning in handle_percpu_event useful The WARN_ON_ONCE in handle_percpu_event() which emits a warning when an action handler returns with interrupts enabled is not really useful. It does not reveal the interrupt number and handler function which caused it. Make it WARN_ONCE() and add the information. 
Reported-by: Tony Luck Signed-off-by: Thomas Gleixner --- kernel/irq/handle.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c index e099e9e9de0b..b110c835e070 100644 --- a/kernel/irq/handle.c +++ b/kernel/irq/handle.c @@ -64,7 +64,8 @@ handle_irq_event_percpu(struct irq_desc *desc, struct irqaction *action) res = action->handler(irq, action->dev_id); trace_irq_handler_exit(irq, action, res); - if (WARN_ON_ONCE(!irqs_disabled())) + if (WARN_ONCE(!irqs_disabled(),"irq %u handler %pF enabled interrupts\n", + irq, action->handler)) local_irq_disable(); switch (res) { From 702d4eb9b3de4398ab99cf0a4e799e552c7ab756 Mon Sep 17 00:00:00 2001 From: Stefano Stabellini Date: Thu, 2 Dec 2010 17:54:50 +0000 Subject: [PATCH 470/706] xen: no need to delay xen_setup_shutdown_event for hvm guests anymore Now that xenstore_ready is used correctly for PV on HVM guests too, we don't need to delay the initialization of xen_setup_shutdown_event anymore. Signed-off-by: Stefano Stabellini Acked-by: Jeremy Fitzhardinge --- drivers/xen/manage.c | 17 ++++------------- drivers/xen/platform-pci.c | 3 --- 2 files changed, 4 insertions(+), 16 deletions(-) diff --git a/drivers/xen/manage.c b/drivers/xen/manage.c index 24177272bcb8..b2a8d7856ce3 100644 --- a/drivers/xen/manage.c +++ b/drivers/xen/manage.c @@ -291,27 +291,18 @@ static int shutdown_event(struct notifier_block *notifier, return NOTIFY_DONE; } -static int __init __setup_shutdown_event(void) -{ - /* Delay initialization in the PV on HVM case */ - if (xen_hvm_domain()) - return 0; - - if (!xen_pv_domain()) - return -ENODEV; - - return xen_setup_shutdown_event(); -} - int xen_setup_shutdown_event(void) { static struct notifier_block xenstore_notifier = { .notifier_call = shutdown_event }; + + if (!xen_domain()) + return -ENODEV; register_xenstore_notifier(&xenstore_notifier); return 0; } EXPORT_SYMBOL_GPL(xen_setup_shutdown_event); -subsys_initcall(__setup_shutdown_event); +subsys_initcall(xen_setup_shutdown_event); diff --git a/drivers/xen/platform-pci.c b/drivers/xen/platform-pci.c index afbe041f42c5..319dd0a94d51 100644 --- a/drivers/xen/platform-pci.c +++ b/drivers/xen/platform-pci.c @@ -156,9 +156,6 @@ static int __devinit platform_pci_init(struct pci_dev *pdev, if (ret) goto out; xenbus_probe(NULL); - ret = xen_setup_shutdown_event(); - if (ret) - goto out; return 0; out: From cff520b9c2ee1486ea9ff1dbc774510c62e5ecb9 Mon Sep 17 00:00:00 2001 From: Stefano Stabellini Date: Thu, 2 Dec 2010 17:54:54 +0000 Subject: [PATCH 471/706] xen: do not use xen_info on HVM, set pv_info name to "Xen HVM" Signed-off-by: Stefano Stabellini Acked-by: Jeremy Fitzhardinge --- arch/x86/xen/enlighten.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index 50542efe45fb..2f67e2ea222e 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -1284,8 +1284,7 @@ static int init_hvm_pv_info(int *major, int *minor) xen_setup_features(); - pv_info = xen_info; - pv_info.kernel_rpl = 0; + pv_info.name = "Xen HVM"; xen_domain_type = XEN_HVM_DOMAIN; From c80a420995e721099906607b07c09a24543b31d9 Mon Sep 17 00:00:00 2001 From: Stefano Stabellini Date: Thu, 2 Dec 2010 17:55:00 +0000 Subject: [PATCH 472/706] xen-blkfront: handle Xen major numbers other than XENVBD This patch makes sure blkfront handles correctly virtual device numbers corresponding to Xen emulated IDE and SCSI disks: in those cases blkfront translates the major number to XENVBD and 
the minor number to a low xvd minor. Note: this behaviour is different from what old xenlinux PV guests used to do: they used to steal an IDE or SCSI major number and use it instead. Signed-off-by: Stefano Stabellini Acked-by: Jeremy Fitzhardinge --- drivers/block/xen-blkfront.c | 79 ++++++++++++++++++++++++++++++-- include/xen/interface/io/blkif.h | 21 +++++++++ 2 files changed, 95 insertions(+), 5 deletions(-) diff --git a/drivers/block/xen-blkfront.c b/drivers/block/xen-blkfront.c index d7aa39e349a6..64d9c6dfc634 100644 --- a/drivers/block/xen-blkfront.c +++ b/drivers/block/xen-blkfront.c @@ -120,6 +120,10 @@ static DEFINE_SPINLOCK(minor_lock); #define EXTENDED (1<feature_flush ? "enabled" : "disabled"); } +static int xen_translate_vdev(int vdevice, int *minor, unsigned int *offset) +{ + int major; + major = BLKIF_MAJOR(vdevice); + *minor = BLKIF_MINOR(vdevice); + switch (major) { + case XEN_IDE0_MAJOR: + *offset = (*minor / 64) + EMULATED_HD_DISK_NAME_OFFSET; + *minor = ((*minor / 64) * PARTS_PER_DISK) + + EMULATED_HD_DISK_MINOR_OFFSET; + break; + case XEN_IDE1_MAJOR: + *offset = (*minor / 64) + 2 + EMULATED_HD_DISK_NAME_OFFSET; + *minor = (((*minor / 64) + 2) * PARTS_PER_DISK) + + EMULATED_HD_DISK_MINOR_OFFSET; + break; + case XEN_SCSI_DISK0_MAJOR: + *offset = (*minor / PARTS_PER_DISK) + EMULATED_SD_DISK_NAME_OFFSET; + *minor = *minor + EMULATED_SD_DISK_MINOR_OFFSET; + break; + case XEN_SCSI_DISK1_MAJOR: + case XEN_SCSI_DISK2_MAJOR: + case XEN_SCSI_DISK3_MAJOR: + case XEN_SCSI_DISK4_MAJOR: + case XEN_SCSI_DISK5_MAJOR: + case XEN_SCSI_DISK6_MAJOR: + case XEN_SCSI_DISK7_MAJOR: + *offset = (*minor / PARTS_PER_DISK) + + ((major - XEN_SCSI_DISK1_MAJOR + 1) * 16) + + EMULATED_SD_DISK_NAME_OFFSET; + *minor = *minor + + ((major - XEN_SCSI_DISK1_MAJOR + 1) * 16 * PARTS_PER_DISK) + + EMULATED_SD_DISK_MINOR_OFFSET; + break; + case XEN_SCSI_DISK8_MAJOR: + case XEN_SCSI_DISK9_MAJOR: + case XEN_SCSI_DISK10_MAJOR: + case XEN_SCSI_DISK11_MAJOR: + case XEN_SCSI_DISK12_MAJOR: + case XEN_SCSI_DISK13_MAJOR: + case XEN_SCSI_DISK14_MAJOR: + case XEN_SCSI_DISK15_MAJOR: + *offset = (*minor / PARTS_PER_DISK) + + ((major - XEN_SCSI_DISK8_MAJOR + 8) * 16) + + EMULATED_SD_DISK_NAME_OFFSET; + *minor = *minor + + ((major - XEN_SCSI_DISK8_MAJOR + 8) * 16 * PARTS_PER_DISK) + + EMULATED_SD_DISK_MINOR_OFFSET; + break; + case XENVBD_MAJOR: + *offset = *minor / PARTS_PER_DISK; + break; + default: + printk(KERN_WARNING "blkfront: your disk configuration is " + "incorrect, please use an xvd device instead\n"); + return -ENODEV; + } + return 0; +} static int xlvbd_alloc_gendisk(blkif_sector_t capacity, struct blkfront_info *info, @@ -441,7 +504,7 @@ static int xlvbd_alloc_gendisk(blkif_sector_t capacity, { struct gendisk *gd; int nr_minors = 1; - int err = -ENODEV; + int err; unsigned int offset; int minor; int nr_parts; @@ -456,12 +519,20 @@ static int xlvbd_alloc_gendisk(blkif_sector_t capacity, } if (!VDEV_IS_EXTENDED(info->vdevice)) { - minor = BLKIF_MINOR(info->vdevice); - nr_parts = PARTS_PER_DISK; + err = xen_translate_vdev(info->vdevice, &minor, &offset); + if (err) + return err; + nr_parts = PARTS_PER_DISK; } else { minor = BLKIF_MINOR_EXT(info->vdevice); nr_parts = PARTS_PER_EXT_DISK; + offset = minor / nr_parts; + if (xen_hvm_domain() && offset <= EMULATED_HD_DISK_NAME_OFFSET + 4) + printk(KERN_WARNING "blkfront: vdevice 0x%x might conflict with " + "emulated IDE disks,\n\t choose an xvd device name" + "from xvde on\n", info->vdevice); } + err = -ENODEV; if ((minor % nr_parts) == 0) nr_minors = nr_parts; @@ 
-475,8 +546,6 @@ static int xlvbd_alloc_gendisk(blkif_sector_t capacity, if (gd == NULL) goto release; - offset = minor / nr_parts; - if (nr_minors > 1) { if (offset < 26) sprintf(gd->disk_name, "%s%c", DEV_NAME, 'a' + offset); diff --git a/include/xen/interface/io/blkif.h b/include/xen/interface/io/blkif.h index c2d1fa4dc1ee..68dd2b49635c 100644 --- a/include/xen/interface/io/blkif.h +++ b/include/xen/interface/io/blkif.h @@ -91,4 +91,25 @@ DEFINE_RING_TYPES(blkif, struct blkif_request, struct blkif_response); #define VDISK_REMOVABLE 0x2 #define VDISK_READONLY 0x4 +/* Xen-defined major numbers for virtual disks, they look strangely + * familiar */ +#define XEN_IDE0_MAJOR 3 +#define XEN_IDE1_MAJOR 22 +#define XEN_SCSI_DISK0_MAJOR 8 +#define XEN_SCSI_DISK1_MAJOR 65 +#define XEN_SCSI_DISK2_MAJOR 66 +#define XEN_SCSI_DISK3_MAJOR 67 +#define XEN_SCSI_DISK4_MAJOR 68 +#define XEN_SCSI_DISK5_MAJOR 69 +#define XEN_SCSI_DISK6_MAJOR 70 +#define XEN_SCSI_DISK7_MAJOR 71 +#define XEN_SCSI_DISK8_MAJOR 128 +#define XEN_SCSI_DISK9_MAJOR 129 +#define XEN_SCSI_DISK10_MAJOR 130 +#define XEN_SCSI_DISK11_MAJOR 131 +#define XEN_SCSI_DISK12_MAJOR 132 +#define XEN_SCSI_DISK13_MAJOR 133 +#define XEN_SCSI_DISK14_MAJOR 134 +#define XEN_SCSI_DISK15_MAJOR 135 + #endif /* __XEN_PUBLIC_IO_BLKIF_H__ */ From 53d5522cad291a0e93a385e0594b6aea6b54a071 Mon Sep 17 00:00:00 2001 From: Stefano Stabellini Date: Thu, 2 Dec 2010 17:55:05 +0000 Subject: [PATCH 473/706] xen: make the ballon driver work for hvm domains Signed-off-by: Stefano Stabellini --- drivers/xen/balloon.c | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/drivers/xen/balloon.c b/drivers/xen/balloon.c index 43f9f02c7db0..9294f25dcb2c 100644 --- a/drivers/xen/balloon.c +++ b/drivers/xen/balloon.c @@ -232,7 +232,7 @@ static int increase_reservation(unsigned long nr_pages) set_phys_to_machine(pfn, frame_list[i]); /* Link back into the page tables if not highmem. */ - if (pfn < max_low_pfn) { + if (!xen_hvm_domain() && pfn < max_low_pfn) { int ret; ret = HYPERVISOR_update_va_mapping( (unsigned long)__va(pfn << PAGE_SHIFT), @@ -280,7 +280,7 @@ static int decrease_reservation(unsigned long nr_pages) scrub_page(page); - if (!PageHighMem(page)) { + if (!xen_hvm_domain() && !PageHighMem(page)) { ret = HYPERVISOR_update_va_mapping( (unsigned long)__va(pfn << PAGE_SHIFT), __pte_ma(0), 0); @@ -392,15 +392,19 @@ static struct notifier_block xenstore_notifier; static int __init balloon_init(void) { - unsigned long pfn, extra_pfn_end; + unsigned long pfn, nr_pages, extra_pfn_end; struct page *page; - if (!xen_pv_domain()) + if (!xen_domain()) return -ENODEV; pr_info("xen_balloon: Initialising balloon driver.\n"); - balloon_stats.current_pages = min(xen_start_info->nr_pages, max_pfn); + if (xen_pv_domain()) + nr_pages = xen_start_info->nr_pages; + else + nr_pages = max_pfn; + balloon_stats.current_pages = min(nr_pages, max_pfn); balloon_stats.target_pages = balloon_stats.current_pages; balloon_stats.balloon_low = 0; balloon_stats.balloon_high = 0; From 99bbb3a84a99cd04ab16b998b20f01a72cfa9f4f Mon Sep 17 00:00:00 2001 From: Stefano Stabellini Date: Thu, 2 Dec 2010 17:55:10 +0000 Subject: [PATCH 474/706] xen: PV on HVM: support PV spinlocks and IPIs Initialize PV spinlocks on boot CPU right after native_smp_prepare_cpus (that switch to APIC mode and initialize APIC routing); on secondary CPUs on CPU_UP_PREPARE. Enable the usage of event channels to send and receive IPIs when running as a PV on HVM guest. 
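A condensed view of the two initialization points, simplified from the hunks
below (both paths are taken only when xen_have_vector_callback is set):

	/* boot CPU: runs after native_smp_prepare_cpus() has switched to
	 * APIC mode and set up APIC routing */
	xen_init_lock_cpu(0);
	xen_init_spinlocks();

	/* secondary CPUs: from the CPU_UP_PREPARE notifier */
	if (xen_have_vector_callback)
		xen_init_lock_cpu(cpu);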
Signed-off-by: Stefano Stabellini --- arch/x86/xen/enlighten.c | 3 +++ arch/x86/xen/smp.c | 38 ++++++++++++++++++++++++++++++++++++++ arch/x86/xen/xen-ops.h | 2 ++ 3 files changed, 43 insertions(+) diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index 2f67e2ea222e..fe02574789c5 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -1330,6 +1330,8 @@ static int __cpuinit xen_hvm_cpu_notify(struct notifier_block *self, switch (action) { case CPU_UP_PREPARE: per_cpu(xen_vcpu, cpu) = &HYPERVISOR_shared_info->vcpu_info[cpu]; + if (xen_have_vector_callback) + xen_init_lock_cpu(cpu); break; default: break; @@ -1354,6 +1356,7 @@ static void __init xen_hvm_guest_init(void) if (xen_feature(XENFEAT_hvm_callback_vector)) xen_have_vector_callback = 1; + xen_hvm_smp_init(); register_cpu_notifier(&xen_hvm_cpu_notifier); xen_unplug_emulated_devices(); have_vcpu_info_placement = 0; diff --git a/arch/x86/xen/smp.c b/arch/x86/xen/smp.c index 72a4c7959045..30612441ed99 100644 --- a/arch/x86/xen/smp.c +++ b/arch/x86/xen/smp.c @@ -509,3 +509,41 @@ void __init xen_smp_init(void) xen_fill_possible_map(); xen_init_spinlocks(); } + +static void __init xen_hvm_smp_prepare_cpus(unsigned int max_cpus) +{ + native_smp_prepare_cpus(max_cpus); + WARN_ON(xen_smp_intr_init(0)); + + if (!xen_have_vector_callback) + return; + xen_init_lock_cpu(0); + xen_init_spinlocks(); +} + +static int __cpuinit xen_hvm_cpu_up(unsigned int cpu) +{ + int rc; + rc = native_cpu_up(cpu); + WARN_ON (xen_smp_intr_init(cpu)); + return rc; +} + +static void xen_hvm_cpu_die(unsigned int cpu) +{ + unbind_from_irqhandler(per_cpu(xen_resched_irq, cpu), NULL); + unbind_from_irqhandler(per_cpu(xen_callfunc_irq, cpu), NULL); + unbind_from_irqhandler(per_cpu(xen_debug_irq, cpu), NULL); + unbind_from_irqhandler(per_cpu(xen_callfuncsingle_irq, cpu), NULL); + native_cpu_die(cpu); +} + +void __init xen_hvm_smp_init(void) +{ + smp_ops.smp_prepare_cpus = xen_hvm_smp_prepare_cpus; + smp_ops.smp_send_reschedule = xen_smp_send_reschedule; + smp_ops.cpu_up = xen_hvm_cpu_up; + smp_ops.cpu_die = xen_hvm_cpu_die; + smp_ops.send_call_func_ipi = xen_smp_send_call_function_ipi; + smp_ops.send_call_func_single_ipi = xen_smp_send_call_function_single_ipi; +} diff --git a/arch/x86/xen/xen-ops.h b/arch/x86/xen/xen-ops.h index 9d41bf985757..3112f55638c4 100644 --- a/arch/x86/xen/xen-ops.h +++ b/arch/x86/xen/xen-ops.h @@ -64,10 +64,12 @@ void xen_setup_vcpu_info_placement(void); #ifdef CONFIG_SMP void xen_smp_init(void); +void __init xen_hvm_smp_init(void); extern cpumask_var_t xen_cpu_initialized_map; #else static inline void xen_smp_init(void) {} +static inline void xen_hvm_smp_init(void) {} #endif #ifdef CONFIG_PARAVIRT_SPINLOCKS From e057a4b6e0eb6701f6ec923be2075d4984cef51a Mon Sep 17 00:00:00 2001 From: Stefano Stabellini Date: Fri, 11 Feb 2011 17:55:13 +0000 Subject: [PATCH 475/706] xen: fix compile issue if XEN is enabled but XEN_PVHVM is disabled Signed-off-by: Stefano Stabellini --- arch/x86/xen/suspend.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/x86/xen/suspend.c b/arch/x86/xen/suspend.c index 9bbd63a129b5..4a3d3dd3dd30 100644 --- a/arch/x86/xen/suspend.c +++ b/arch/x86/xen/suspend.c @@ -28,6 +28,7 @@ void xen_pre_suspend(void) void xen_hvm_post_suspend(int suspend_cancelled) { +#ifdef CONFIG_XEN_PVHVM int cpu; xen_hvm_init_shared_info(); xen_callback_vector(); @@ -37,6 +38,7 @@ void xen_hvm_post_suspend(int suspend_cancelled) xen_setup_runstate_info(cpu); } } +#endif } void xen_post_suspend(int suspend_cancelled) 
From 552717231e50b478dfd19d63fd97879476ae051d Mon Sep 17 00:00:00 2001 From: Ian Campbell Date: Thu, 17 Feb 2011 11:04:20 +0000 Subject: [PATCH 476/706] xen: do not respond to unknown xenstore control requests The PV xenbus control/shutdown node is written by the toolstack as a request to the guest to perform a particular action (shutdown, reboot, suspend etc). The guest is expected to acknowledge that it will complete a request by clearing the control node. Previously it would acknowledge any request, even if it did not know what to do with it. Specifically in the case where CONFIG_PM_SLEEP is not enabled the kernel would acknowledge a suspend request even though it was not actually going to do anything. Instead make the kernel only acknowledge requests if it is actually going to do something with it. This will improve the toolstack's ability to diagnose and deal with failures. Signed-off-by: Ian Campbell Reviewed-by: Konrad Rzeszutek Wilk --- drivers/xen/manage.c | 49 +++++++++++++++++++++++++++++++++----------- 1 file changed, 37 insertions(+), 12 deletions(-) diff --git a/drivers/xen/manage.c b/drivers/xen/manage.c index b2a8d7856ce3..972bf783a182 100644 --- a/drivers/xen/manage.c +++ b/drivers/xen/manage.c @@ -172,12 +172,39 @@ out: } #endif /* CONFIG_PM_SLEEP */ +struct shutdown_handler { + const char *command; + void (*cb)(void); +}; + +static void do_poweroff(void) +{ + shutting_down = SHUTDOWN_POWEROFF; + orderly_poweroff(false); +} + +static void do_reboot(void) +{ + shutting_down = SHUTDOWN_POWEROFF; /* ? */ + ctrl_alt_del(); +} + static void shutdown_handler(struct xenbus_watch *watch, const char **vec, unsigned int len) { char *str; struct xenbus_transaction xbt; int err; + static struct shutdown_handler handlers[] = { + { "poweroff", do_poweroff }, + { "halt", do_poweroff }, + { "reboot", do_reboot }, +#ifdef CONFIG_PM_SLEEP + { "suspend", do_suspend }, +#endif + {NULL, NULL}, + }; + static struct shutdown_handler *handler; if (shutting_down != SHUTDOWN_INVALID) return; @@ -194,7 +221,14 @@ static void shutdown_handler(struct xenbus_watch *watch, return; } - xenbus_write(xbt, "control", "shutdown", ""); + for (handler = &handlers[0]; handler->command; handler++) { + if (strcmp(str, handler->command) == 0) + break; + } + + /* Only acknowledge commands which we are prepared to handle. */ + if (handler->cb) + xenbus_write(xbt, "control", "shutdown", ""); err = xenbus_transaction_end(xbt, 0); if (err == -EAGAIN) { @@ -202,17 +236,8 @@ static void shutdown_handler(struct xenbus_watch *watch, goto again; } - if (strcmp(str, "poweroff") == 0 || - strcmp(str, "halt") == 0) { - shutting_down = SHUTDOWN_POWEROFF; - orderly_poweroff(false); - } else if (strcmp(str, "reboot") == 0) { - shutting_down = SHUTDOWN_POWEROFF; /* ? */ - ctrl_alt_del(); -#ifdef CONFIG_PM_SLEEP - } else if (strcmp(str, "suspend") == 0) { - do_suspend(); -#endif + if (handler->cb) { + handler->cb(); } else { printk(KERN_INFO "Ignoring shutdown request: %s\n", str); shutting_down = SHUTDOWN_INVALID; From 8e15597fa430c03415e2268dfbae0f262b948788 Mon Sep 17 00:00:00 2001 From: Ian Campbell Date: Thu, 17 Feb 2011 11:04:20 +0000 Subject: [PATCH 477/706] xen: use new schedop interface for suspend Take the opportunity to comment on the semantics of the PV guest suspend hypercall arguments. 
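The distinction matters at the call sites: a PV guest must pass the start_info
MFN (which the tools expect in rdx/edx, the third hypercall argument), while an
HVM guest has no start_info to pass. A sketch of the two callers as they look
after the follow-up patches in this series:

	/* PV guest */
	cancelled = HYPERVISOR_suspend(virt_to_mfn(xen_start_info));

	/* PV-on-HVM guest */
	cancelled = HYPERVISOR_suspend(0UL);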
Signed-off-by: Ian Campbell Reviewed-by: Konrad Rzeszutek Wilk --- arch/x86/include/asm/xen/hypercall.h | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/arch/x86/include/asm/xen/hypercall.h b/arch/x86/include/asm/xen/hypercall.h index a3c28ae4025b..a7d5823862b4 100644 --- a/arch/x86/include/asm/xen/hypercall.h +++ b/arch/x86/include/asm/xen/hypercall.h @@ -422,10 +422,17 @@ HYPERVISOR_set_segment_base(int reg, unsigned long value) #endif static inline int -HYPERVISOR_suspend(unsigned long srec) +HYPERVISOR_suspend(unsigned long start_info_mfn) { - return _hypercall3(int, sched_op, SCHEDOP_shutdown, - SHUTDOWN_suspend, srec); + struct sched_shutdown r = { .reason = SHUTDOWN_suspend }; + + /* + * For a PV guest the tools require that the start_info mfn be + * present in rdx/edx when the hypercall is made. Per the + * hypercall calling convention this is the third hypercall + * argument, which is start_info_mfn here. + */ + return _hypercall3(int, sched_op_new, SCHEDOP_shutdown, &r, start_info_mfn); } static inline int From a8b7458363b9174f3c2196ca6085630b4b30b7a1 Mon Sep 17 00:00:00 2001 From: Ian Campbell Date: Thu, 17 Feb 2011 11:04:20 +0000 Subject: [PATCH 478/706] xen: switch to new schedop hypercall by default. Rename old interface to sched_op_compat and rename sched_op_new to simply sched_op. Signed-off-by: Ian Campbell Reviewed-by: Konrad Rzeszutek Wilk --- arch/x86/include/asm/xen/hypercall.h | 4 ++-- include/xen/interface/xen.h | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/arch/x86/include/asm/xen/hypercall.h b/arch/x86/include/asm/xen/hypercall.h index a7d5823862b4..8508bfe52296 100644 --- a/arch/x86/include/asm/xen/hypercall.h +++ b/arch/x86/include/asm/xen/hypercall.h @@ -287,7 +287,7 @@ HYPERVISOR_fpu_taskswitch(int set) static inline int HYPERVISOR_sched_op(int cmd, void *arg) { - return _hypercall2(int, sched_op_new, cmd, arg); + return _hypercall2(int, sched_op, cmd, arg); } static inline long @@ -432,7 +432,7 @@ HYPERVISOR_suspend(unsigned long start_info_mfn) * hypercall calling convention this is the third hypercall * argument, which is start_info_mfn here. 
*/ - return _hypercall3(int, sched_op_new, SCHEDOP_shutdown, &r, start_info_mfn); + return _hypercall3(int, sched_op, SCHEDOP_shutdown, &r, start_info_mfn); } static inline int diff --git a/include/xen/interface/xen.h b/include/xen/interface/xen.h index 2befa3e2f1bc..b33257bc7e83 100644 --- a/include/xen/interface/xen.h +++ b/include/xen/interface/xen.h @@ -30,7 +30,7 @@ #define __HYPERVISOR_stack_switch 3 #define __HYPERVISOR_set_callbacks 4 #define __HYPERVISOR_fpu_taskswitch 5 -#define __HYPERVISOR_sched_op 6 +#define __HYPERVISOR_sched_op_compat 6 #define __HYPERVISOR_dom0_op 7 #define __HYPERVISOR_set_debugreg 8 #define __HYPERVISOR_get_debugreg 9 @@ -52,7 +52,7 @@ #define __HYPERVISOR_mmuext_op 26 #define __HYPERVISOR_acm_op 27 #define __HYPERVISOR_nmi_op 28 -#define __HYPERVISOR_sched_op_new 29 +#define __HYPERVISOR_sched_op 29 #define __HYPERVISOR_callback_op 30 #define __HYPERVISOR_xenoprof_op 31 #define __HYPERVISOR_event_channel_op 32 From bd1c0ad28451df4610d352c7e438213c84de0c28 Mon Sep 17 00:00:00 2001 From: Ian Campbell Date: Thu, 17 Feb 2011 11:04:20 +0000 Subject: [PATCH 479/706] xen: suspend: use HYPERVISOR_suspend for PVHVM case instead of open coding Signed-off-by: Ian Campbell Reviewed-by: Konrad Rzeszutek Wilk --- drivers/xen/manage.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/drivers/xen/manage.c b/drivers/xen/manage.c index 972bf783a182..4dd01865ad18 100644 --- a/drivers/xen/manage.c +++ b/drivers/xen/manage.c @@ -38,7 +38,6 @@ static enum shutdown_state shutting_down = SHUTDOWN_INVALID; static int xen_hvm_suspend(void *data) { int err; - struct sched_shutdown r = { .reason = SHUTDOWN_suspend }; int *cancelled = data; BUG_ON(!irqs_disabled()); @@ -50,7 +49,12 @@ static int xen_hvm_suspend(void *data) return err; } - *cancelled = HYPERVISOR_sched_op(SCHEDOP_shutdown, &r); + /* + * This hypercall returns 1 if suspend was cancelled + * or the domain was merely checkpointed, and 0 if it + * is resuming in a new domain. + */ + *cancelled = HYPERVISOR_suspend(0UL); xen_hvm_post_suspend(*cancelled); gnttab_resume(); From ceb180294790c8a6a437533488616f6b591b49d0 Mon Sep 17 00:00:00 2001 From: Ian Campbell Date: Thu, 17 Feb 2011 11:04:20 +0000 Subject: [PATCH 480/706] xen: suspend: refactor cancellation flag into a structure Will add extra fields in subsequent patches. Signed-off-by: Ian Campbell Reviewed-by: Konrad Rzeszutek Wilk --- drivers/xen/manage.c | 32 +++++++++++++++++++------------- 1 file changed, 19 insertions(+), 13 deletions(-) diff --git a/drivers/xen/manage.c b/drivers/xen/manage.c index 4dd01865ad18..5c0184fb9d84 100644 --- a/drivers/xen/manage.c +++ b/drivers/xen/manage.c @@ -34,11 +34,15 @@ enum shutdown_state { /* Ignore multiple shutdown requests. */ static enum shutdown_state shutting_down = SHUTDOWN_INVALID; +struct suspend_info { + int cancelled; +}; + #ifdef CONFIG_PM_SLEEP static int xen_hvm_suspend(void *data) { + struct suspend_info *si = data; int err; - int *cancelled = data; BUG_ON(!irqs_disabled()); @@ -54,12 +58,12 @@ static int xen_hvm_suspend(void *data) * or the domain was merely checkpointed, and 0 if it * is resuming in a new domain. 
*/ - *cancelled = HYPERVISOR_suspend(0UL); + si->cancelled = HYPERVISOR_suspend(0UL); - xen_hvm_post_suspend(*cancelled); + xen_hvm_post_suspend(si->cancelled); gnttab_resume(); - if (!*cancelled) { + if (!si->cancelled) { xen_irq_resume(); xen_console_resume(); xen_timer_resume(); @@ -72,8 +76,8 @@ static int xen_hvm_suspend(void *data) static int xen_suspend(void *data) { + struct suspend_info *si = data; int err; - int *cancelled = data; BUG_ON(!irqs_disabled()); @@ -93,13 +97,13 @@ static int xen_suspend(void *data) * or the domain was merely checkpointed, and 0 if it * is resuming in a new domain. */ - *cancelled = HYPERVISOR_suspend(virt_to_mfn(xen_start_info)); + si->cancelled = HYPERVISOR_suspend(virt_to_mfn(xen_start_info)); - xen_post_suspend(*cancelled); + xen_post_suspend(si->cancelled); gnttab_resume(); xen_mm_unpin_all(); - if (!*cancelled) { + if (!si->cancelled) { xen_irq_resume(); xen_console_resume(); xen_timer_resume(); @@ -113,7 +117,7 @@ static int xen_suspend(void *data) static void do_suspend(void) { int err; - int cancelled = 1; + struct suspend_info si; shutting_down = SHUTDOWN_SUSPEND; @@ -143,20 +147,22 @@ static void do_suspend(void) goto out_resume; } + si.cancelled = 1; + if (xen_hvm_domain()) - err = stop_machine(xen_hvm_suspend, &cancelled, cpumask_of(0)); + err = stop_machine(xen_hvm_suspend, &si, cpumask_of(0)); else - err = stop_machine(xen_suspend, &cancelled, cpumask_of(0)); + err = stop_machine(xen_suspend, &si, cpumask_of(0)); dpm_resume_noirq(PMSG_RESUME); if (err) { printk(KERN_ERR "failed to start xen_suspend: %d\n", err); - cancelled = 1; + si.cancelled = 1; } out_resume: - if (!cancelled) { + if (!si.cancelled) { xen_arch_resume(); xs_resume(); } else From 36b401e2c2788c7b4881115ddbbff603fe4cf78d Mon Sep 17 00:00:00 2001 From: Ian Campbell Date: Thu, 17 Feb 2011 11:04:20 +0000 Subject: [PATCH 481/706] xen: suspend: pass extra hypercall argument via suspend_info struct Signed-off-by: Ian Campbell Reviewed-by: Konrad Rzeszutek Wilk --- drivers/xen/manage.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/drivers/xen/manage.c b/drivers/xen/manage.c index 5c0184fb9d84..6ce6b91e7645 100644 --- a/drivers/xen/manage.c +++ b/drivers/xen/manage.c @@ -36,6 +36,7 @@ static enum shutdown_state shutting_down = SHUTDOWN_INVALID; struct suspend_info { int cancelled; + unsigned long arg; /* extra hypercall argument */ }; #ifdef CONFIG_PM_SLEEP @@ -58,7 +59,7 @@ static int xen_hvm_suspend(void *data) * or the domain was merely checkpointed, and 0 if it * is resuming in a new domain. */ - si->cancelled = HYPERVISOR_suspend(0UL); + si->cancelled = HYPERVISOR_suspend(si->arg); xen_hvm_post_suspend(si->cancelled); gnttab_resume(); @@ -97,7 +98,7 @@ static int xen_suspend(void *data) * or the domain was merely checkpointed, and 0 if it * is resuming in a new domain. */ - si->cancelled = HYPERVISOR_suspend(virt_to_mfn(xen_start_info)); + si->cancelled = HYPERVISOR_suspend(si->arg); xen_post_suspend(si->cancelled); gnttab_resume(); @@ -149,6 +150,11 @@ static void do_suspend(void) si.cancelled = 1; + if (xen_hvm_domain()) + si.arg = 0UL; + else + si.arg = virt_to_mfn(xen_start_info); + if (xen_hvm_domain()) err = stop_machine(xen_hvm_suspend, &si, cpumask_of(0)); else From 03c8142bd2fb3b87effa6ecb2f8957be588bc85f Mon Sep 17 00:00:00 2001 From: Ian Campbell Date: Thu, 17 Feb 2011 11:04:20 +0000 Subject: [PATCH 482/706] xen: suspend: add "arch" to pre/post suspend hooks xen_pre_device_suspend is unused on ia64. 
Signed-off-by: Ian Campbell Reviewed-by: Konrad Rzeszutek Wilk --- arch/ia64/xen/suspend.c | 11 +++-------- arch/x86/xen/suspend.c | 6 +++--- drivers/xen/manage.c | 6 +++--- include/xen/xen-ops.h | 6 +++--- 4 files changed, 12 insertions(+), 17 deletions(-) diff --git a/arch/ia64/xen/suspend.c b/arch/ia64/xen/suspend.c index fd66b048c6fa..419c8620945a 100644 --- a/arch/ia64/xen/suspend.c +++ b/arch/ia64/xen/suspend.c @@ -37,19 +37,14 @@ xen_mm_unpin_all(void) /* nothing */ } -void xen_pre_device_suspend(void) +void +xen_arch_pre_suspend() { /* nothing */ } void -xen_pre_suspend() -{ - /* nothing */ -} - -void -xen_post_suspend(int suspend_cancelled) +xen_arch_post_suspend(int suspend_cancelled) { if (suspend_cancelled) return; diff --git a/arch/x86/xen/suspend.c b/arch/x86/xen/suspend.c index 4a3d3dd3dd30..45329c8c226e 100644 --- a/arch/x86/xen/suspend.c +++ b/arch/x86/xen/suspend.c @@ -12,7 +12,7 @@ #include "xen-ops.h" #include "mmu.h" -void xen_pre_suspend(void) +void xen_arch_pre_suspend(void) { xen_start_info->store_mfn = mfn_to_pfn(xen_start_info->store_mfn); xen_start_info->console.domU.mfn = @@ -26,7 +26,7 @@ void xen_pre_suspend(void) BUG(); } -void xen_hvm_post_suspend(int suspend_cancelled) +void xen_arch_hvm_post_suspend(int suspend_cancelled) { #ifdef CONFIG_XEN_PVHVM int cpu; @@ -41,7 +41,7 @@ void xen_hvm_post_suspend(int suspend_cancelled) #endif } -void xen_post_suspend(int suspend_cancelled) +void xen_arch_post_suspend(int suspend_cancelled) { xen_build_mfn_list_list(); diff --git a/drivers/xen/manage.c b/drivers/xen/manage.c index 6ce6b91e7645..134eb73ca596 100644 --- a/drivers/xen/manage.c +++ b/drivers/xen/manage.c @@ -61,7 +61,7 @@ static int xen_hvm_suspend(void *data) */ si->cancelled = HYPERVISOR_suspend(si->arg); - xen_hvm_post_suspend(si->cancelled); + xen_arch_hvm_post_suspend(si->cancelled); gnttab_resume(); if (!si->cancelled) { @@ -91,7 +91,7 @@ static int xen_suspend(void *data) xen_mm_pin_all(); gnttab_suspend(); - xen_pre_suspend(); + xen_arch_pre_suspend(); /* * This hypercall returns 1 if suspend was cancelled @@ -100,7 +100,7 @@ static int xen_suspend(void *data) */ si->cancelled = HYPERVISOR_suspend(si->arg); - xen_post_suspend(si->cancelled); + xen_arch_post_suspend(si->cancelled); gnttab_resume(); xen_mm_unpin_all(); diff --git a/include/xen/xen-ops.h b/include/xen/xen-ops.h index 98b92154a264..03c85d7387fb 100644 --- a/include/xen/xen-ops.h +++ b/include/xen/xen-ops.h @@ -5,9 +5,9 @@ DECLARE_PER_CPU(struct vcpu_info *, xen_vcpu); -void xen_pre_suspend(void); -void xen_post_suspend(int suspend_cancelled); -void xen_hvm_post_suspend(int suspend_cancelled); +void xen_arch_pre_suspend(void); +void xen_arch_post_suspend(int suspend_cancelled); +void xen_arch_hvm_post_suspend(int suspend_cancelled); void xen_mm_pin_all(void); void xen_mm_unpin_all(void); From 82043bb60d24d2897074905c94be5a53071e8913 Mon Sep 17 00:00:00 2001 From: Ian Campbell Date: Thu, 17 Feb 2011 11:04:20 +0000 Subject: [PATCH 483/706] xen: suspend: refactor non-arch specific pre/post suspend hooks Signed-off-by: Ian Campbell Reviewed-by: Konrad Rzeszutek Wilk --- drivers/xen/manage.c | 25 ++++++++++++++++++++----- 1 file changed, 20 insertions(+), 5 deletions(-) diff --git a/drivers/xen/manage.c b/drivers/xen/manage.c index 134eb73ca596..33312c09829e 100644 --- a/drivers/xen/manage.c +++ b/drivers/xen/manage.c @@ -39,6 +39,23 @@ struct suspend_info { unsigned long arg; /* extra hypercall argument */ }; +static void xen_hvm_post_suspend(void) +{ + gnttab_resume(); +} + +static void 
xen_pre_suspend(void) +{ + xen_mm_pin_all(); + gnttab_suspend(); +} + +static void xen_post_suspend(void) +{ + gnttab_resume(); + xen_mm_unpin_all(); +} + #ifdef CONFIG_PM_SLEEP static int xen_hvm_suspend(void *data) { @@ -62,7 +79,7 @@ static int xen_hvm_suspend(void *data) si->cancelled = HYPERVISOR_suspend(si->arg); xen_arch_hvm_post_suspend(si->cancelled); - gnttab_resume(); + xen_hvm_post_suspend(); if (!si->cancelled) { xen_irq_resume(); @@ -89,8 +106,7 @@ static int xen_suspend(void *data) return err; } - xen_mm_pin_all(); - gnttab_suspend(); + xen_pre_suspend(); xen_arch_pre_suspend(); /* @@ -101,8 +117,7 @@ static int xen_suspend(void *data) si->cancelled = HYPERVISOR_suspend(si->arg); xen_arch_post_suspend(si->cancelled); - gnttab_resume(); - xen_mm_unpin_all(); + xen_post_suspend(); if (!si->cancelled) { xen_irq_resume(); From 07af38102fc4f260cc5a2418ec833707f53cdf70 Mon Sep 17 00:00:00 2001 From: Ian Campbell Date: Thu, 17 Feb 2011 11:04:20 +0000 Subject: [PATCH 484/706] xen: suspend: move arch specific pre/post suspend hooks into generic hooks Signed-off-by: Ian Campbell Reviewed-by: Konrad Rzeszutek Wilk --- drivers/xen/manage.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/drivers/xen/manage.c b/drivers/xen/manage.c index 33312c09829e..a6bd2e9ca106 100644 --- a/drivers/xen/manage.c +++ b/drivers/xen/manage.c @@ -39,8 +39,9 @@ struct suspend_info { unsigned long arg; /* extra hypercall argument */ }; -static void xen_hvm_post_suspend(void) +static void xen_hvm_post_suspend(int cancelled) { + xen_arch_hvm_post_suspend(cancelled); gnttab_resume(); } @@ -48,10 +49,12 @@ static void xen_pre_suspend(void) { xen_mm_pin_all(); gnttab_suspend(); + xen_arch_pre_suspend(); } -static void xen_post_suspend(void) +static void xen_post_suspend(int cancelled) { + xen_arch_post_suspend(cancelled); gnttab_resume(); xen_mm_unpin_all(); } @@ -78,8 +81,7 @@ static int xen_hvm_suspend(void *data) */ si->cancelled = HYPERVISOR_suspend(si->arg); - xen_arch_hvm_post_suspend(si->cancelled); - xen_hvm_post_suspend(); + xen_hvm_post_suspend(si->cancelled); if (!si->cancelled) { xen_irq_resume(); @@ -107,7 +109,6 @@ static int xen_suspend(void *data) } xen_pre_suspend(); - xen_arch_pre_suspend(); /* * This hypercall returns 1 if suspend was cancelled @@ -116,8 +117,7 @@ static int xen_suspend(void *data) */ si->cancelled = HYPERVISOR_suspend(si->arg); - xen_arch_post_suspend(si->cancelled); - xen_post_suspend(); + xen_post_suspend(si->cancelled); if (!si->cancelled) { xen_irq_resume(); From 55fb4acef7089a6d4d93ed8caae6c258d06cfaf7 Mon Sep 17 00:00:00 2001 From: Ian Campbell Date: Thu, 17 Feb 2011 11:04:20 +0000 Subject: [PATCH 485/706] xen: suspend: pull pre/post suspend hooks out into suspend_info Signed-off-by: Ian Campbell Reviewed-by: Konrad Rzeszutek Wilk --- drivers/xen/manage.c | 23 ++++++++++++++++++----- 1 file changed, 18 insertions(+), 5 deletions(-) diff --git a/drivers/xen/manage.c b/drivers/xen/manage.c index a6bd2e9ca106..5b7a0a9402e7 100644 --- a/drivers/xen/manage.c +++ b/drivers/xen/manage.c @@ -37,6 +37,8 @@ static enum shutdown_state shutting_down = SHUTDOWN_INVALID; struct suspend_info { int cancelled; unsigned long arg; /* extra hypercall argument */ + void (*pre)(void); + void (*post)(int cancelled); }; static void xen_hvm_post_suspend(int cancelled) @@ -74,6 +76,9 @@ static int xen_hvm_suspend(void *data) return err; } + if (si->pre) + si->pre(); + /* * This hypercall returns 1 if suspend was cancelled * or the domain was merely checkpointed, 
and 0 if it @@ -81,7 +86,8 @@ static int xen_hvm_suspend(void *data) */ si->cancelled = HYPERVISOR_suspend(si->arg); - xen_hvm_post_suspend(si->cancelled); + if (si->post) + si->post(si->cancelled); if (!si->cancelled) { xen_irq_resume(); @@ -108,7 +114,8 @@ static int xen_suspend(void *data) return err; } - xen_pre_suspend(); + if (si->pre) + si->pre(); /* * This hypercall returns 1 if suspend was cancelled @@ -117,7 +124,8 @@ static int xen_suspend(void *data) */ si->cancelled = HYPERVISOR_suspend(si->arg); - xen_post_suspend(si->cancelled); + if (si->post) + si->post(si->cancelled); if (!si->cancelled) { xen_irq_resume(); @@ -165,10 +173,15 @@ static void do_suspend(void) si.cancelled = 1; - if (xen_hvm_domain()) + if (xen_hvm_domain()) { si.arg = 0UL; - else + si.pre = NULL; + si.post = &xen_hvm_post_suspend; + } else { si.arg = virt_to_mfn(xen_start_info); + si.pre = &xen_pre_suspend; + si.post = &xen_post_suspend; + } if (xen_hvm_domain()) err = stop_machine(xen_hvm_suspend, &si, cpumask_of(0)); From b056b6a0144de90707cd22cf7b4f60bf69c86d59 Mon Sep 17 00:00:00 2001 From: Ian Campbell Date: Thu, 17 Feb 2011 11:04:20 +0000 Subject: [PATCH 486/706] xen: suspend: remove xen_hvm_suspend It is now identical to xen_suspend, the differences are encapsulated in the suspend_info struct. Signed-off-by: Ian Campbell Reviewed-by: Konrad Rzeszutek Wilk --- drivers/xen/manage.c | 43 +------------------------------------------ 1 file changed, 1 insertion(+), 42 deletions(-) diff --git a/drivers/xen/manage.c b/drivers/xen/manage.c index 5b7a0a9402e7..ebb292859b59 100644 --- a/drivers/xen/manage.c +++ b/drivers/xen/manage.c @@ -62,44 +62,6 @@ static void xen_post_suspend(int cancelled) } #ifdef CONFIG_PM_SLEEP -static int xen_hvm_suspend(void *data) -{ - struct suspend_info *si = data; - int err; - - BUG_ON(!irqs_disabled()); - - err = sysdev_suspend(PMSG_SUSPEND); - if (err) { - printk(KERN_ERR "xen_hvm_suspend: sysdev_suspend failed: %d\n", - err); - return err; - } - - if (si->pre) - si->pre(); - - /* - * This hypercall returns 1 if suspend was cancelled - * or the domain was merely checkpointed, and 0 if it - * is resuming in a new domain. - */ - si->cancelled = HYPERVISOR_suspend(si->arg); - - if (si->post) - si->post(si->cancelled); - - if (!si->cancelled) { - xen_irq_resume(); - xen_console_resume(); - xen_timer_resume(); - } - - sysdev_resume(); - - return 0; -} - static int xen_suspend(void *data) { struct suspend_info *si = data; @@ -183,10 +145,7 @@ static void do_suspend(void) si.post = &xen_post_suspend; } - if (xen_hvm_domain()) - err = stop_machine(xen_hvm_suspend, &si, cpumask_of(0)); - else - err = stop_machine(xen_suspend, &si, cpumask_of(0)); + err = stop_machine(xen_suspend, &si, cpumask_of(0)); dpm_resume_noirq(PMSG_RESUME); From b5faba21a6805c33b40e258d36f57997ee1de131 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 23 Feb 2011 23:52:13 +0000 Subject: [PATCH 487/706] genirq: Prepare the handling of shared oneshot interrupts For level type interrupts we need to track how many threads are on flight to avoid useless interrupt storms when not all thread handlers have finished yet. Keep track of the woken threads and only unmask when there are no more threads in flight. Yes, I'm lazy and using a bitfield. But not only because I'm lazy, the main reason is that it's way simpler than using a refcount. A refcount based solution would need to keep track of various things like crashing the irq thread, spurious interrupts coming in, disables/enables, free_irq() and some more. 
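As a worked example (bit values illustrative): three threaded handlers sharing
one line get thread_mask values 0x1, 0x2 and 0x4 from ffz(); the hard irq
handler ORs the mask of every thread it wakes into desc->threads_oneshot, each
thread clears its own bit when it finishes, and only the last one to finish
unmasks the line. Condensed from the hunks below (disable/mask state checks
omitted):

	desc->threads_oneshot |= action->thread_mask;	/* hard irq: in flight */
	wake_up_process(action->thread);
	...
	desc->threads_oneshot &= ~action->thread_mask;	/* irq thread: done */
	if (!desc->threads_oneshot)
		desc->irq_data.chip->irq_unmask(&desc->irq_data);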
The bitfield keeps the tracking simple and makes things just work. It's also nicely confined to the thread code pathes and does not require additional checks all over the place. Signed-off-by: Thomas Gleixner Cc: Peter Zijlstra LKML-Reference: <20110223234956.388095876@linutronix.de> --- include/linux/interrupt.h | 2 ++ include/linux/irqdesc.h | 2 ++ kernel/irq/handle.c | 76 ++++++++++++++++++++++++++++++++------- kernel/irq/manage.c | 54 ++++++++++++++++++++++++---- 4 files changed, 115 insertions(+), 19 deletions(-) diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h index 8da6643e39a6..e116fef274cd 100644 --- a/include/linux/interrupt.h +++ b/include/linux/interrupt.h @@ -99,6 +99,7 @@ typedef irqreturn_t (*irq_handler_t)(int, void *); * @thread_fn: interupt handler function for threaded interrupts * @thread: thread pointer for threaded interrupts * @thread_flags: flags related to @thread + * @thread_mask: bitmask for keeping track of @thread activity */ struct irqaction { irq_handler_t handler; @@ -109,6 +110,7 @@ struct irqaction { irq_handler_t thread_fn; struct task_struct *thread; unsigned long thread_flags; + unsigned long thread_mask; const char *name; struct proc_dir_entry *dir; } ____cacheline_internodealigned_in_smp; diff --git a/include/linux/irqdesc.h b/include/linux/irqdesc.h index 2f87d6441302..9eb9cd313052 100644 --- a/include/linux/irqdesc.h +++ b/include/linux/irqdesc.h @@ -28,6 +28,7 @@ struct timer_rand_state; * @lock: locking for SMP * @affinity_notify: context for notification of affinity changes * @pending_mask: pending rebalanced interrupts + * @threads_oneshot: bitfield to handle shared oneshot threads * @threads_active: number of irqaction threads currently running * @wait_for_threads: wait queue for sync_irq to wait for threaded handlers * @dir: /proc/irq/ procfs entry @@ -86,6 +87,7 @@ struct irq_desc { cpumask_var_t pending_mask; #endif #endif + unsigned long threads_oneshot; atomic_t threads_active; wait_queue_head_t wait_for_threads; #ifdef CONFIG_PROC_FS diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c index b110c835e070..517561fc7317 100644 --- a/kernel/irq/handle.c +++ b/kernel/irq/handle.c @@ -51,6 +51,68 @@ static void warn_no_thread(unsigned int irq, struct irqaction *action) "but no thread function available.", irq, action->name); } +static void irq_wake_thread(struct irq_desc *desc, struct irqaction *action) +{ + /* + * Wake up the handler thread for this action. In case the + * thread crashed and was killed we just pretend that we + * handled the interrupt. The hardirq handler has disabled the + * device interrupt, so no irq storm is lurking. If the + * RUNTHREAD bit is already set, nothing to do. + */ + if (test_bit(IRQTF_DIED, &action->thread_flags) || + test_and_set_bit(IRQTF_RUNTHREAD, &action->thread_flags)) + return; + + /* + * It's safe to OR the mask lockless here. We have only two + * places which write to threads_oneshot: This code and the + * irq thread. + * + * This code is the hard irq context and can never run on two + * cpus in parallel. If it ever does we have more serious + * problems than this bitmask. + * + * The irq threads of this irq which clear their "running" bit + * in threads_oneshot are serialized via desc->lock against + * each other and they are serialized against this code by + * IRQS_INPROGRESS. 
+ * + * Hard irq handler: + * + * spin_lock(desc->lock); + * desc->state |= IRQS_INPROGRESS; + * spin_unlock(desc->lock); + * set_bit(IRQTF_RUNTHREAD, &action->thread_flags); + * desc->threads_oneshot |= mask; + * spin_lock(desc->lock); + * desc->state &= ~IRQS_INPROGRESS; + * spin_unlock(desc->lock); + * + * irq thread: + * + * again: + * spin_lock(desc->lock); + * if (desc->state & IRQS_INPROGRESS) { + * spin_unlock(desc->lock); + * while(desc->state & IRQS_INPROGRESS) + * cpu_relax(); + * goto again; + * } + * if (!test_bit(IRQTF_RUNTHREAD, &action->thread_flags)) + * desc->threads_oneshot &= ~mask; + * spin_unlock(desc->lock); + * + * So either the thread waits for us to clear IRQS_INPROGRESS + * or we are waiting in the flow handler for desc->lock to be + * released before we reach this point. The thread also checks + * IRQTF_RUNTHREAD under desc->lock. If set it leaves + * threads_oneshot untouched and runs the thread another time. + */ + desc->threads_oneshot |= action->thread_mask; + wake_up_process(action->thread); +} + irqreturn_t handle_irq_event_percpu(struct irq_desc *desc, struct irqaction *action) { @@ -85,19 +147,7 @@ handle_irq_event_percpu(struct irq_desc *desc, struct irqaction *action) break; } - /* - * Wake up the handler thread for this - * action. In case the thread crashed and was - * killed we just pretend that we handled the - * interrupt. The hardirq handler above has - * disabled the device interrupt, so no irq - * storm is lurking. - */ - if (likely(!test_bit(IRQTF_DIED, - &action->thread_flags))) { - set_bit(IRQTF_RUNTHREAD, &action->thread_flags); - wake_up_process(action->thread); - } + irq_wake_thread(desc, action); /* Fall through to add to randomness */ case IRQ_HANDLED: diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 01f8a9519e63..2301de19ac7d 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -617,8 +617,11 @@ static int irq_wait_for_interrupt(struct irqaction *action) * handler finished. unmask if the interrupt has not been disabled and * is marked MASKED. */ -static void irq_finalize_oneshot(unsigned int irq, struct irq_desc *desc) +static void irq_finalize_oneshot(struct irq_desc *desc, + struct irqaction *action, bool force) { + if (!(desc->istate & IRQS_ONESHOT)) + return; again: chip_bus_lock(desc); raw_spin_lock_irq(&desc->lock); @@ -631,6 +634,11 @@ again: * on the other CPU. If we unmask the irq line then the * interrupt can come in again and masks the line, leaves due * to IRQS_INPROGRESS and the irq line is masked forever. + * + * This also serializes the state of shared oneshot handlers + * versus "desc->threads_onehsot |= action->thread_mask;" in + * irq_wake_thread(). See the comment there which explains the + * serialization. */ if (unlikely(desc->istate & IRQS_INPROGRESS)) { raw_spin_unlock_irq(&desc->lock); @@ -639,11 +647,23 @@ again: goto again; } - if (!(desc->istate & IRQS_DISABLED) && (desc->istate & IRQS_MASKED)) { + /* + * Now check again, whether the thread should run. Otherwise + * we would clear the threads_oneshot bit of this thread which + * was just set. 
+ */ + if (!force && test_bit(IRQTF_RUNTHREAD, &action->thread_flags)) + goto out_unlock; + + desc->threads_oneshot &= ~action->thread_mask; + + if (!desc->threads_oneshot && !(desc->istate & IRQS_DISABLED) && + (desc->istate & IRQS_MASKED)) { irq_compat_clr_masked(desc); desc->istate &= ~IRQS_MASKED; desc->irq_data.chip->irq_unmask(&desc->irq_data); } +out_unlock: raw_spin_unlock_irq(&desc->lock); chip_bus_sync_unlock(desc); } @@ -691,7 +711,7 @@ static int irq_thread(void *data) }; struct irqaction *action = data; struct irq_desc *desc = irq_to_desc(action->irq); - int wake, oneshot = desc->istate & IRQS_ONESHOT; + int wake; sched_setscheduler(current, SCHED_FIFO, ¶m); current->irqaction = action; @@ -719,8 +739,7 @@ static int irq_thread(void *data) action->thread_fn(action->irq, action->dev_id); - if (oneshot) - irq_finalize_oneshot(action->irq, desc); + irq_finalize_oneshot(desc, action, false); } wake = atomic_dec_and_test(&desc->threads_active); @@ -729,6 +748,9 @@ static int irq_thread(void *data) wake_up(&desc->wait_for_threads); } + /* Prevent a stale desc->threads_oneshot */ + irq_finalize_oneshot(desc, action, true); + /* * Clear irqaction. Otherwise exit_irq_thread() would make * fuzz about an active irq thread going into nirvana. @@ -743,6 +765,7 @@ static int irq_thread(void *data) void exit_irq_thread(void) { struct task_struct *tsk = current; + struct irq_desc *desc; if (!tsk->irqaction) return; @@ -751,6 +774,14 @@ void exit_irq_thread(void) "exiting task \"%s\" (%d) is an active IRQ thread (irq %d)\n", tsk->comm ? tsk->comm : "", tsk->pid, tsk->irqaction->irq); + desc = irq_to_desc(tsk->irqaction->irq); + + /* + * Prevent a stale desc->threads_oneshot. Must be called + * before setting the IRQTF_DIED flag. + */ + irq_finalize_oneshot(desc, tsk->irqaction, true); + /* * Set the THREAD DIED flag to prevent further wakeups of the * soon to be gone threaded handler. @@ -767,7 +798,7 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) { struct irqaction *old, **old_ptr; const char *old_name = NULL; - unsigned long flags; + unsigned long flags, thread_mask = 0; int ret, nested, shared = 0; cpumask_var_t mask; @@ -865,12 +896,23 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) /* add new interrupt at end of irq queue */ do { + thread_mask |= old->thread_mask; old_ptr = &old->next; old = *old_ptr; } while (old); shared = 1; } + /* + * Setup the thread mask for this irqaction. Unlikely to have + * 32 resp 64 irqs sharing one line, but who knows. + */ + if (new->flags & IRQF_ONESHOT && thread_mask == ~0UL) { + ret = -EBUSY; + goto out_mask; + } + new->thread_mask = 1 << ffz(thread_mask); + if (!shared) { irq_chip_set_defaults(desc->irq_data.chip); From 9d591edd02a245305b1b9379e4c5571bad4d2774 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 23 Feb 2011 23:52:16 +0000 Subject: [PATCH 488/706] genirq: Allow shared oneshot interrupts Support ONESHOT on shared interrupts, if all drivers agree on it. 
Signed-off-by: Thomas Gleixner Cc: Peter Zijlstra LKML-Reference: <20110223234956.483640430@linutronix.de> --- kernel/irq/manage.c | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 2301de19ac7d..58c861367300 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -824,10 +824,6 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) rand_initialize_irq(irq); } - /* Oneshot interrupts are not allowed with shared */ - if ((new->flags & IRQF_ONESHOT) && (new->flags & IRQF_SHARED)) - return -EINVAL; - /* * Check whether the interrupt nests into another interrupt * thread. @@ -881,10 +877,12 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) * Can't share interrupts unless both agree to and are * the same type (level, edge, polarity). So both flag * fields must have IRQF_SHARED set and the bits which - * set the trigger type must match. + * set the trigger type must match. Also all must + * agree on ONESHOT. */ if (!((old->flags & new->flags) & IRQF_SHARED) || - ((old->flags ^ new->flags) & IRQF_TRIGGER_MASK)) { + ((old->flags ^ new->flags) & IRQF_TRIGGER_MASK) || + ((old->flags ^ new->flags) & IRQF_ONESHOT)) { old_name = old->name; goto mismatch; } From 0c4602ff88d6d6ef0ee6d228ee9acaa6448ff6f5 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 23 Feb 2011 23:52:18 +0000 Subject: [PATCH 489/706] genirq: Add IRQF_NO_THREAD Some low level interrupts cannot be threaded even when we force thread all interrupt handlers. Add a flag to annotate such interrupts. Add all timer interrupts to this category by default. Signed-off-by: Thomas Gleixner Cc: Peter Zijlstra LKML-Reference: <20110223234956.578893460@linutronix.de> --- include/linux/interrupt.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h index e116fef274cd..0fc3eb9397b4 100644 --- a/include/linux/interrupt.h +++ b/include/linux/interrupt.h @@ -58,6 +58,7 @@ * irq line disabled until the threaded handler has been run. * IRQF_NO_SUSPEND - Do not disable this IRQ during suspend * IRQF_FORCE_RESUME - Force enable it on resume even if IRQF_NO_SUSPEND is set + * IRQF_NO_THREAD - Interrupt cannot be threaded */ #define IRQF_DISABLED 0x00000020 #define IRQF_SAMPLE_RANDOM 0x00000040 @@ -70,8 +71,9 @@ #define IRQF_ONESHOT 0x00002000 #define IRQF_NO_SUSPEND 0x00004000 #define IRQF_FORCE_RESUME 0x00008000 +#define IRQF_NO_THREAD 0x00010000 -#define IRQF_TIMER (__IRQF_TIMER | IRQF_NO_SUSPEND) +#define IRQF_TIMER (__IRQF_TIMER | IRQF_NO_SUSPEND | IRQF_NO_THREAD) /* * These values can be returned by request_any_context_irq() and From 8eb90c30e0e815a1308828352eabd03ca04229dd Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 23 Feb 2011 23:52:21 +0000 Subject: [PATCH 490/706] sched: Switch wait_task_inactive to schedule_hrtimeout() When we force thread hard and soft interrupts the startup of ksoftirqd would hang in kthread_bind() when wait_task_inactive() calls schedule_timeout_uninterruptible() because there is no softirq yet which will wake us up. 
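schedule_hrtimeout() sleeps on an hrtimer and therefore does not depend on the
timer softirq that schedule_timeout_uninterruptible() needs, which is why the
ksoftirqd startup no longer deadlocks. The replacement, condensed from the
hunk below:

	ktime_t to = ktime_set(0, NSEC_PER_SEC/HZ);	/* roughly one tick */

	set_current_state(TASK_UNINTERRUPTIBLE);
	schedule_hrtimeout(&to, HRTIMER_MODE_REL);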
Signed-off-by: Thomas Gleixner Cc: Peter Zijlstra LKML-Reference: <20110223234956.677109139@linutronix.de> --- kernel/sched.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/kernel/sched.c b/kernel/sched.c index 18d38e4ec7ba..66ca5d9ba83c 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -2224,7 +2224,10 @@ unsigned long wait_task_inactive(struct task_struct *p, long match_state) * yield - it could be a while. */ if (unlikely(on_rq)) { - schedule_timeout_uninterruptible(1); + ktime_t to = ktime_set(0, NSEC_PER_SEC/HZ); + + set_current_state(TASK_UNINTERRUPTIBLE); + schedule_hrtimeout(&to, HRTIMER_MODE_REL); continue; } From 7bf04be8f48ceeeffa5b5a79734d6d6e0d59e5f8 Mon Sep 17 00:00:00 2001 From: Stratos Psomadakis Date: Fri, 25 Feb 2011 22:46:13 +0200 Subject: [PATCH 491/706] x86, asm: Cleanup unnecssary macros in asm-offsets.c PAGE_SIZE_asm, PAGE_SHIFT_asm, THREAD_SIZE_asm can be safely removed from asm-offsets.c, and be replaced by their non-'_asm' counterparts in the code that uses them, since the _AC macro defined in include/linux/const.h makes PAGE_SIZE/PAGE_SHIFT/THREAD_SIZE work with as. Signed-off-by: Stratos Psomadakis LKML-Reference: <1298666774-17646-2-git-send-email-psomas@cslab.ece.ntua.gr> Signed-off-by: H. Peter Anvin --- arch/x86/kernel/asm-offsets.c | 5 ----- arch/x86/kernel/entry_32.S | 2 +- arch/x86/kernel/head_32.S | 8 ++++---- arch/x86/xen/xen-head.S | 4 ++-- 4 files changed, 7 insertions(+), 12 deletions(-) diff --git a/arch/x86/kernel/asm-offsets.c b/arch/x86/kernel/asm-offsets.c index 2b141c1915a5..4f13fafc5264 100644 --- a/arch/x86/kernel/asm-offsets.c +++ b/arch/x86/kernel/asm-offsets.c @@ -28,11 +28,6 @@ #endif void common(void) { - BLANK(); - DEFINE(PAGE_SIZE_asm, PAGE_SIZE); - DEFINE(PAGE_SHIFT_asm, PAGE_SHIFT); - DEFINE(THREAD_SIZE_asm, THREAD_SIZE); - BLANK(); OFFSET(TI_flags, thread_info, flags); OFFSET(TI_status, thread_info, status); diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S index c8b4efad7ebb..49bdedd2dbcf 100644 --- a/arch/x86/kernel/entry_32.S +++ b/arch/x86/kernel/entry_32.S @@ -395,7 +395,7 @@ sysenter_past_esp: * A tiny bit of offset fixup is necessary - 4*4 means the 4 words * pushed above; +8 corresponds to copy_thread's esp0 setting. */ - pushl_cfi ((TI_sysenter_return)-THREAD_SIZE_asm+8+4*4)(%esp) + pushl_cfi ((TI_sysenter_return)-THREAD_SIZE+8+4*4)(%esp) CFI_REL_OFFSET eip, 0 pushl_cfi %eax diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S index 767d6c43de37..187aa63b321f 100644 --- a/arch/x86/kernel/head_32.S +++ b/arch/x86/kernel/head_32.S @@ -73,7 +73,7 @@ MAPPING_BEYOND_END = PAGE_TABLE_SIZE(LOWMEM_PAGES) << PAGE_SHIFT */ KERNEL_PAGES = LOWMEM_PAGES -INIT_MAP_SIZE = PAGE_TABLE_SIZE(KERNEL_PAGES) * PAGE_SIZE_asm +INIT_MAP_SIZE = PAGE_TABLE_SIZE(KERNEL_PAGES) * PAGE_SIZE RESERVE_BRK(pagetables, INIT_MAP_SIZE) /* @@ -623,7 +623,7 @@ ENTRY(initial_code) * BSS section */ __PAGE_ALIGNED_BSS - .align PAGE_SIZE_asm + .align PAGE_SIZE #ifdef CONFIG_X86_PAE initial_pg_pmd: .fill 1024*KPMDS,4,0 @@ -644,7 +644,7 @@ ENTRY(swapper_pg_dir) #ifdef CONFIG_X86_PAE __PAGE_ALIGNED_DATA /* Page-aligned for the benefit of paravirt? 
*/ - .align PAGE_SIZE_asm + .align PAGE_SIZE ENTRY(initial_page_table) .long pa(initial_pg_pmd+PGD_IDENT_ATTR),0 /* low identity map */ # if KPMDS == 3 @@ -662,7 +662,7 @@ ENTRY(initial_page_table) # else # error "Kernel PMDs should be 1, 2 or 3" # endif - .align PAGE_SIZE_asm /* needs to be page-sized too */ + .align PAGE_SIZE /* needs to be page-sized too */ #endif .data diff --git a/arch/x86/xen/xen-head.S b/arch/x86/xen/xen-head.S index 1a5ff24e29c0..aaa7291c9259 100644 --- a/arch/x86/xen/xen-head.S +++ b/arch/x86/xen/xen-head.S @@ -28,9 +28,9 @@ ENTRY(startup_xen) __FINIT .pushsection .text - .align PAGE_SIZE_asm + .align PAGE_SIZE ENTRY(hypercall_page) - .skip PAGE_SIZE_asm + .skip PAGE_SIZE .popsection ELFNOTE(Xen, XEN_ELFNOTE_GUEST_OS, .asciz "linux") From 544b4a1f309d18f40969dbab7e08bafd136b2f55 Mon Sep 17 00:00:00 2001 From: Venkatesh Pallipadi Date: Fri, 25 Feb 2011 15:13:16 -0800 Subject: [PATCH 492/706] sched: Clean up the IRQ_TIME_ACCOUNTING code Fix this warning: lkml.org/lkml/2011/1/30/124 kernel/sched.c:3719: warning: 'irqtime_account_idle_ticks' defined but not used kernel/sched.c:3720: warning: 'irqtime_account_process_tick' defined but not used In a cleaner way than: 7e9498705e81: sched: Add #ifdef around irq time accounting functions This patch will not have any functional impact. Signed-off-by: Venkatesh Pallipadi Cc: heiko.carstens@de.ibm.com Cc: a.p.zijlstra@chello.nl LKML-Reference: <1298675596-10992-1-git-send-email-venki@google.com> Signed-off-by: Ingo Molnar --- kernel/sched.c | 64 ++++++++++++++++++++++++-------------------------- 1 file changed, 31 insertions(+), 33 deletions(-) diff --git a/kernel/sched.c b/kernel/sched.c index 79bca166bed7..0c8712630f05 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -3687,7 +3687,36 @@ void account_system_time(struct task_struct *p, int hardirq_offset, __account_system_time(p, cputime, cputime_scaled, target_cputime64); } +/* + * Account for involuntary wait time. + * @cputime: the cpu time spent in involuntary wait + */ +void account_steal_time(cputime_t cputime) +{ + struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; + cputime64_t cputime64 = cputime_to_cputime64(cputime); + + cpustat->steal = cputime64_add(cpustat->steal, cputime64); +} + +/* + * Account for idle time. + * @cputime: the cpu time spent in idle wait + */ +void account_idle_time(cputime_t cputime) +{ + struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; + cputime64_t cputime64 = cputime_to_cputime64(cputime); + struct rq *rq = this_rq(); + + if (atomic_read(&rq->nr_iowait) > 0) + cpustat->iowait = cputime64_add(cpustat->iowait, cputime64); + else + cpustat->idle = cputime64_add(cpustat->idle, cputime64); +} + #ifndef CONFIG_VIRT_CPU_ACCOUNTING + #ifdef CONFIG_IRQ_TIME_ACCOUNTING /* * Account a tick to a process and cpustat @@ -3749,42 +3778,11 @@ static void irqtime_account_idle_ticks(int ticks) for (i = 0; i < ticks; i++) irqtime_account_process_tick(current, 0, rq); } -#else +#else /* CONFIG_IRQ_TIME_ACCOUNTING */ static void irqtime_account_idle_ticks(int ticks) {} static void irqtime_account_process_tick(struct task_struct *p, int user_tick, struct rq *rq) {} -#endif -#endif /* !CONFIG_VIRT_CPU_ACCOUNTING */ - -/* - * Account for involuntary wait time. 
- * @steal: the cpu time spent in involuntary wait - */ -void account_steal_time(cputime_t cputime) -{ - struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; - cputime64_t cputime64 = cputime_to_cputime64(cputime); - - cpustat->steal = cputime64_add(cpustat->steal, cputime64); -} - -/* - * Account for idle time. - * @cputime: the cpu time spent in idle wait - */ -void account_idle_time(cputime_t cputime) -{ - struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; - cputime64_t cputime64 = cputime_to_cputime64(cputime); - struct rq *rq = this_rq(); - - if (atomic_read(&rq->nr_iowait) > 0) - cpustat->iowait = cputime64_add(cpustat->iowait, cputime64); - else - cpustat->idle = cputime64_add(cpustat->idle, cputime64); -} - -#ifndef CONFIG_VIRT_CPU_ACCOUNTING +#endif /* CONFIG_IRQ_TIME_ACCOUNTING */ /* * Account a single tick of cpu time. From 8d32a307e4faa8b123dc8a9cd56d1a7525f69ad3 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 23 Feb 2011 23:52:23 +0000 Subject: [PATCH 493/706] genirq: Provide forced interrupt threading Add a commandline parameter "threadirqs" which forces all interrupts except those marked IRQF_NO_THREAD to run threaded. That's mostly a debug option to allow retrieving better debug data from crashing interrupt handlers. If "threadirqs" is not enabled on the kernel command line, then there is no impact in the interrupt hotpath. Architecture code needs to select CONFIG_IRQ_FORCED_THREADING after marking the interrupts which cant be threaded IRQF_NO_THREAD. All interrupts which have IRQF_TIMER set are implict marked IRQF_NO_THREAD. Also all PER_CPU interrupts are excluded. Forced threading hard interrupts also forces all soft interrupt handling into thread context. When enabled it might slow down things a bit, but for debugging problems in interrupt code it's a reasonable penalty as it does not immediately crash and burn the machine when an interrupt handler is buggy. Some test results on a Core2Duo machine: Cache cold run of: # time git grep irq_desc non-threaded threaded real 1m18.741s 1m19.061s user 0m1.874s 0m1.757s sys 0m5.843s 0m5.427s # iperf -c server non-threaded [ 3] 0.0-10.0 sec 1.09 GBytes 933 Mbits/sec [ 3] 0.0-10.0 sec 1.09 GBytes 934 Mbits/sec [ 3] 0.0-10.0 sec 1.09 GBytes 933 Mbits/sec threaded [ 3] 0.0-10.0 sec 1.09 GBytes 939 Mbits/sec [ 3] 0.0-10.0 sec 1.09 GBytes 934 Mbits/sec [ 3] 0.0-10.0 sec 1.09 GBytes 937 Mbits/sec Signed-off-by: Thomas Gleixner Cc: Peter Zijlstra LKML-Reference: <20110223234956.772668648@linutronix.de> --- Documentation/kernel-parameters.txt | 4 ++ include/linux/interrupt.h | 7 +++ kernel/irq/Kconfig | 3 ++ kernel/irq/internals.h | 2 + kernel/irq/manage.c | 67 +++++++++++++++++++++++++++-- kernel/softirq.c | 16 ++++++- 6 files changed, 93 insertions(+), 6 deletions(-) diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt index 89835a4766a6..cac6cf9a588c 100644 --- a/Documentation/kernel-parameters.txt +++ b/Documentation/kernel-parameters.txt @@ -2436,6 +2436,10 @@ and is between 256 and 4096 characters. It is defined in the file : poll all this frequency 0: no polling (default) + threadirqs [KNL] + Force threading of all interrupt handlers except those + marked explicitely IRQF_NO_THREAD. 
+ topology= [S390] Format: {off | on} Specify if the kernel should make use of the cpu diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h index 0fc3eb9397b4..f8a8af108e0c 100644 --- a/include/linux/interrupt.h +++ b/include/linux/interrupt.h @@ -383,6 +383,13 @@ static inline int disable_irq_wake(unsigned int irq) } #endif /* CONFIG_GENERIC_HARDIRQS */ + +#ifdef CONFIG_IRQ_FORCED_THREADING +extern bool force_irqthreads; +#else +#define force_irqthreads (0) +#endif + #ifndef __ARCH_SET_SOFTIRQ_PENDING #define set_softirq_pending(x) (local_softirq_pending() = (x)) #define or_softirq_pending(x) (local_softirq_pending() |= (x)) diff --git a/kernel/irq/Kconfig b/kernel/irq/Kconfig index 355b8c7957f5..144db9dcfcde 100644 --- a/kernel/irq/Kconfig +++ b/kernel/irq/Kconfig @@ -38,6 +38,9 @@ config HARDIRQS_SW_RESEND config IRQ_PREFLOW_FASTEOI bool +config IRQ_FORCED_THREADING + bool + config SPARSE_IRQ bool "Support sparse irq numbering" depends on HAVE_SPARSE_IRQ diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h index 935bec4bfa87..6c6ec9a49027 100644 --- a/kernel/irq/internals.h +++ b/kernel/irq/internals.h @@ -27,12 +27,14 @@ extern int noirqdebug; * IRQTF_DIED - handler thread died * IRQTF_WARNED - warning "IRQ_WAKE_THREAD w/o thread_fn" has been printed * IRQTF_AFFINITY - irq thread is requested to adjust affinity + * IRQTF_FORCED_THREAD - irq action is force threaded */ enum { IRQTF_RUNTHREAD, IRQTF_DIED, IRQTF_WARNED, IRQTF_AFFINITY, + IRQTF_FORCED_THREAD, }; /* diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 58c861367300..acd599a43bfb 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -17,6 +17,17 @@ #include "internals.h" +#ifdef CONFIG_IRQ_FORCED_THREADING +__read_mostly bool force_irqthreads; + +static int __init setup_forced_irqthreads(char *arg) +{ + force_irqthreads = true; + return 0; +} +early_param("threadirqs", setup_forced_irqthreads); +#endif + /** * synchronize_irq - wait for pending IRQ handlers (on other CPUs) * @irq: interrupt number to wait for @@ -701,6 +712,32 @@ static inline void irq_thread_check_affinity(struct irq_desc *desc, struct irqaction *action) { } #endif +/* + * Interrupts which are not explicitely requested as threaded + * interrupts rely on the implicit bh/preempt disable of the hard irq + * context. So we need to disable bh here to avoid deadlocks and other + * side effects. + */ +static void +irq_forced_thread_fn(struct irq_desc *desc, struct irqaction *action) +{ + local_bh_disable(); + action->thread_fn(action->irq, action->dev_id); + irq_finalize_oneshot(desc, action, false); + local_bh_enable(); +} + +/* + * Interrupts explicitely requested as threaded interupts want to be + * preemtible - many of them need to sleep and wait for slow busses to + * complete. 
+ */ +static void irq_thread_fn(struct irq_desc *desc, struct irqaction *action) +{ + action->thread_fn(action->irq, action->dev_id); + irq_finalize_oneshot(desc, action, false); +} + /* * Interrupt handler thread */ @@ -711,8 +748,15 @@ static int irq_thread(void *data) }; struct irqaction *action = data; struct irq_desc *desc = irq_to_desc(action->irq); + void (*handler_fn)(struct irq_desc *desc, struct irqaction *action); int wake; + if (force_irqthreads & test_bit(IRQTF_FORCED_THREAD, + &action->thread_flags)) + handler_fn = irq_forced_thread_fn; + else + handler_fn = irq_thread_fn; + sched_setscheduler(current, SCHED_FIFO, ¶m); current->irqaction = action; @@ -736,10 +780,7 @@ static int irq_thread(void *data) raw_spin_unlock_irq(&desc->lock); } else { raw_spin_unlock_irq(&desc->lock); - - action->thread_fn(action->irq, action->dev_id); - - irq_finalize_oneshot(desc, action, false); + handler_fn(desc, action); } wake = atomic_dec_and_test(&desc->threads_active); @@ -789,6 +830,22 @@ void exit_irq_thread(void) set_bit(IRQTF_DIED, &tsk->irqaction->flags); } +static void irq_setup_forced_threading(struct irqaction *new) +{ + if (!force_irqthreads) + return; + if (new->flags & (IRQF_NO_THREAD | IRQF_PERCPU | IRQF_ONESHOT)) + return; + + new->flags |= IRQF_ONESHOT; + + if (!new->thread_fn) { + set_bit(IRQTF_FORCED_THREAD, &new->thread_flags); + new->thread_fn = new->handler; + new->handler = irq_default_primary_handler; + } +} + /* * Internal function to register an irqaction - typically used to * allocate special interrupts that are part of the architecture. @@ -838,6 +895,8 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) * dummy function which warns when called. */ new->handler = irq_nested_primary_handler; + } else { + irq_setup_forced_threading(new); } /* diff --git a/kernel/softirq.c b/kernel/softirq.c index c0490464e92f..a33fb2911248 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -311,9 +311,21 @@ void irq_enter(void) } #ifdef __ARCH_IRQ_EXIT_IRQS_DISABLED -# define invoke_softirq() __do_softirq() +static inline void invoke_softirq(void) +{ + if (!force_irqthreads) + __do_softirq(); + else + wakeup_softirqd(); +} #else -# define invoke_softirq() do_softirq() +static inline void invoke_softirq(void) +{ + if (!force_irqthreads) + do_softirq(); + else + wakeup_softirqd(); +} #endif /* From cc28989437de5617875a2943697fe6ba51a0da8f Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Sat, 26 Feb 2011 13:05:43 +0100 Subject: [PATCH 494/706] mm: Move early_node_map[] reverse scan helpers under HAVE_MEMBLOCK Heiko found recent memblock change triggers these warnings on s390: mm/page_alloc.c:3623:22: warning: 'last_active_region_index_in_nid' defined but not used mm/page_alloc.c:3638:22: warning: 'previous_active_region_index_in_nid' defined but not used Need to move those two function under HAVE_MEMBLOCK with its only user, find_memory_core_early(). -tj: Minor updates to description. Reported-by: Heiko Carstens Signed-off-by: Yinghai Lu Signed-off-by: Tejun Heo --- mm/page_alloc.c | 64 ++++++++++++++++++++++++------------------------- 1 file changed, 32 insertions(+), 32 deletions(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 603513609ba5..da0fe32059b3 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -3616,34 +3616,6 @@ static int __meminit next_active_region_index_in_nid(int index, int nid) return -1; } -/* - * Basic iterator support. 
Return the last range of PFNs for a node - * Note: nid == MAX_NUMNODES returns last region regardless of node - */ -static int __meminit last_active_region_index_in_nid(int nid) -{ - int i; - - for (i = nr_nodemap_entries - 1; i >= 0; i--) - if (nid == MAX_NUMNODES || early_node_map[i].nid == nid) - return i; - - return -1; -} - -/* - * Basic iterator support. Return the previous active range of PFNs for a node - * Note: nid == MAX_NUMNODES returns next region regardless of node - */ -static int __meminit previous_active_region_index_in_nid(int index, int nid) -{ - for (index = index - 1; index >= 0; index--) - if (nid == MAX_NUMNODES || early_node_map[index].nid == nid) - return index; - - return -1; -} - #ifndef CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID /* * Required by SPARSEMEM. Given a PFN, return what node the PFN is on. @@ -3695,10 +3667,6 @@ bool __meminit early_pfn_in_nid(unsigned long pfn, int node) for (i = first_active_region_index_in_nid(nid); i != -1; \ i = next_active_region_index_in_nid(i, nid)) -#define for_each_active_range_index_in_nid_reverse(i, nid) \ - for (i = last_active_region_index_in_nid(nid); i != -1; \ - i = previous_active_region_index_in_nid(i, nid)) - /** * free_bootmem_with_active_regions - Call free_bootmem_node for each active range * @nid: The node to free memory on. If MAX_NUMNODES, all nodes are freed. @@ -3731,6 +3699,38 @@ void __init free_bootmem_with_active_regions(int nid, } #ifdef CONFIG_HAVE_MEMBLOCK +/* + * Basic iterator support. Return the last range of PFNs for a node + * Note: nid == MAX_NUMNODES returns last region regardless of node + */ +static int __meminit last_active_region_index_in_nid(int nid) +{ + int i; + + for (i = nr_nodemap_entries - 1; i >= 0; i--) + if (nid == MAX_NUMNODES || early_node_map[i].nid == nid) + return i; + + return -1; +} + +/* + * Basic iterator support. Return the previous active range of PFNs for a node + * Note: nid == MAX_NUMNODES returns next region regardless of node + */ +static int __meminit previous_active_region_index_in_nid(int index, int nid) +{ + for (index = index - 1; index >= 0; index--) + if (nid == MAX_NUMNODES || early_node_map[index].nid == nid) + return index; + + return -1; +} + +#define for_each_active_range_index_in_nid_reverse(i, nid) \ + for (i = last_active_region_index_in_nid(nid); i != -1; \ + i = previous_active_region_index_in_nid(i, nid)) + u64 __init find_memory_core_early(int nid, u64 size, u64 align, u64 goal, u64 limit) { From a56ec98357ad26b380f1005198de1aa519c9e9cb Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Sun, 27 Feb 2011 19:13:39 +0100 Subject: [PATCH 495/706] x86: dt: Correct local apic documentation in device tree bindings Until "x86: dt: Cleanup local apic setup" we read the local apic address from the MSR and ignored the entry in DT. Reflect this change in the documentation. 
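The binding text removed below was stale because, since the cleanup referenced above, the local APIC base is taken from the IA32_APIC_BASE MSR rather than from the device tree. A rough sketch of that derivation (illustrative only, not the exact code in the lapic setup path):

	u64 msr;

	rdmsrl(MSR_IA32_APICBASE, msr);
	mp_lapic_addr = msr & MSR_IA32_APICBASE_BASE;	/* physical base, 4k aligned */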
Signed-off-by: Sebastian Andrzej Siewior LKML-Reference: <1298830419-22681-1-git-send-email-bigeasy@linutronix.de> Signed-off-by: Thomas Gleixner --- Documentation/devicetree/bindings/x86/interrupt.txt | 3 --- 1 file changed, 3 deletions(-) diff --git a/Documentation/devicetree/bindings/x86/interrupt.txt b/Documentation/devicetree/bindings/x86/interrupt.txt index 8b0efb097e60..7d19f494f19a 100644 --- a/Documentation/devicetree/bindings/x86/interrupt.txt +++ b/Documentation/devicetree/bindings/x86/interrupt.txt @@ -24,6 +24,3 @@ Interrupt chips Required property: compatible = "intel,ce4100-lapic"; - - This node is currently unused by Linux as the address of the local APIC - read from a MSR. From 39f2205e1abd1b6fffdaf45e1f1c3049a5f8999c Mon Sep 17 00:00:00 2001 From: Jan Beulich Date: Mon, 28 Feb 2011 15:31:59 +0000 Subject: [PATCH 496/706] x86-64: Add CFI annotations to lib/rwsem_64.S These weren't part of the initial commit of this code. Signed-off-by: Jan Beulich Cc: Alexander van Heukelum LKML-Reference: <4D6BCDFF02000078000341B0@vpn.id2.novell.com> Signed-off-by: Ingo Molnar --- arch/x86/lib/rwsem_64.S | 56 +++++++++++++++++++++++++---------------- 1 file changed, 34 insertions(+), 22 deletions(-) diff --git a/arch/x86/lib/rwsem_64.S b/arch/x86/lib/rwsem_64.S index 41fcf00e49df..67743977398b 100644 --- a/arch/x86/lib/rwsem_64.S +++ b/arch/x86/lib/rwsem_64.S @@ -23,43 +23,50 @@ #include #define save_common_regs \ - pushq %rdi; \ - pushq %rsi; \ - pushq %rcx; \ - pushq %r8; \ - pushq %r9; \ - pushq %r10; \ - pushq %r11 + pushq_cfi %rdi; CFI_REL_OFFSET rdi, 0; \ + pushq_cfi %rsi; CFI_REL_OFFSET rsi, 0; \ + pushq_cfi %rcx; CFI_REL_OFFSET rcx, 0; \ + pushq_cfi %r8; CFI_REL_OFFSET r8, 0; \ + pushq_cfi %r9; CFI_REL_OFFSET r9, 0; \ + pushq_cfi %r10; CFI_REL_OFFSET r10, 0; \ + pushq_cfi %r11; CFI_REL_OFFSET r11, 0 #define restore_common_regs \ - popq %r11; \ - popq %r10; \ - popq %r9; \ - popq %r8; \ - popq %rcx; \ - popq %rsi; \ - popq %rdi + popq_cfi %r11; CFI_RESTORE r11; \ + popq_cfi %r10; CFI_RESTORE r10; \ + popq_cfi %r9; CFI_RESTORE r9; \ + popq_cfi %r8; CFI_RESTORE r8; \ + popq_cfi %rcx; CFI_RESTORE rcx; \ + popq_cfi %rsi; CFI_RESTORE rsi; \ + popq_cfi %rdi; CFI_RESTORE rdi /* Fix up special calling conventions */ ENTRY(call_rwsem_down_read_failed) + CFI_STARTPROC save_common_regs - pushq %rdx + pushq_cfi %rdx + CFI_REL_OFFSET rdx, 0 movq %rax,%rdi call rwsem_down_read_failed - popq %rdx + popq_cfi %rdx + CFI_RESTORE rdx restore_common_regs ret - ENDPROC(call_rwsem_down_read_failed) + CFI_ENDPROC +ENDPROC(call_rwsem_down_read_failed) ENTRY(call_rwsem_down_write_failed) + CFI_STARTPROC save_common_regs movq %rax,%rdi call rwsem_down_write_failed restore_common_regs ret - ENDPROC(call_rwsem_down_write_failed) + CFI_ENDPROC +ENDPROC(call_rwsem_down_write_failed) ENTRY(call_rwsem_wake) + CFI_STARTPROC decl %edx /* do nothing if still outstanding active readers */ jnz 1f save_common_regs @@ -67,15 +74,20 @@ ENTRY(call_rwsem_wake) call rwsem_wake restore_common_regs 1: ret - ENDPROC(call_rwsem_wake) + CFI_ENDPROC +ENDPROC(call_rwsem_wake) /* Fix up special calling conventions */ ENTRY(call_rwsem_downgrade_wake) + CFI_STARTPROC save_common_regs - pushq %rdx + pushq_cfi %rdx + CFI_REL_OFFSET rdx, 0 movq %rax,%rdi call rwsem_downgrade_wake - popq %rdx + popq_cfi %rdx + CFI_RESTORE rdx restore_common_regs ret - ENDPROC(call_rwsem_downgrade_wake) + CFI_ENDPROC +ENDPROC(call_rwsem_downgrade_wake) From 60cf637a13932a4750da6746efd0199e8a4c341b Mon Sep 17 00:00:00 2001 From: Jan Beulich Date: Mon, 28 
Feb 2011 15:54:40 +0000 Subject: [PATCH 497/706] x86: Use {push,pop}_cfi in more places Cleaning up and shortening code... Signed-off-by: Jan Beulich Cc: Alexander van Heukelum LKML-Reference: <4D6BD35002000078000341DA@vpn.id2.novell.com> Signed-off-by: Ingo Molnar --- arch/x86/ia32/ia32entry.S | 27 +++++---------- arch/x86/include/asm/frame.h | 6 ++-- arch/x86/kernel/entry_32.S | 3 +- arch/x86/lib/atomic64_386_32.S | 6 ++-- arch/x86/lib/atomic64_cx8_32.S | 6 ++-- arch/x86/lib/checksum_32.S | 63 ++++++++++++---------------------- arch/x86/lib/semaphore_32.S | 36 +++++++------------ 7 files changed, 49 insertions(+), 98 deletions(-) diff --git a/arch/x86/ia32/ia32entry.S b/arch/x86/ia32/ia32entry.S index 518bb99c3394..7c6aabd01fc5 100644 --- a/arch/x86/ia32/ia32entry.S +++ b/arch/x86/ia32/ia32entry.S @@ -126,26 +126,20 @@ ENTRY(ia32_sysenter_target) */ ENABLE_INTERRUPTS(CLBR_NONE) movl %ebp,%ebp /* zero extension */ - pushq $__USER32_DS - CFI_ADJUST_CFA_OFFSET 8 + pushq_cfi $__USER32_DS /*CFI_REL_OFFSET ss,0*/ - pushq %rbp - CFI_ADJUST_CFA_OFFSET 8 + pushq_cfi %rbp CFI_REL_OFFSET rsp,0 - pushfq - CFI_ADJUST_CFA_OFFSET 8 + pushfq_cfi /*CFI_REL_OFFSET rflags,0*/ movl 8*3-THREAD_SIZE+TI_sysenter_return(%rsp), %r10d CFI_REGISTER rip,r10 - pushq $__USER32_CS - CFI_ADJUST_CFA_OFFSET 8 + pushq_cfi $__USER32_CS /*CFI_REL_OFFSET cs,0*/ movl %eax, %eax - pushq %r10 - CFI_ADJUST_CFA_OFFSET 8 + pushq_cfi %r10 CFI_REL_OFFSET rip,0 - pushq %rax - CFI_ADJUST_CFA_OFFSET 8 + pushq_cfi %rax cld SAVE_ARGS 0,0,1 /* no need to do an access_ok check here because rbp has been @@ -182,11 +176,9 @@ sysexit_from_sys_call: xorq %r9,%r9 xorq %r10,%r10 xorq %r11,%r11 - popfq - CFI_ADJUST_CFA_OFFSET -8 + popfq_cfi /*CFI_RESTORE rflags*/ - popq %rcx /* User %esp */ - CFI_ADJUST_CFA_OFFSET -8 + popq_cfi %rcx /* User %esp */ CFI_REGISTER rsp,rcx TRACE_IRQS_ON ENABLE_INTERRUPTS_SYSEXIT32 @@ -421,8 +413,7 @@ ENTRY(ia32_syscall) */ ENABLE_INTERRUPTS(CLBR_NONE) movl %eax,%eax - pushq %rax - CFI_ADJUST_CFA_OFFSET 8 + pushq_cfi %rax cld /* note the registers are not zero extended to the sf. this could be a problem. 
*/ diff --git a/arch/x86/include/asm/frame.h b/arch/x86/include/asm/frame.h index 06850a7194e1..2c6fc9e62812 100644 --- a/arch/x86/include/asm/frame.h +++ b/arch/x86/include/asm/frame.h @@ -7,14 +7,12 @@ frame pointer later */ #ifdef CONFIG_FRAME_POINTER .macro FRAME - pushl %ebp - CFI_ADJUST_CFA_OFFSET 4 + pushl_cfi %ebp CFI_REL_OFFSET ebp,0 movl %esp,%ebp .endm .macro ENDFRAME - popl %ebp - CFI_ADJUST_CFA_OFFSET -4 + popl_cfi %ebp CFI_RESTORE ebp .endm #else diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S index 49bdedd2dbcf..2878821cb8c1 100644 --- a/arch/x86/kernel/entry_32.S +++ b/arch/x86/kernel/entry_32.S @@ -1409,8 +1409,7 @@ END(general_protection) #ifdef CONFIG_KVM_GUEST ENTRY(async_page_fault) RING0_EC_FRAME - pushl $do_async_page_fault - CFI_ADJUST_CFA_OFFSET 4 + pushl_cfi $do_async_page_fault jmp error_code CFI_ENDPROC END(apf_page_fault) diff --git a/arch/x86/lib/atomic64_386_32.S b/arch/x86/lib/atomic64_386_32.S index 2cda60a06e65..e8e7e0d06f42 100644 --- a/arch/x86/lib/atomic64_386_32.S +++ b/arch/x86/lib/atomic64_386_32.S @@ -15,14 +15,12 @@ /* if you want SMP support, implement these with real spinlocks */ .macro LOCK reg - pushfl - CFI_ADJUST_CFA_OFFSET 4 + pushfl_cfi cli .endm .macro UNLOCK reg - popfl - CFI_ADJUST_CFA_OFFSET -4 + popfl_cfi .endm #define BEGIN(op) \ diff --git a/arch/x86/lib/atomic64_cx8_32.S b/arch/x86/lib/atomic64_cx8_32.S index 71e080de3352..391a083674b4 100644 --- a/arch/x86/lib/atomic64_cx8_32.S +++ b/arch/x86/lib/atomic64_cx8_32.S @@ -14,14 +14,12 @@ #include .macro SAVE reg - pushl %\reg - CFI_ADJUST_CFA_OFFSET 4 + pushl_cfi %\reg CFI_REL_OFFSET \reg, 0 .endm .macro RESTORE reg - popl %\reg - CFI_ADJUST_CFA_OFFSET -4 + popl_cfi %\reg CFI_RESTORE \reg .endm diff --git a/arch/x86/lib/checksum_32.S b/arch/x86/lib/checksum_32.S index adbccd0bbb78..78d16a554db0 100644 --- a/arch/x86/lib/checksum_32.S +++ b/arch/x86/lib/checksum_32.S @@ -50,11 +50,9 @@ unsigned int csum_partial(const unsigned char * buff, int len, unsigned int sum) */ ENTRY(csum_partial) CFI_STARTPROC - pushl %esi - CFI_ADJUST_CFA_OFFSET 4 + pushl_cfi %esi CFI_REL_OFFSET esi, 0 - pushl %ebx - CFI_ADJUST_CFA_OFFSET 4 + pushl_cfi %ebx CFI_REL_OFFSET ebx, 0 movl 20(%esp),%eax # Function arg: unsigned int sum movl 16(%esp),%ecx # Function arg: int len @@ -132,11 +130,9 @@ ENTRY(csum_partial) jz 8f roll $8, %eax 8: - popl %ebx - CFI_ADJUST_CFA_OFFSET -4 + popl_cfi %ebx CFI_RESTORE ebx - popl %esi - CFI_ADJUST_CFA_OFFSET -4 + popl_cfi %esi CFI_RESTORE esi ret CFI_ENDPROC @@ -148,11 +144,9 @@ ENDPROC(csum_partial) ENTRY(csum_partial) CFI_STARTPROC - pushl %esi - CFI_ADJUST_CFA_OFFSET 4 + pushl_cfi %esi CFI_REL_OFFSET esi, 0 - pushl %ebx - CFI_ADJUST_CFA_OFFSET 4 + pushl_cfi %ebx CFI_REL_OFFSET ebx, 0 movl 20(%esp),%eax # Function arg: unsigned int sum movl 16(%esp),%ecx # Function arg: int len @@ -260,11 +254,9 @@ ENTRY(csum_partial) jz 90f roll $8, %eax 90: - popl %ebx - CFI_ADJUST_CFA_OFFSET -4 + popl_cfi %ebx CFI_RESTORE ebx - popl %esi - CFI_ADJUST_CFA_OFFSET -4 + popl_cfi %esi CFI_RESTORE esi ret CFI_ENDPROC @@ -309,14 +301,11 @@ ENTRY(csum_partial_copy_generic) CFI_STARTPROC subl $4,%esp CFI_ADJUST_CFA_OFFSET 4 - pushl %edi - CFI_ADJUST_CFA_OFFSET 4 + pushl_cfi %edi CFI_REL_OFFSET edi, 0 - pushl %esi - CFI_ADJUST_CFA_OFFSET 4 + pushl_cfi %esi CFI_REL_OFFSET esi, 0 - pushl %ebx - CFI_ADJUST_CFA_OFFSET 4 + pushl_cfi %ebx CFI_REL_OFFSET ebx, 0 movl ARGBASE+16(%esp),%eax # sum movl ARGBASE+12(%esp),%ecx # len @@ -426,17 +415,13 @@ DST( movb %cl, (%edi) ) .previous - popl 
%ebx - CFI_ADJUST_CFA_OFFSET -4 + popl_cfi %ebx CFI_RESTORE ebx - popl %esi - CFI_ADJUST_CFA_OFFSET -4 + popl_cfi %esi CFI_RESTORE esi - popl %edi - CFI_ADJUST_CFA_OFFSET -4 + popl_cfi %edi CFI_RESTORE edi - popl %ecx # equivalent to addl $4,%esp - CFI_ADJUST_CFA_OFFSET -4 + popl_cfi %ecx # equivalent to addl $4,%esp ret CFI_ENDPROC ENDPROC(csum_partial_copy_generic) @@ -459,14 +444,11 @@ ENDPROC(csum_partial_copy_generic) ENTRY(csum_partial_copy_generic) CFI_STARTPROC - pushl %ebx - CFI_ADJUST_CFA_OFFSET 4 + pushl_cfi %ebx CFI_REL_OFFSET ebx, 0 - pushl %edi - CFI_ADJUST_CFA_OFFSET 4 + pushl_cfi %edi CFI_REL_OFFSET edi, 0 - pushl %esi - CFI_ADJUST_CFA_OFFSET 4 + pushl_cfi %esi CFI_REL_OFFSET esi, 0 movl ARGBASE+4(%esp),%esi #src movl ARGBASE+8(%esp),%edi #dst @@ -527,14 +509,11 @@ DST( movb %dl, (%edi) ) jmp 7b .previous - popl %esi - CFI_ADJUST_CFA_OFFSET -4 + popl_cfi %esi CFI_RESTORE esi - popl %edi - CFI_ADJUST_CFA_OFFSET -4 + popl_cfi %edi CFI_RESTORE edi - popl %ebx - CFI_ADJUST_CFA_OFFSET -4 + popl_cfi %ebx CFI_RESTORE ebx ret CFI_ENDPROC diff --git a/arch/x86/lib/semaphore_32.S b/arch/x86/lib/semaphore_32.S index 648fe4741782..48e44f7ed76e 100644 --- a/arch/x86/lib/semaphore_32.S +++ b/arch/x86/lib/semaphore_32.S @@ -74,29 +74,23 @@ ENTRY(__read_lock_failed) /* Fix up special calling conventions */ ENTRY(call_rwsem_down_read_failed) CFI_STARTPROC - push %ecx - CFI_ADJUST_CFA_OFFSET 4 + pushl_cfi %ecx CFI_REL_OFFSET ecx,0 - push %edx - CFI_ADJUST_CFA_OFFSET 4 + pushl_cfi %edx CFI_REL_OFFSET edx,0 call rwsem_down_read_failed - pop %edx - CFI_ADJUST_CFA_OFFSET -4 - pop %ecx - CFI_ADJUST_CFA_OFFSET -4 + popl_cfi %edx + popl_cfi %ecx ret CFI_ENDPROC ENDPROC(call_rwsem_down_read_failed) ENTRY(call_rwsem_down_write_failed) CFI_STARTPROC - push %ecx - CFI_ADJUST_CFA_OFFSET 4 + pushl_cfi %ecx CFI_REL_OFFSET ecx,0 calll rwsem_down_write_failed - pop %ecx - CFI_ADJUST_CFA_OFFSET -4 + popl_cfi %ecx ret CFI_ENDPROC ENDPROC(call_rwsem_down_write_failed) @@ -105,12 +99,10 @@ ENTRY(call_rwsem_wake) CFI_STARTPROC decw %dx /* do nothing if still outstanding active readers */ jnz 1f - push %ecx - CFI_ADJUST_CFA_OFFSET 4 + pushl_cfi %ecx CFI_REL_OFFSET ecx,0 call rwsem_wake - pop %ecx - CFI_ADJUST_CFA_OFFSET -4 + popl_cfi %ecx 1: ret CFI_ENDPROC ENDPROC(call_rwsem_wake) @@ -118,17 +110,13 @@ ENTRY(call_rwsem_wake) /* Fix up special calling conventions */ ENTRY(call_rwsem_downgrade_wake) CFI_STARTPROC - push %ecx - CFI_ADJUST_CFA_OFFSET 4 + pushl_cfi %ecx CFI_REL_OFFSET ecx,0 - push %edx - CFI_ADJUST_CFA_OFFSET 4 + pushl_cfi %edx CFI_REL_OFFSET edx,0 call rwsem_downgrade_wake - pop %edx - CFI_ADJUST_CFA_OFFSET -4 - pop %ecx - CFI_ADJUST_CFA_OFFSET -4 + popl_cfi %edx + popl_cfi %ecx ret CFI_ENDPROC ENDPROC(call_rwsem_downgrade_wake) From 039e13890b0615cb8c5c04b6afa84d676e24c761 Mon Sep 17 00:00:00 2001 From: Jan Beulich Date: Mon, 28 Feb 2011 15:56:00 +0000 Subject: [PATCH 498/706] x86: Remove unused bits from lib/thunk_*.S Some of the items removed were apparently never used, others simply didn't get removed with their last user. 
Signed-off-by: Jan Beulich LKML-Reference: <4D6BD3A002000078000341F1@vpn.id2.novell.com> Signed-off-by: Ingo Molnar --- arch/x86/lib/thunk_32.S | 18 ------------------ arch/x86/lib/thunk_64.S | 27 --------------------------- 2 files changed, 45 deletions(-) diff --git a/arch/x86/lib/thunk_32.S b/arch/x86/lib/thunk_32.S index 650b11e00ecc..2930ae05d773 100644 --- a/arch/x86/lib/thunk_32.S +++ b/arch/x86/lib/thunk_32.S @@ -7,24 +7,6 @@ #include -#define ARCH_TRACE_IRQS_ON \ - pushl %eax; \ - pushl %ecx; \ - pushl %edx; \ - call trace_hardirqs_on; \ - popl %edx; \ - popl %ecx; \ - popl %eax; - -#define ARCH_TRACE_IRQS_OFF \ - pushl %eax; \ - pushl %ecx; \ - pushl %edx; \ - call trace_hardirqs_off; \ - popl %edx; \ - popl %ecx; \ - popl %eax; - #ifdef CONFIG_TRACE_IRQFLAGS /* put return address in eax (arg1) */ .macro thunk_ra name,func diff --git a/arch/x86/lib/thunk_64.S b/arch/x86/lib/thunk_64.S index bf9a7d5a5428..782b082c9ff7 100644 --- a/arch/x86/lib/thunk_64.S +++ b/arch/x86/lib/thunk_64.S @@ -22,26 +22,6 @@ CFI_ENDPROC .endm - /* rdi: arg1 ... normal C conventions. rax is passed from C. */ - .macro thunk_retrax name,func - .globl \name -\name: - CFI_STARTPROC - SAVE_ARGS - call \func - jmp restore_norax - CFI_ENDPROC - .endm - - - .section .sched.text, "ax" -#ifdef CONFIG_RWSEM_XCHGADD_ALGORITHM - thunk rwsem_down_read_failed_thunk,rwsem_down_read_failed - thunk rwsem_down_write_failed_thunk,rwsem_down_write_failed - thunk rwsem_wake_thunk,rwsem_wake - thunk rwsem_downgrade_thunk,rwsem_downgrade_wake -#endif - #ifdef CONFIG_TRACE_IRQFLAGS /* put return address in rdi (arg1) */ .macro thunk_ra name,func @@ -72,10 +52,3 @@ restore: RESTORE_ARGS ret CFI_ENDPROC - - CFI_STARTPROC - SAVE_ARGS -restore_norax: - RESTORE_ARGS 1 - ret - CFI_ENDPROC From bfc39061d3dbf812e6a78f9529a548e5f0050c64 Mon Sep 17 00:00:00 2001 From: Jan Beulich Date: Tue, 1 Mar 2011 11:14:55 +0000 Subject: [PATCH 499/706] um, x86-64: Fix UML build after adding CFI annotations to lib/rwsem_64.S arch/um/Kconfig.x86 has X86_32 but not X86_64 - that's resulting in asm/dwarf2.h producing the 32-bit (pushl_cfi & Co) macros instead of the 64-bit ones. Signed-off-by: Jan Beulich Cc: Jeff Dike LKML-Reference: <4D6CE3400200007800034498@vpn.id2.novell.com> Signed-off-by: Ingo Molnar --- arch/um/Kconfig.x86 | 3 +++ 1 file changed, 3 insertions(+) diff --git a/arch/um/Kconfig.x86 b/arch/um/Kconfig.x86 index 5ee328099c63..62f21156aaa4 100644 --- a/arch/um/Kconfig.x86 +++ b/arch/um/Kconfig.x86 @@ -19,6 +19,9 @@ config X86_32 def_bool !64BIT select HAVE_AOUT +config X86_64 + def_bool 64BIT + config RWSEM_XCHGADD_ALGORITHM def_bool X86_XADD From 3166fc8fb6a2f52273d545e970297524e02c3e39 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Tue, 1 Mar 2011 10:21:44 -0300 Subject: [PATCH 500/706] perf top browser: Handle empty active symbols list Fixing a SEGV. An empty list could happen when not being able to resolve symbols, for instance when --vmlinux invalid-file is used. 
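The failure mode is the classic empty-container walk: rb_first() on an empty rb_root returns NULL, and computing widths from that result ends up dereferencing a bogus pointer. A minimal sketch of the guard pattern the patch adds (the helper name here is hypothetical, not the perf code itself):

	static void update_widths(struct rb_root *root)
	{
		struct rb_node *nd;

		if (rb_first(root) == NULL)	/* empty tree: no entries to measure */
			return;
		for (nd = rb_first(root); nd; nd = rb_next(nd)) {
			/* ... accumulate column widths from each entry ... */
		}
	}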
Cc: Frederic Weisbecker Cc: Ingo Molnar Cc: Mike Galbraith Cc: Paul Mackerras Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Tom Zanussi LKML-Reference: Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/ui/browsers/top.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/tools/perf/util/ui/browsers/top.c b/tools/perf/util/ui/browsers/top.c index e9381ec9bfb1..5a06538532af 100644 --- a/tools/perf/util/ui/browsers/top.c +++ b/tools/perf/util/ui/browsers/top.c @@ -76,6 +76,12 @@ static void perf_top_browser__update_rb_tree(struct perf_top_browser *browser) browser->root = RB_ROOT; browser->b.top = NULL; browser->sum_ksamples = perf_top__decay_samples(top, &browser->root); + /* + * No active symbols + */ + if (top->rb_entries == 0) + return; + perf_top__find_widths(top, &browser->root, &browser->dso_width, &browser->dso_short_width, &browser->sym_width); From a1ceb741cf86ef433006379742db81c00b450bae Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Tue, 1 Mar 2011 10:24:43 -0300 Subject: [PATCH 501/706] perf tui: Make ui__warning modal By taking the ui__lock so that no other screen updates take place while waiting for the user. That was happening when handling an invalid --vmlinux parameter in 'perf top --tui', with the screen refresh routine repainting the screen and removing the warning window. Cc: Frederic Weisbecker Cc: Ingo Molnar Cc: Mike Galbraith Cc: Paul Mackerras Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Tom Zanussi LKML-Reference: Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/ui/util.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/tools/perf/util/ui/util.c b/tools/perf/util/ui/util.c index 7b5a8926624e..fdf1fc8f08bc 100644 --- a/tools/perf/util/ui/util.c +++ b/tools/perf/util/ui/util.c @@ -9,6 +9,7 @@ #include "../debug.h" #include "browser.h" #include "helpline.h" +#include "ui.h" #include "util.h" static void newt_form__set_exit_keys(newtComponent self) @@ -118,10 +119,12 @@ void ui__warning(const char *format, ...) va_list args; va_start(args, format); - if (use_browser > 0) + if (use_browser > 0) { + pthread_mutex_lock(&ui__lock); newtWinMessagev((char *)warning_str, (char *)ok, (char *)format, args); - else + pthread_mutex_unlock(&ui__lock); + } else vfprintf(stderr, format, args); va_end(args); } From 374cfe56892701f062586d6a6de6cb71777a4184 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Tue, 1 Mar 2011 10:27:27 -0300 Subject: [PATCH 502/706] perf top: Fix reporting of invalid --vmlinux Using ui__warning, that will, in --tui, show a window with the message, waiting for the user to press Ok. Also run exit_browser() to let newt do its final cleaning of the screen. 
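Ordering matters in the hunk below: exit_browser() has to run before exit() so that newt can restore the terminal state; otherwise the warning is wiped out or the shell is left in raw mode. Schematically (the condition name is illustrative):

	if (vmlinux_unusable) {
		ui__warning("The %s file can't be used\n", symbol_conf.vmlinux_name);
		exit_browser(0);	/* tear the TUI down first */
		exit(1);
	}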
Cc: Frederic Weisbecker Cc: Ingo Molnar Cc: Mike Galbraith Cc: Paul Mackerras Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Tom Zanussi LKML-Reference: Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-top.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/tools/perf/builtin-top.c b/tools/perf/builtin-top.c index f88a2630e1fc..0b07cc30b669 100644 --- a/tools/perf/builtin-top.c +++ b/tools/perf/builtin-top.c @@ -740,8 +740,9 @@ static void perf_event__process_sample(const union perf_event *event, */ if (al.map == machine->vmlinux_maps[MAP__FUNCTION] && RB_EMPTY_ROOT(&al.map->dso->symbols[MAP__FUNCTION])) { - pr_err("The %s file can't be used\n", - symbol_conf.vmlinux_name); + ui__warning("The %s file can't be used\n", + symbol_conf.vmlinux_name); + exit_browser(0); exit(1); } From 5807806a92450fd57f8063868efae9d4af74db02 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Tue, 1 Mar 2011 10:43:03 -0300 Subject: [PATCH 503/706] perf top tui: Wait till the first sample to refresh the screen. Cc: Frederic Weisbecker Cc: Ingo Molnar Cc: Mike Galbraith Cc: Paul Mackerras Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Tom Zanussi LKML-Reference: Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-top.c | 21 +++++++++++++++++++-- tools/perf/util/top.h | 1 + 2 files changed, 20 insertions(+), 2 deletions(-) diff --git a/tools/perf/builtin-top.c b/tools/perf/builtin-top.c index 0b07cc30b669..417f757e3cbe 100644 --- a/tools/perf/builtin-top.c +++ b/tools/perf/builtin-top.c @@ -72,6 +72,7 @@ static struct perf_top top = { .target_tid = -1, .active_symbols = LIST_HEAD_INIT(top.active_symbols), .active_symbols_lock = PTHREAD_MUTEX_INITIALIZER, + .active_symbols_cond = PTHREAD_COND_INITIALIZER, .freq = 1000, /* 1 KHz */ }; @@ -577,7 +578,17 @@ static void handle_keypress(struct perf_session *session, int c) static void *display_thread_tui(void *arg __used) { - perf_top__tui_browser(&top); + int err = 0; + pthread_mutex_lock(&top.active_symbols_lock); + while (list_empty(&top.active_symbols)) { + err = pthread_cond_wait(&top.active_symbols_cond, + &top.active_symbols_lock); + if (err) + break; + } + pthread_mutex_unlock(&top.active_symbols_lock); + if (!err) + perf_top__tui_browser(&top); exit_browser(0); exit(0); return NULL; @@ -776,8 +787,14 @@ static void perf_event__process_sample(const union perf_event *event, syme->count[evsel->idx]++; record_precise_ip(syme, evsel->idx, ip); pthread_mutex_lock(&top.active_symbols_lock); - if (list_empty(&syme->node) || !syme->node.next) + if (list_empty(&syme->node) || !syme->node.next) { + static bool first = true; __list_insert_active_sym(syme); + if (first) { + pthread_cond_broadcast(&top.active_symbols_cond); + first = false; + } + } pthread_mutex_unlock(&top.active_symbols_lock); } } diff --git a/tools/perf/util/top.h b/tools/perf/util/top.h index e8d28e2ecdba..96d1cb78af01 100644 --- a/tools/perf/util/top.h +++ b/tools/perf/util/top.h @@ -35,6 +35,7 @@ struct perf_top { */ struct list_head active_symbols; pthread_mutex_t active_symbols_lock; + pthread_cond_t active_symbols_cond; u64 samples; u64 kernel_samples, us_samples; u64 exact_samples; From 44e69767cb7c3bc46e5370c39532c205d4347d80 Mon Sep 17 00:00:00 2001 From: Ian Campbell Date: Tue, 1 Mar 2011 20:05:49 +0000 Subject: [PATCH 504/706] xen: ia64 build broken due to "xen: switch to new schedop hypercall by default." 
The git commit: > commit a8b7458363b9174f3c2196ca6085630b4b30b7a1 > Author: Ian Campbell > Date: Thu Feb 17 11:04:20 2011 +0000 > > xen: switch to new schedop hypercall by default. > > Rename old interface to sched_op_compat and rename sched_op_new to > simply sched_op. > breaks the IA64 build. This patch fixes it. Signed-off-by: Tony Luck Signed-off-by: Ian Campbell Signed-off-by: Konrad Rzeszutek Wilk --- arch/ia64/include/asm/xen/hypercall.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/ia64/include/asm/xen/hypercall.h b/arch/ia64/include/asm/xen/hypercall.h index 96fc62366aa4..ed28bcd5bb85 100644 --- a/arch/ia64/include/asm/xen/hypercall.h +++ b/arch/ia64/include/asm/xen/hypercall.h @@ -107,7 +107,7 @@ extern unsigned long __hypercall(unsigned long a1, unsigned long a2, static inline int xencomm_arch_hypercall_sched_op(int cmd, struct xencomm_handle *arg) { - return _hypercall2(int, sched_op_new, cmd, arg); + return _hypercall2(int, sched_op, cmd, arg); } static inline long From e938c287ea8d977e079f07464ac69923412663ce Mon Sep 17 00:00:00 2001 From: Jan Beulich Date: Tue, 1 Mar 2011 14:28:02 +0000 Subject: [PATCH 505/706] x86: Fix a bogus unwind annotation in lib/semaphore_32.S 'simple' would have required specifying current frame address and return address location manually, but that's obviously not the case (and not necessary) here. Signed-off-by: Jan Beulich LKML-Reference: <4D6D1082020000780003454C@vpn.id2.novell.com> Signed-off-by: Ingo Molnar --- arch/x86/lib/semaphore_32.S | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/lib/semaphore_32.S b/arch/x86/lib/semaphore_32.S index 48e44f7ed76e..06691daa4108 100644 --- a/arch/x86/lib/semaphore_32.S +++ b/arch/x86/lib/semaphore_32.S @@ -36,7 +36,7 @@ */ #ifdef CONFIG_SMP ENTRY(__write_lock_failed) - CFI_STARTPROC simple + CFI_STARTPROC FRAME 2: LOCK_PREFIX addl $ RW_LOCK_BIAS,(%eax) From c69e3758ff56d03e161187355791ec992c574276 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 2 Mar 2011 11:49:21 +0100 Subject: [PATCH 506/706] genirq: Fixup fasteoi handler for oneshot mode The fasteoi handler must mask the interrupt line in oneshot mode otherwise we end up with an irq storm. Signed-off-by: Thomas Gleixner --- kernel/irq/chip.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index b5145654855f..c9c0601f0615 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -513,6 +513,10 @@ handle_fasteoi_irq(unsigned int irq, struct irq_desc *desc) mask_irq(desc); goto out; } + + if (desc->istate & IRQS_ONESHOT) + mask_irq(desc); + preflow_handler(desc); handle_irq_event(desc); From b06b3d49699a52e8f9ca056c4f96e81b1987d78e Mon Sep 17 00:00:00 2001 From: Lin Ming Date: Wed, 2 Mar 2011 21:27:04 +0800 Subject: [PATCH 507/706] perf, x86: Add Intel SandyBridge CPU support This patch adds basic SandyBridge support, including hardware cache events and PEBS events support. It has been tested on SandyBridge CPUs with perf stat and also with PEBS based profiling - both work fine. The patch does not affect other models. 
v2 -> v3: - fix PEBS event 0xd0 with right umask combinations - move snb pebs constraint assignment to intel_pmu_init v1 -> v2: - add more raw and PEBS events constraints - use offcore events for LLC-* cache events - remove the call to Nehalem workaround enable_all function Signed-off-by: Lin Ming Acked-by: Peter Zijlstra Cc: Stephane Eranian Cc: Andi Kleen LKML-Reference: <1299072424.2175.24.camel@localhost> Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event.c | 4 +- arch/x86/kernel/cpu/perf_event_intel.c | 124 ++++++++++++++++++++++ arch/x86/kernel/cpu/perf_event_intel_ds.c | 38 +++++++ 3 files changed, 165 insertions(+), 1 deletion(-) diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index 9d977a2ea693..390fa6d8c140 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -166,8 +166,10 @@ struct cpu_hw_events { /* * Constraint on the Event code + UMask */ -#define PEBS_EVENT_CONSTRAINT(c, n) \ +#define INTEL_UEVENT_CONSTRAINT(c, n) \ EVENT_CONSTRAINT(c, n, INTEL_ARCH_EVENT_MASK) +#define PEBS_EVENT_CONSTRAINT(c, n) \ + INTEL_UEVENT_CONSTRAINT(c, n) #define EVENT_CONSTRAINT_END \ EVENT_CONSTRAINT(0, 0, 0) diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c index 008835c1d79c..d00f386b0b30 100644 --- a/arch/x86/kernel/cpu/perf_event_intel.c +++ b/arch/x86/kernel/cpu/perf_event_intel.c @@ -76,6 +76,19 @@ static struct event_constraint intel_westmere_event_constraints[] = EVENT_CONSTRAINT_END }; +static struct event_constraint intel_snb_event_constraints[] = +{ + FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */ + FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */ + /* FIXED_EVENT_CONSTRAINT(0x013c, 2), CPU_CLK_UNHALTED.REF */ + INTEL_EVENT_CONSTRAINT(0x48, 0x4), /* L1D_PEND_MISS.PENDING */ + INTEL_EVENT_CONSTRAINT(0xb7, 0x1), /* OFF_CORE_RESPONSE_0 */ + INTEL_EVENT_CONSTRAINT(0xbb, 0x8), /* OFF_CORE_RESPONSE_1 */ + INTEL_UEVENT_CONSTRAINT(0x01c0, 0x2), /* INST_RETIRED.PREC_DIST */ + INTEL_EVENT_CONSTRAINT(0xcd, 0x8), /* MEM_TRANS_RETIRED.LOAD_LATENCY */ + EVENT_CONSTRAINT_END +}; + static struct event_constraint intel_gen_event_constraints[] = { FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */ @@ -89,6 +102,106 @@ static u64 intel_pmu_event_map(int hw_event) return intel_perfmon_event_map[hw_event]; } +static __initconst const u64 snb_hw_cache_event_ids + [PERF_COUNT_HW_CACHE_MAX] + [PERF_COUNT_HW_CACHE_OP_MAX] + [PERF_COUNT_HW_CACHE_RESULT_MAX] = +{ + [ C(L1D) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0xf1d0, /* MEM_UOP_RETIRED.LOADS */ + [ C(RESULT_MISS) ] = 0x0151, /* L1D.REPLACEMENT */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = 0xf2d0, /* MEM_UOP_RETIRED.STORES */ + [ C(RESULT_MISS) ] = 0x0851, /* L1D.ALL_M_REPLACEMENT */ + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = 0x0, + [ C(RESULT_MISS) ] = 0x024e, /* HW_PRE_REQ.DL1_MISS */ + }, + }, + [ C(L1I ) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x0, + [ C(RESULT_MISS) ] = 0x0280, /* ICACHE.MISSES */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = 0x0, + [ C(RESULT_MISS) ] = 0x0, + }, + }, + [ C(LL ) ] = { + /* + * TBD: Need Off-core Response Performance Monitoring support + */ + [ C(OP_READ) ] = { + /* OFFCORE_RESPONSE_0.ANY_DATA.LOCAL_CACHE */ + [ C(RESULT_ACCESS) ] = 0x01b7, + /* OFFCORE_RESPONSE_1.ANY_DATA.ANY_LLC_MISS */ + [ C(RESULT_MISS) ] = 0x01bb, + }, + [ 
C(OP_WRITE) ] = { + /* OFFCORE_RESPONSE_0.ANY_RFO.LOCAL_CACHE */ + [ C(RESULT_ACCESS) ] = 0x01b7, + /* OFFCORE_RESPONSE_1.ANY_RFO.ANY_LLC_MISS */ + [ C(RESULT_MISS) ] = 0x01bb, + }, + [ C(OP_PREFETCH) ] = { + /* OFFCORE_RESPONSE_0.PREFETCH.LOCAL_CACHE */ + [ C(RESULT_ACCESS) ] = 0x01b7, + /* OFFCORE_RESPONSE_1.PREFETCH.ANY_LLC_MISS */ + [ C(RESULT_MISS) ] = 0x01bb, + }, + }, + [ C(DTLB) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x81d0, /* MEM_UOP_RETIRED.ALL_LOADS */ + [ C(RESULT_MISS) ] = 0x0108, /* DTLB_LOAD_MISSES.CAUSES_A_WALK */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = 0x82d0, /* MEM_UOP_RETIRED.ALL_STORES */ + [ C(RESULT_MISS) ] = 0x0149, /* DTLB_STORE_MISSES.MISS_CAUSES_A_WALK */ + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = 0x0, + [ C(RESULT_MISS) ] = 0x0, + }, + }, + [ C(ITLB) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x1085, /* ITLB_MISSES.STLB_HIT */ + [ C(RESULT_MISS) ] = 0x0185, /* ITLB_MISSES.CAUSES_A_WALK */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + }, + [ C(BPU ) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x00c4, /* BR_INST_RETIRED.ALL_BRANCHES */ + [ C(RESULT_MISS) ] = 0x00c5, /* BR_MISP_RETIRED.ALL_BRANCHES */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + }, +}; + static __initconst const u64 westmere_hw_cache_event_ids [PERF_COUNT_HW_CACHE_MAX] [PERF_COUNT_HW_CACHE_OP_MAX] @@ -1062,6 +1175,17 @@ static __init int intel_pmu_init(void) pr_cont("Westmere events, "); break; + case 42: /* SandyBridge */ + memcpy(hw_cache_event_ids, snb_hw_cache_event_ids, + sizeof(hw_cache_event_ids)); + + intel_pmu_lbr_init_nhm(); + + x86_pmu.event_constraints = intel_snb_event_constraints; + x86_pmu.pebs_constraints = intel_snb_pebs_events; + pr_cont("SandyBridge events, "); + break; + default: /* * default constraints for v2 and up diff --git a/arch/x86/kernel/cpu/perf_event_intel_ds.c b/arch/x86/kernel/cpu/perf_event_intel_ds.c index b7dcd9f2b8a0..825199834885 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_ds.c +++ b/arch/x86/kernel/cpu/perf_event_intel_ds.c @@ -388,6 +388,44 @@ static struct event_constraint intel_nehalem_pebs_events[] = { EVENT_CONSTRAINT_END }; +static struct event_constraint intel_snb_pebs_events[] = { + PEBS_EVENT_CONSTRAINT(0x01c0, 0x2), /* INST_RETIRED.PRECDIST */ + PEBS_EVENT_CONSTRAINT(0x01c2, 0xf), /* UOPS_RETIRED.ALL */ + PEBS_EVENT_CONSTRAINT(0x02c2, 0xf), /* UOPS_RETIRED.RETIRE_SLOTS */ + PEBS_EVENT_CONSTRAINT(0x01c4, 0xf), /* BR_INST_RETIRED.CONDITIONAL */ + PEBS_EVENT_CONSTRAINT(0x02c4, 0xf), /* BR_INST_RETIRED.NEAR_CALL */ + PEBS_EVENT_CONSTRAINT(0x04c4, 0xf), /* BR_INST_RETIRED.ALL_BRANCHES */ + PEBS_EVENT_CONSTRAINT(0x08c4, 0xf), /* BR_INST_RETIRED.NEAR_RETURN */ + PEBS_EVENT_CONSTRAINT(0x10c4, 0xf), /* BR_INST_RETIRED.NOT_TAKEN */ + PEBS_EVENT_CONSTRAINT(0x20c4, 0xf), /* BR_INST_RETIRED.NEAR_TAKEN */ + PEBS_EVENT_CONSTRAINT(0x40c4, 0xf), /* BR_INST_RETIRED.FAR_BRANCH */ + PEBS_EVENT_CONSTRAINT(0x01c5, 0xf), /* BR_MISP_RETIRED.CONDITIONAL */ + PEBS_EVENT_CONSTRAINT(0x02c5, 0xf), /* BR_MISP_RETIRED.NEAR_CALL */ + PEBS_EVENT_CONSTRAINT(0x04c5, 0xf), /* BR_MISP_RETIRED.ALL_BRANCHES */ + PEBS_EVENT_CONSTRAINT(0x10c5, 0xf), /* BR_MISP_RETIRED.NOT_TAKEN */ + PEBS_EVENT_CONSTRAINT(0x20c5, 0xf), /* BR_MISP_RETIRED.TAKEN */ + 
PEBS_EVENT_CONSTRAINT(0x01cd, 0x8), /* MEM_TRANS_RETIRED.LOAD_LATENCY */ + PEBS_EVENT_CONSTRAINT(0x02cd, 0x8), /* MEM_TRANS_RETIRED.PRECISE_STORE */ + PEBS_EVENT_CONSTRAINT(0x11d0, 0xf), /* MEM_UOP_RETIRED.STLB_MISS_LOADS */ + PEBS_EVENT_CONSTRAINT(0x12d0, 0xf), /* MEM_UOP_RETIRED.STLB_MISS_STORES */ + PEBS_EVENT_CONSTRAINT(0x21d0, 0xf), /* MEM_UOP_RETIRED.LOCK_LOADS */ + PEBS_EVENT_CONSTRAINT(0x22d0, 0xf), /* MEM_UOP_RETIRED.LOCK_STORES */ + PEBS_EVENT_CONSTRAINT(0x41d0, 0xf), /* MEM_UOP_RETIRED.SPLIT_LOADS */ + PEBS_EVENT_CONSTRAINT(0x42d0, 0xf), /* MEM_UOP_RETIRED.SPLIT_STORES */ + PEBS_EVENT_CONSTRAINT(0x81d0, 0xf), /* MEM_UOP_RETIRED.ANY_LOADS */ + PEBS_EVENT_CONSTRAINT(0x82d0, 0xf), /* MEM_UOP_RETIRED.ANY_STORES */ + PEBS_EVENT_CONSTRAINT(0x01d1, 0xf), /* MEM_LOAD_UOPS_RETIRED.L1_HIT */ + PEBS_EVENT_CONSTRAINT(0x02d1, 0xf), /* MEM_LOAD_UOPS_RETIRED.L2_HIT */ + PEBS_EVENT_CONSTRAINT(0x04d1, 0xf), /* MEM_LOAD_UOPS_RETIRED.LLC_HIT */ + PEBS_EVENT_CONSTRAINT(0x40d1, 0xf), /* MEM_LOAD_UOPS_RETIRED.HIT_LFB */ + PEBS_EVENT_CONSTRAINT(0x01d2, 0xf), /* MEM_LOAD_UOPS_LLC_HIT_RETIRED.XSNP_MISS */ + PEBS_EVENT_CONSTRAINT(0x02d2, 0xf), /* MEM_LOAD_UOPS_LLC_HIT_RETIRED.XSNP_HIT */ + PEBS_EVENT_CONSTRAINT(0x04d2, 0xf), /* MEM_LOAD_UOPS_LLC_HIT_RETIRED.XSNP_HITM */ + PEBS_EVENT_CONSTRAINT(0x08d2, 0xf), /* MEM_LOAD_UOPS_LLC_HIT_RETIRED.XSNP_NONE */ + PEBS_EVENT_CONSTRAINT(0x02d4, 0xf), /* MEM_LOAD_UOPS_MISC_RETIRED.LLC_MISS */ + EVENT_CONSTRAINT_END +}; + static struct event_constraint * intel_pebs_constraints(struct perf_event *event) { From 0a10247914a5cad3caf7ef8a255c54c4d3ed2062 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Sat, 26 Feb 2011 04:51:54 +0100 Subject: [PATCH 508/706] perf: Set filters before mmaping events We currently set the filters after we mmap the events, this is a race that let undesired events record themselves in the buffer before we had the time to set the filters. So set the filters before they can be recorded. That also librarizes the filters setting so that filtering can be done more easily from other tools than perf record later. 
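The race is purely one of ordering: an event starts counting as soon as it is enabled, so any samples that arrive between mmap() and the filter ioctl land in the ring buffer unfiltered. The safe sequence, sketched for a single event fd (the filter string is just an example):

	int fd = sys_perf_event_open(&attr, pid, cpu, -1, 0);

	/* restrict what may be recorded... */
	if (ioctl(fd, PERF_EVENT_IOC_SET_FILTER, "common_pid != 42") < 0)
		die("failed to set filter");

	/* ...and only then map the ring buffer and start consuming samples */
	base = mmap(NULL, (nr_pages + 1) * page_size,
		    PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);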
Signed-off-by: Frederic Weisbecker Cc: Ingo Molnar Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Tom Zanussi Cc: Arnaldo Carvalho de Melo Cc: Steven Rostedt --- tools/perf/builtin-record.c | 18 ++++++------------ tools/perf/util/evlist.c | 28 ++++++++++++++++++++++++++++ tools/perf/util/evlist.h | 1 + 3 files changed, 35 insertions(+), 12 deletions(-) diff --git a/tools/perf/builtin-record.c b/tools/perf/builtin-record.c index db4cd1e7b51a..d40a81e8cc56 100644 --- a/tools/perf/builtin-record.c +++ b/tools/perf/builtin-record.c @@ -180,12 +180,10 @@ static struct perf_header_attr *get_header_attr(struct perf_event_attr *a, int n static void create_counter(struct perf_evsel *evsel, int cpu) { - char *filter = evsel->filter; struct perf_event_attr *attr = &evsel->attr; struct perf_header_attr *h_attr; struct perf_sample_id *sid; int thread_index; - int ret; for (thread_index = 0; thread_index < evsel_list->threads->nr; thread_index++) { h_attr = get_header_attr(attr, evsel->idx); @@ -204,16 +202,6 @@ static void create_counter(struct perf_evsel *evsel, int cpu) pr_warning("Not enough memory to add id\n"); exit(-1); } - - if (filter != NULL) { - ret = ioctl(FD(evsel, cpu, thread_index), - PERF_EVENT_IOC_SET_FILTER, filter); - if (ret) { - error("failed to set filter with %d (%s)\n", errno, - strerror(errno)); - exit(-1); - } - } } if (!sample_type) @@ -367,6 +355,12 @@ try_again: } } + if (perf_evlist__set_filters(evlist)) { + error("failed to set filter with %d (%s)\n", errno, + strerror(errno)); + exit(-1); + } + if (perf_evlist__mmap(evlist, mmap_pages, false) < 0) die("failed to mmap with %d (%s)\n", errno, strerror(errno)); diff --git a/tools/perf/util/evlist.c b/tools/perf/util/evlist.c index 95b21fece2ce..030ae7f05e03 100644 --- a/tools/perf/util/evlist.c +++ b/tools/perf/util/evlist.c @@ -348,3 +348,31 @@ void perf_evlist__delete_maps(struct perf_evlist *evlist) evlist->cpus = NULL; evlist->threads = NULL; } + +int perf_evlist__set_filters(struct perf_evlist *evlist) +{ + const struct thread_map *threads = evlist->threads; + const struct cpu_map *cpus = evlist->cpus; + struct perf_evsel *evsel; + char *filter; + int thread; + int cpu; + int err; + int fd; + + list_for_each_entry(evsel, &evlist->entries, node) { + filter = evsel->filter; + if (!filter) + continue; + for (cpu = 0; cpu < cpus->nr; cpu++) { + for (thread = 0; thread < threads->nr; thread++) { + fd = FD(evsel, cpu, thread); + err = ioctl(fd, PERF_EVENT_IOC_SET_FILTER, filter); + if (err) + return err; + } + } + } + + return 0; +} diff --git a/tools/perf/util/evlist.h b/tools/perf/util/evlist.h index c9884056097c..b75805aeb7e4 100644 --- a/tools/perf/util/evlist.h +++ b/tools/perf/util/evlist.h @@ -60,5 +60,6 @@ static inline void perf_evlist__set_maps(struct perf_evlist *evlist, int perf_evlist__create_maps(struct perf_evlist *evlist, pid_t target_pid, pid_t target_tid, const char *cpu_list); void perf_evlist__delete_maps(struct perf_evlist *evlist); +int perf_evlist__set_filters(struct perf_evlist *evlist); #endif /* __PERF_EVLIST_H */ From ce0033307f1b45e23e0c149f56ea4855eb4687ce Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Wed, 2 Mar 2011 11:22:14 +0100 Subject: [PATCH 509/706] x86-64, NUMA: Fix distance table handling NUMA distance table handling has the following problems. * numa_reset_distance() uses numa_distance * sizeof(numa_distance[0]) as the table size when it should be using the square of numa_distance. * The same size miscalculation when allocation space for phys_dist in numa_emulation(). 
* In numa_emulation(), phys_dist must be reserved; otherwise, the new emulated distance table may overlap it. Fix them and, while at it, take numa_distance_cnt resetting in numa_reset_distance() out of the if block to simplify the code a bit. David Rientjes reported incorrect handling of distance table during emulation. -tj: Edited out numa_alloc_distance() related changes which weren't necessary and rewrote patch description. -v2: Ingo was unhappy with 80-column limit induced linebreaks. Let lines run over 80-column. Signed-off-by: Yinghai Lu Reported-by: David Rientjes Signed-off-by: Tejun Heo Cc: Ingo Molnar Acked-by: David Rientjes --- arch/x86/mm/numa_64.c | 8 +++----- arch/x86/mm/numa_emulation.c | 14 ++++++++------ 2 files changed, 11 insertions(+), 11 deletions(-) diff --git a/arch/x86/mm/numa_64.c b/arch/x86/mm/numa_64.c index 7757d2214fab..541746fdeb4b 100644 --- a/arch/x86/mm/numa_64.c +++ b/arch/x86/mm/numa_64.c @@ -390,14 +390,12 @@ static void __init numa_nodemask_from_meminfo(nodemask_t *nodemask, */ void __init numa_reset_distance(void) { - size_t size; + size_t size = numa_distance_cnt * numa_distance_cnt * sizeof(numa_distance[0]); - if (numa_distance_cnt) { - size = numa_distance_cnt * sizeof(numa_distance[0]); + if (numa_distance_cnt) memblock_x86_free_range(__pa(numa_distance), __pa(numa_distance) + size); - numa_distance_cnt = 0; - } + numa_distance_cnt = 0; numa_distance = NULL; } diff --git a/arch/x86/mm/numa_emulation.c b/arch/x86/mm/numa_emulation.c index 607a2e8bc87a..0afa25d967ba 100644 --- a/arch/x86/mm/numa_emulation.c +++ b/arch/x86/mm/numa_emulation.c @@ -300,6 +300,7 @@ void __init numa_emulation(struct numa_meminfo *numa_meminfo, int numa_dist_cnt) static struct numa_meminfo pi __initdata; const u64 max_addr = max_pfn << PAGE_SHIFT; u8 *phys_dist = NULL; + size_t phys_size = numa_dist_cnt * numa_dist_cnt * sizeof(phys_dist[0]); int i, j, ret; if (!emu_cmdline) @@ -336,21 +337,18 @@ void __init numa_emulation(struct numa_meminfo *numa_meminfo, int numa_dist_cnt) goto no_emu; } - /* - * Copy the original distance table. It's temporary so no need to - * reserve it. - */ + /* copy the physical distance table */ if (numa_dist_cnt) { - size_t size = numa_dist_cnt * sizeof(phys_dist[0]); u64 phys; phys = memblock_find_in_range(0, (u64)max_pfn_mapped << PAGE_SHIFT, - size, PAGE_SIZE); + phys_size, PAGE_SIZE); if (phys == MEMBLOCK_ERROR) { pr_warning("NUMA: Warning: can't allocate copy of distance table, disabling emulation\n"); goto no_emu; } + memblock_x86_reserve_range(phys, phys + phys_size, "TMP NUMA DIST"); phys_dist = __va(phys); for (i = 0; i < numa_dist_cnt; i++) @@ -398,6 +396,10 @@ void __init numa_emulation(struct numa_meminfo *numa_meminfo, int numa_dist_cnt) numa_set_distance(i, j, dist); } } + + /* free the copied physical distance table */ + if (phys_dist) + memblock_x86_free_range(__pa(phys_dist), __pa(phys_dist) + phys_size); return; no_emu: From eb8c1e2c830fc25c93bc94e215ed387fe142a98d Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 2 Mar 2011 11:32:47 +0100 Subject: [PATCH 510/706] x86-64, NUMA: Better explain numa_distance handling Handling of out-of-bounds distances and allocation failure can use better documentation. Add it. 
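The size miscalculation fixed in the previous patch is easiest to see with concrete numbers: numa_distance is a flat cnt x cnt matrix of u8, indexed as from * cnt + to, so an 8-node table occupies 8 * 8 = 64 bytes while the old expression accounted for only 8. A sketch of the layout the kernel uses:

	static u8 *numa_distance;	/* flat cnt x cnt matrix */
	static int numa_distance_cnt;

	static u8 distance_sketch(int from, int to)
	{
		return numa_distance[from * numa_distance_cnt + to];
	}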
Signed-off-by: Tejun Heo Cc: Yinghai Lu Acked-by: David Rientjes --- arch/x86/mm/numa_64.c | 11 ++++++++++- arch/x86/mm/numa_emulation.c | 6 +++++- 2 files changed, 15 insertions(+), 2 deletions(-) diff --git a/arch/x86/mm/numa_64.c b/arch/x86/mm/numa_64.c index 541746fdeb4b..74064e8ae79f 100644 --- a/arch/x86/mm/numa_64.c +++ b/arch/x86/mm/numa_64.c @@ -392,11 +392,12 @@ void __init numa_reset_distance(void) { size_t size = numa_distance_cnt * numa_distance_cnt * sizeof(numa_distance[0]); + /* numa_distance could be 1LU marking allocation failure, test cnt */ if (numa_distance_cnt) memblock_x86_free_range(__pa(numa_distance), __pa(numa_distance) + size); numa_distance_cnt = 0; - numa_distance = NULL; + numa_distance = NULL; /* enable table creation */ } static int __init numa_alloc_distance(void) @@ -447,6 +448,14 @@ static int __init numa_alloc_distance(void) * Set the distance from node @from to @to to @distance. If distance table * doesn't exist, one which is large enough to accomodate all the currently * known nodes will be created. + * + * If such table cannot be allocated, a warning is printed and further + * calls are ignored until the distance table is reset with + * numa_reset_distance(). + * + * If @from or @to is higher than the highest known node at the time of + * table creation or @distance doesn't make sense, the call is ignored. + * This is to allow simplification of specific NUMA config implementations. */ void __init numa_set_distance(int from, int to, int distance) { diff --git a/arch/x86/mm/numa_emulation.c b/arch/x86/mm/numa_emulation.c index 0afa25d967ba..aeecea93820f 100644 --- a/arch/x86/mm/numa_emulation.c +++ b/arch/x86/mm/numa_emulation.c @@ -379,7 +379,11 @@ void __init numa_emulation(struct numa_meminfo *numa_meminfo, int numa_dist_cnt) if (emu_nid_to_phys[i] == NUMA_NO_NODE) emu_nid_to_phys[i] = 0; - /* transform distance table */ + /* + * Transform distance table. numa_set_distance() ignores all + * out-of-bound distances. Just call it for every possible node + * combination. + */ numa_reset_distance(); for (i = 0; i < MAX_NUMNODES; i++) { for (j = 0; j < MAX_NUMNODES; j++) { From 5cd10e7946d28cfc42442fee2e6c757e244d756e Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 2 Mar 2011 16:58:30 +0100 Subject: [PATCH 511/706] hrtimer: Update base[CLOCK_BOOTTIME].offset correctly We calculate the current time of each clock base by adding an offset to clock_monotonic. The offset for the clock bases is set in retrigger_next_event() which is called when we switch a cpu to highres mode or when the clock was set. Add the missing update for clock boottime. Signed-off-by: Thomas Gleixner Cc: John Stultz --- kernel/hrtimer.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c index 4c53af18734b..f679a2d08723 100644 --- a/kernel/hrtimer.c +++ b/kernel/hrtimer.c @@ -635,6 +635,8 @@ static void retrigger_next_event(void *arg) raw_spin_lock(&base->lock); base->clock_base[HRTIMER_BASE_REALTIME].offset = timespec_to_ktime(realtime_offset); + base->clock_base[HRTIMER_BASE_BOOTTIME].offset = + timespec_to_ktime(sleep); hrtimer_force_reprogram(base, 0); raw_spin_unlock(&base->lock); From d04c579f971bf7d995db1ef7a7161c0143068859 Mon Sep 17 00:00:00 2001 From: Jan Beulich Date: Thu, 3 Mar 2011 10:55:29 +0000 Subject: [PATCH 512/706] x86: Work around old gas bug Add extra parentheses around a couple of definitions introduced by "x86: Cleanup vector usage" and used in assembly macro arguments, and remove spaces. 
Without that old (2.16.1) gas would see more macro arguments than were actually specified. Reported-and-tested-by: Andrew Morton Signed-off-by: Jan Beulich Cc: Shaohua Li LKML-Reference: <4D6F81B10200007800034B0B@vpn.id2.novell.com> Signed-off-by: Ingo Molnar --- arch/x86/include/asm/irq_vectors.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/arch/x86/include/asm/irq_vectors.h b/arch/x86/include/asm/irq_vectors.h index 4980f48bbbb7..6e976ee3b3ef 100644 --- a/arch/x86/include/asm/irq_vectors.h +++ b/arch/x86/include/asm/irq_vectors.h @@ -126,14 +126,14 @@ /* up to 32 vectors used for spreading out TLB flushes: */ #if NR_CPUS <= 32 -# define NUM_INVALIDATE_TLB_VECTORS NR_CPUS +# define NUM_INVALIDATE_TLB_VECTORS (NR_CPUS) #else -# define NUM_INVALIDATE_TLB_VECTORS 32 +# define NUM_INVALIDATE_TLB_VECTORS (32) #endif -#define INVALIDATE_TLB_VECTOR_END 0xee +#define INVALIDATE_TLB_VECTOR_END (0xee) #define INVALIDATE_TLB_VECTOR_START \ - (INVALIDATE_TLB_VECTOR_END - NUM_INVALIDATE_TLB_VECTORS + 1) + (INVALIDATE_TLB_VECTOR_END-NUM_INVALIDATE_TLB_VECTORS+1) #define NR_VECTORS 256 From 6eaa412f2753d98566b777836a98c6e7f672a3bb Mon Sep 17 00:00:00 2001 From: Konrad Rzeszutek Wilk Date: Tue, 18 Jan 2011 20:09:41 -0500 Subject: [PATCH 513/706] xen: Mark all initial reserved pages for the balloon as INVALID_P2M_ENTRY. With this patch, we diligently set regions that will be used by the balloon driver to be INVALID_P2M_ENTRY and under the ownership of the balloon driver. We are OK using the __set_phys_to_machine as we do not expect to be allocating any P2M middle or entries pages. The set_phys_to_machine has the side-effect of potentially allocating new pages and we do not want that at this stage. We can do this because xen_build_mfn_list_list will have already allocated all such pages up to xen_max_p2m_pfn. We also move the check for auto translated physmap down the stack so it is present in __set_phys_to_machine. 
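The resulting call structure, in miniature (a sketch paraphrasing the hunks below, not a verbatim quote):

	/* __set_phys_to_machine(): writes the p2m entry only if the backing
	 * p2m pages already exist; it never allocates, which is what makes
	 * it safe for the early setup and balloon paths converted here. */

	bool set_phys_to_machine(unsigned long pfn, unsigned long mfn)
	{
		if (unlikely(!__set_phys_to_machine(pfn, mfn))) {
			if (!alloc_p2m(pfn))	/* may allocate p2m pages */
				return false;
			/* retry now that the middle/entry pages exist */
			return __set_phys_to_machine(pfn, mfn);
		}
		return true;
	}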
[v2: Rebased with mmu->p2m code split] Reviewed-by: Ian Campbell Signed-off-by: Konrad Rzeszutek Wilk --- arch/x86/include/asm/xen/page.h | 1 + arch/x86/xen/mmu.c | 2 +- arch/x86/xen/p2m.c | 9 ++++----- arch/x86/xen/setup.c | 7 ++++++- drivers/xen/balloon.c | 2 +- 5 files changed, 13 insertions(+), 8 deletions(-) diff --git a/arch/x86/include/asm/xen/page.h b/arch/x86/include/asm/xen/page.h index f25bdf238a33..8ea977277c55 100644 --- a/arch/x86/include/asm/xen/page.h +++ b/arch/x86/include/asm/xen/page.h @@ -41,6 +41,7 @@ extern unsigned int machine_to_phys_order; extern unsigned long get_phys_to_machine(unsigned long pfn); extern bool set_phys_to_machine(unsigned long pfn, unsigned long mfn); +extern bool __set_phys_to_machine(unsigned long pfn, unsigned long mfn); extern int m2p_add_override(unsigned long mfn, struct page *page); extern int m2p_remove_override(struct page *page); diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c index 5e92b61ad574..0180ae88307b 100644 --- a/arch/x86/xen/mmu.c +++ b/arch/x86/xen/mmu.c @@ -2074,7 +2074,7 @@ static void xen_zap_pfn_range(unsigned long vaddr, unsigned int order, in_frames[i] = virt_to_mfn(vaddr); MULTI_update_va_mapping(mcs.mc, vaddr, VOID_PTE, 0); - set_phys_to_machine(virt_to_pfn(vaddr), INVALID_P2M_ENTRY); + __set_phys_to_machine(virt_to_pfn(vaddr), INVALID_P2M_ENTRY); if (out_frames) out_frames[i] = virt_to_pfn(vaddr); diff --git a/arch/x86/xen/p2m.c b/arch/x86/xen/p2m.c index ddc81a06edb9..df4e36775339 100644 --- a/arch/x86/xen/p2m.c +++ b/arch/x86/xen/p2m.c @@ -365,6 +365,10 @@ bool __set_phys_to_machine(unsigned long pfn, unsigned long mfn) { unsigned topidx, mididx, idx; + if (unlikely(xen_feature(XENFEAT_auto_translated_physmap))) { + BUG_ON(pfn != mfn && mfn != INVALID_P2M_ENTRY); + return true; + } if (unlikely(pfn >= MAX_P2M_PFN)) { BUG_ON(mfn != INVALID_P2M_ENTRY); return true; @@ -384,11 +388,6 @@ bool __set_phys_to_machine(unsigned long pfn, unsigned long mfn) bool set_phys_to_machine(unsigned long pfn, unsigned long mfn) { - if (unlikely(xen_feature(XENFEAT_auto_translated_physmap))) { - BUG_ON(pfn != mfn && mfn != INVALID_P2M_ENTRY); - return true; - } - if (unlikely(!__set_phys_to_machine(pfn, mfn))) { if (!alloc_p2m(pfn)) return false; diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c index b5a7f928234b..7201800e55a4 100644 --- a/arch/x86/xen/setup.c +++ b/arch/x86/xen/setup.c @@ -52,6 +52,8 @@ phys_addr_t xen_extra_mem_start, xen_extra_mem_size; static __init void xen_add_extra_mem(unsigned long pages) { + unsigned long pfn; + u64 size = (u64)pages * PAGE_SIZE; u64 extra_start = xen_extra_mem_start + xen_extra_mem_size; @@ -66,6 +68,9 @@ static __init void xen_add_extra_mem(unsigned long pages) xen_extra_mem_size += size; xen_max_p2m_pfn = PFN_DOWN(extra_start + size); + + for (pfn = PFN_DOWN(extra_start); pfn <= xen_max_p2m_pfn; pfn++) + __set_phys_to_machine(pfn, INVALID_P2M_ENTRY); } static unsigned long __init xen_release_chunk(phys_addr_t start_addr, @@ -104,7 +109,7 @@ static unsigned long __init xen_release_chunk(phys_addr_t start_addr, WARN(ret != 1, "Failed to release memory %lx-%lx err=%d\n", start, end, ret); if (ret == 1) { - set_phys_to_machine(pfn, INVALID_P2M_ENTRY); + __set_phys_to_machine(pfn, INVALID_P2M_ENTRY); len++; } } diff --git a/drivers/xen/balloon.c b/drivers/xen/balloon.c index 43f9f02c7db0..b1661cd416b0 100644 --- a/drivers/xen/balloon.c +++ b/drivers/xen/balloon.c @@ -296,7 +296,7 @@ static int decrease_reservation(unsigned long nr_pages) /* No more mappings: invalidate P2M and add 
to balloon. */ for (i = 0; i < nr_pages; i++) { pfn = mfn_to_pfn(frame_list[i]); - set_phys_to_machine(pfn, INVALID_P2M_ENTRY); + __set_phys_to_machine(pfn, INVALID_P2M_ENTRY); balloon_append(pfn_to_page(pfn)); } From 3f2a230caf21a1f7ac75f9e4892d0e5af9ccee88 Mon Sep 17 00:00:00 2001 From: Ian Campbell Date: Tue, 11 Jan 2011 17:20:13 +0000 Subject: [PATCH 514/706] xen: handled remapped IRQs when enabling a pcifront PCI device. This happens to not be an issue currently because we take pains to try to ensure that the GSI-IRQ mapping is 1-1 in a PV guest and that regular event channels do not clash. However a subsequent patch is going to break this 1-1 mapping. Signed-off-by: Ian Campbell Signed-off-by: Konrad Rzeszutek Wilk Cc: Stefano Stabellini Cc: Jeremy Fitzhardinge --- arch/x86/pci/xen.c | 28 +++++++++++++++++----------- 1 file changed, 17 insertions(+), 11 deletions(-) diff --git a/arch/x86/pci/xen.c b/arch/x86/pci/xen.c index 25cd4a07d09f..2a12f3dbdd02 100644 --- a/arch/x86/pci/xen.c +++ b/arch/x86/pci/xen.c @@ -226,21 +226,27 @@ static int xen_pcifront_enable_irq(struct pci_dev *dev) { int rc; int share = 1; + u8 gsi; - dev_info(&dev->dev, "Xen PCI enabling IRQ: %d\n", dev->irq); - - if (dev->irq < 0) - return -EINVAL; - - if (dev->irq < NR_IRQS_LEGACY) - share = 0; - - rc = xen_allocate_pirq(dev->irq, share, "pcifront"); + rc = pci_read_config_byte(dev, PCI_INTERRUPT_LINE, &gsi); if (rc < 0) { - dev_warn(&dev->dev, "Xen PCI IRQ: %d, failed to register:%d\n", - dev->irq, rc); + dev_warn(&dev->dev, "Xen PCI: failed to read interrupt line: %d\n", + rc); return rc; } + + if (gsi < NR_IRQS_LEGACY) + share = 0; + + rc = xen_allocate_pirq(gsi, share, "pcifront"); + if (rc < 0) { + dev_warn(&dev->dev, "Xen PCI: failed to register GSI%d: %d\n", + gsi, rc); + return rc; + } + + dev->irq = rc; + dev_info(&dev->dev, "Xen PCI mapped GSI%d to IRQ%d\n", gsi, dev->irq); return 0; } From cbf6aa89fc52c5253ee141d53eeb73147eb37ac0 Mon Sep 17 00:00:00 2001 From: Ian Campbell Date: Tue, 11 Jan 2011 17:20:14 +0000 Subject: [PATCH 515/706] xen:events: move find_unbound_irq inside CONFIG_PCI_MSI The only caller is xen_allocate_pirq_msi which is also under this ifdef so this fixes: drivers/xen/events.c:377: warning: 'find_unbound_pirq' defined but not used when CONFIG_PCI_MSI=n Signed-off-by: Ian Campbell Signed-off-by: Konrad Rzeszutek Wilk Cc: Stefano Stabellini Cc: Jeremy Fitzhardinge --- drivers/xen/events.c | 34 +++++++++++++++++----------------- 1 file changed, 17 insertions(+), 17 deletions(-) diff --git a/drivers/xen/events.c b/drivers/xen/events.c index 74681478100a..1ae775742325 100644 --- a/drivers/xen/events.c +++ b/drivers/xen/events.c @@ -387,23 +387,6 @@ static int get_nr_hw_irqs(void) return ret; } -static int find_unbound_pirq(int type) -{ - int rc, i; - struct physdev_get_free_pirq op_get_free_pirq; - op_get_free_pirq.type = type; - - rc = HYPERVISOR_physdev_op(PHYSDEVOP_get_free_pirq, &op_get_free_pirq); - if (!rc) - return op_get_free_pirq.pirq; - - for (i = 0; i < nr_irqs; i++) { - if (pirq_to_irq[i] < 0) - return i; - } - return -1; -} - static int find_unbound_irq(void) { struct irq_data *data; @@ -677,6 +660,23 @@ out: #include #include "../pci/msi.h" +static int find_unbound_pirq(int type) +{ + int rc, i; + struct physdev_get_free_pirq op_get_free_pirq; + op_get_free_pirq.type = type; + + rc = HYPERVISOR_physdev_op(PHYSDEVOP_get_free_pirq, &op_get_free_pirq); + if (!rc) + return op_get_free_pirq.pirq; + + for (i = 0; i < nr_irqs; i++) { + if (pirq_to_irq[i] < 0) + return i; + } + 
return -1; +} + void xen_allocate_pirq_msi(char *name, int *irq, int *pirq, int alloc) { spin_lock(&irq_mapping_update_lock); From c9df1ce585e3bb5a2f101c1d87381b285a9f962f Mon Sep 17 00:00:00 2001 From: Ian Campbell Date: Tue, 11 Jan 2011 17:20:15 +0000 Subject: [PATCH 516/706] xen: events: add xen_allocate_irq_{dynamic, gsi} and xen_free_irq This is neater than open-coded calls to irq_alloc_desc_at and irq_free_desc. No intended behavioural change. Note that we previously were not checking the return value of irq_alloc_desc_at which would be failing for GSI Signed-off-by: Konrad Rzeszutek Wilk Cc: Stefano Stabellini Cc: Jeremy Fitzhardinge --- drivers/xen/events.c | 53 +++++++++++++++++++++++++++++--------------- 1 file changed, 35 insertions(+), 18 deletions(-) diff --git a/drivers/xen/events.c b/drivers/xen/events.c index 1ae775742325..81a53eb6cd1d 100644 --- a/drivers/xen/events.c +++ b/drivers/xen/events.c @@ -387,7 +387,7 @@ static int get_nr_hw_irqs(void) return ret; } -static int find_unbound_irq(void) +static int xen_allocate_irq_dynamic(void) { struct irq_data *data; int irq, res; @@ -436,6 +436,30 @@ static bool identity_mapped_irq(unsigned irq) return irq < get_nr_hw_irqs(); } +static int xen_allocate_irq_gsi(unsigned gsi) +{ + int irq; + + if (!identity_mapped_irq(gsi) && + (xen_initial_domain() || !xen_pv_domain())) + return xen_allocate_irq_dynamic(); + + /* Legacy IRQ descriptors are already allocated by the arch. */ + if (gsi < NR_IRQS_LEGACY) + return gsi; + + irq = irq_alloc_desc_at(gsi, -1); + if (irq < 0) + panic("Unable to allocate to IRQ%d (%d)\n", gsi, irq); + + return irq; +} + +static void xen_free_irq(unsigned irq) +{ + irq_free_desc(irq); +} + static void pirq_unmask_notify(int irq) { struct physdev_eoi eoi = { .irq = pirq_from_irq(irq) }; @@ -621,14 +645,7 @@ int xen_map_pirq_gsi(unsigned pirq, unsigned gsi, int shareable, char *name) goto out; /* XXX need refcount? */ } - /* If we are a PV guest, we don't have GSIs (no ACPI passed). Therefore - * we are using the !xen_initial_domain() to drop in the function.*/ - if (identity_mapped_irq(gsi) || (!xen_initial_domain() && - xen_pv_domain())) { - irq = gsi; - irq_alloc_desc_at(irq, -1); - } else - irq = find_unbound_irq(); + irq = xen_allocate_irq_gsi(gsi); set_irq_chip_and_handler_name(irq, &xen_pirq_chip, handle_level_irq, name); @@ -641,7 +658,7 @@ int xen_map_pirq_gsi(unsigned pirq, unsigned gsi, int shareable, char *name) * this in the priv domain. 
*/ if (xen_initial_domain() && HYPERVISOR_physdev_op(PHYSDEVOP_alloc_irq_vector, &irq_op)) { - irq_free_desc(irq); + xen_free_irq(irq); irq = -ENOSPC; goto out; } @@ -682,7 +699,7 @@ void xen_allocate_pirq_msi(char *name, int *irq, int *pirq, int alloc) spin_lock(&irq_mapping_update_lock); if (alloc & XEN_ALLOC_IRQ) { - *irq = find_unbound_irq(); + *irq = xen_allocate_irq_dynamic(); if (*irq == -1) goto out; } @@ -732,7 +749,7 @@ int xen_create_msi_irq(struct pci_dev *dev, struct msi_desc *msidesc, int type) spin_lock(&irq_mapping_update_lock); - irq = find_unbound_irq(); + irq = xen_allocate_irq_dynamic(); if (irq == -1) goto out; @@ -741,7 +758,7 @@ int xen_create_msi_irq(struct pci_dev *dev, struct msi_desc *msidesc, int type) if (rc) { printk(KERN_WARNING "xen map irq failed %d\n", rc); - irq_free_desc(irq); + xen_free_irq(irq); irq = -1; goto out; @@ -783,7 +800,7 @@ int xen_destroy_irq(int irq) } irq_info[irq] = mk_unbound_info(); - irq_free_desc(irq); + xen_free_irq(irq); out: spin_unlock(&irq_mapping_update_lock); @@ -814,7 +831,7 @@ int bind_evtchn_to_irq(unsigned int evtchn) irq = evtchn_to_irq[evtchn]; if (irq == -1) { - irq = find_unbound_irq(); + irq = xen_allocate_irq_dynamic(); set_irq_chip_and_handler_name(irq, &xen_dynamic_chip, handle_fasteoi_irq, "event"); @@ -839,7 +856,7 @@ static int bind_ipi_to_irq(unsigned int ipi, unsigned int cpu) irq = per_cpu(ipi_to_irq, cpu)[ipi]; if (irq == -1) { - irq = find_unbound_irq(); + irq = xen_allocate_irq_dynamic(); if (irq < 0) goto out; @@ -875,7 +892,7 @@ int bind_virq_to_irq(unsigned int virq, unsigned int cpu) irq = per_cpu(virq_to_irq, cpu)[virq]; if (irq == -1) { - irq = find_unbound_irq(); + irq = xen_allocate_irq_dynamic(); set_irq_chip_and_handler_name(irq, &xen_percpu_chip, handle_percpu_irq, "virq"); @@ -934,7 +951,7 @@ static void unbind_from_irq(unsigned int irq) if (irq_info[irq].type != IRQT_UNBOUND) { irq_info[irq] = mk_unbound_info(); - irq_free_desc(irq); + xen_free_irq(irq); } spin_unlock(&irq_mapping_update_lock); From 89911501f3aae44a43984793341a3bf1f4c583c2 Mon Sep 17 00:00:00 2001 From: Ian Campbell Date: Thu, 3 Mar 2011 11:57:44 -0500 Subject: [PATCH 517/706] xen: events: allocate GSIs and dynamic IRQs from separate IRQ ranges. There are three cases which we need to care about, PV guest, PV domain 0 and HVM guest. The PV guest case is simple since it has no access to ACPI or real APICs and therefore has no GSIs therefore we simply dynamically allocate all IRQs. The potentially interesting case here is PIRQ type event channels associated with passed through PCI devices. However even in this case the guest has no direct interaction with the physical GSI since that happens in the PCI backend. The PV domain 0 and HVM guest cases are actually the same. In domain 0 case the kernel sees the host ACPI and GSIs (although it only sees the APIC indirectly via the hypervisor) and in the HVM guest case it sees the virtualised ACPI and emulated APICs. In these cases we start allocating dynamic IRQs at nr_irqs_gsi so that they cannot clash with any GSI. Currently xen_allocate_irq_dynamic starts at nr_irqs and works backwards looking for a free IRQ in order to (try and) avoid clashing with GSIs used in domain 0 and in HVM guests. This change avoids that although we retain the behaviour of allowing dynamic IRQs to encroach on the GSI range if no suitable IRQs are available since a future IRQ clash is deemed preferable to failure right now. 
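As a worked example (numbers hypothetical; NR_IRQS_LEGACY is 16 on x86): with nr_irqs_gsi = 40 on dom0, xen_allocate_irq_dynamic() first tries irq_alloc_desc_from(40, -1); on -ENOMEM it retries from max(NR_IRQS_LEGACY, 40 - 16) = 24, then from 16, and only if even the range just above the legacy IRQs is exhausted does it give up and panic.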
Signed-off-by: Ian Campbell Signed-off-by: Konrad Rzeszutek Wilk Cc: Stefano Stabellini Cc: Jeremy Fitzhardinge --- drivers/xen/events.c | 81 ++++++++++++++++---------------------------- 1 file changed, 29 insertions(+), 52 deletions(-) diff --git a/drivers/xen/events.c b/drivers/xen/events.c index 81a53eb6cd1d..06f2e61de691 100644 --- a/drivers/xen/events.c +++ b/drivers/xen/events.c @@ -376,72 +376,49 @@ static void unmask_evtchn(int port) put_cpu(); } -static int get_nr_hw_irqs(void) -{ - int ret = 1; - -#ifdef CONFIG_X86_IO_APIC - ret = get_nr_irqs_gsi(); -#endif - - return ret; -} - static int xen_allocate_irq_dynamic(void) { - struct irq_data *data; - int irq, res; - int bottom = get_nr_hw_irqs(); - int top = nr_irqs-1; + int first = 0; + int irq; - if (bottom == nr_irqs) - goto no_irqs; - - /* This loop starts from the top of IRQ space and goes down. - * We need this b/c if we have a PCI device in a Xen PV guest - * we do not have an IO-APIC (though the backend might have them) - * mapped in. To not have a collision of physical IRQs with the Xen - * event channels start at the top of the IRQ space for virtual IRQs. +#ifdef CONFIG_X86_IO_APIC + /* + * For an HVM guest or domain 0 which see "real" (emulated or + * actual repectively) GSIs we allocate dynamic IRQs + * e.g. those corresponding to event channels or MSIs + * etc. from the range above those "real" GSIs to avoid + * collisions. */ - for (irq = top; irq > bottom; irq--) { - data = irq_get_irq_data(irq); - /* only 15->0 have init'd desc; handle irq > 16 */ - if (!data) - break; - if (data->chip == &no_irq_chip) - break; - if (data->chip != &xen_dynamic_chip) - continue; - if (irq_info[irq].type == IRQT_UNBOUND) - return irq; + if (xen_initial_domain() || xen_hvm_domain()) + first = get_nr_irqs_gsi(); +#endif + +retry: + irq = irq_alloc_desc_from(first, -1); + + if (irq == -ENOMEM && first > NR_IRQS_LEGACY) { + printk(KERN_ERR "Out of dynamic IRQ space and eating into GSI space. You should increase nr_irqs\n"); + first = max(NR_IRQS_LEGACY, first - NR_IRQS_LEGACY); + goto retry; } - if (irq == bottom) - goto no_irqs; - - res = irq_alloc_desc_at(irq, -1); - - if (WARN_ON(res != irq)) - return -1; + if (irq < 0) + panic("No available IRQ to bind to: increase nr_irqs!\n"); return irq; - -no_irqs: - panic("No available IRQ to bind to: increase nr_irqs!\n"); -} - -static bool identity_mapped_irq(unsigned irq) -{ - /* identity map all the hardware irqs */ - return irq < get_nr_hw_irqs(); } static int xen_allocate_irq_gsi(unsigned gsi) { int irq; - if (!identity_mapped_irq(gsi) && - (xen_initial_domain() || !xen_pv_domain())) + /* + * A PV guest has no concept of a GSI (since it has no ACPI + * nor access to/knowledge of the physical APICs). Therefore + * all IRQs are dynamically allocated from the entire IRQ + * space. + */ + if (xen_pv_domain() && !xen_initial_domain()) return xen_allocate_irq_dynamic(); /* Legacy IRQ descriptors are already allocated by the arch. */ From 7214610475b2847a81478d96e4d3ba0bbe49598c Mon Sep 17 00:00:00 2001 From: Ian Campbell Date: Thu, 3 Feb 2011 09:49:35 +0000 Subject: [PATCH 518/706] xen: events: do not free legacy IRQs c514d00c8057 "xen: events: add xen_allocate_irq_{dynamic, gsi} and xen_free_irq" correctly avoids reallocating legacy IRQs (which are managed by the arch core) but erroneously did not prevent them being freed. 
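With the fix, the allocate and free sides pair up (abridged from the two hunks):

	xen_allocate_irq_gsi():  if (gsi < NR_IRQS_LEGACY) return gsi;  /* arch-owned */
	xen_free_irq():          if (irq < NR_IRQS_LEGACY) return;      /* arch-owned */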
Signed-off-by: Ian Campbell Signed-off-by: Konrad Rzeszutek Wilk --- drivers/xen/events.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/drivers/xen/events.c b/drivers/xen/events.c index 06f2e61de691..accb37ad0944 100644 --- a/drivers/xen/events.c +++ b/drivers/xen/events.c @@ -434,6 +434,10 @@ static int xen_allocate_irq_gsi(unsigned gsi) static void xen_free_irq(unsigned irq) { + /* Legacy IRQ descriptors are managed by the arch. */ + if (irq < NR_IRQS_LEGACY) + return; + irq_free_desc(irq); } From 149f256f8ca690c28dd8aa9fb8bcdaf2e93b1e1c Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sat, 5 Feb 2011 20:08:52 +0000 Subject: [PATCH 519/706] xen: Remove stale irq_chip.end irq_chip.end got obsolete with the removal of __do_IRQ() Signed-off-by: Thomas Gleixner Acked-by: Ian Campbell Signed-off-by: Konrad Rzeszutek Wilk --- drivers/xen/events.c | 18 ------------------ 1 file changed, 18 deletions(-) diff --git a/drivers/xen/events.c b/drivers/xen/events.c index accb37ad0944..c8826b5142c4 100644 --- a/drivers/xen/events.c +++ b/drivers/xen/events.c @@ -555,23 +555,6 @@ static void ack_pirq(unsigned int irq) } } -static void end_pirq(unsigned int irq) -{ - int evtchn = evtchn_from_irq(irq); - struct irq_desc *desc = irq_to_desc(irq); - - if (WARN_ON(!desc)) - return; - - if ((desc->status & (IRQ_DISABLED|IRQ_PENDING)) == - (IRQ_DISABLED|IRQ_PENDING)) { - shutdown_pirq(irq); - } else if (VALID_EVTCHN(evtchn)) { - unmask_evtchn(evtchn); - pirq_unmask_notify(irq); - } -} - static int find_irq_by_gsi(unsigned gsi) { int irq; @@ -1508,7 +1491,6 @@ static struct irq_chip xen_pirq_chip __read_mostly = { .mask = disable_pirq, .ack = ack_pirq, - .end = end_pirq, .set_affinity = set_affinity_irq, From c9e265e030537167c94cbed190826f02e3887f4d Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sat, 5 Feb 2011 20:08:54 +0000 Subject: [PATCH 520/706] xen: Switch to new irq_chip functions Convert Xen to the new irq_chip functions. Brings us closer to enable CONFIG_GENERIC_HARDIRQS_NO_DEPRECATED Signed-off-by: Thomas Gleixner Acked-by: Ian Campbell Signed-off-by: Konrad Rzeszutek Wilk --- drivers/xen/events.c | 95 ++++++++++++++++++++++++-------------------- 1 file changed, 51 insertions(+), 44 deletions(-) diff --git a/drivers/xen/events.c b/drivers/xen/events.c index c8826b5142c4..cf1712fb1c46 100644 --- a/drivers/xen/events.c +++ b/drivers/xen/events.c @@ -277,7 +277,7 @@ static void bind_evtchn_to_cpu(unsigned int chn, unsigned int cpu) BUG_ON(irq == -1); #ifdef CONFIG_SMP - cpumask_copy(irq_to_desc(irq)->affinity, cpumask_of(cpu)); + cpumask_copy(irq_to_desc(irq)->irq_data.affinity, cpumask_of(cpu)); #endif clear_bit(chn, cpu_evtchn_mask(cpu_from_irq(irq))); @@ -294,7 +294,7 @@ static void init_evtchn_cpu_bindings(void) /* By default all event channels notify CPU#0. 
*/ for_each_irq_desc(i, desc) { - cpumask_copy(desc->affinity, cpumask_of(0)); + cpumask_copy(desc->irq_data.affinity, cpumask_of(0)); } #endif @@ -474,7 +474,7 @@ static bool probing_irq(int irq) return desc && desc->action == NULL; } -static unsigned int startup_pirq(unsigned int irq) +static unsigned int __startup_pirq(unsigned int irq) { struct evtchn_bind_pirq bind_pirq; struct irq_info *info = info_for_irq(irq); @@ -512,9 +512,15 @@ out: return 0; } -static void shutdown_pirq(unsigned int irq) +static unsigned int startup_pirq(struct irq_data *data) +{ + return __startup_pirq(data->irq); +} + +static void shutdown_pirq(struct irq_data *data) { struct evtchn_close close; + unsigned int irq = data->irq; struct irq_info *info = info_for_irq(irq); int evtchn = evtchn_from_irq(irq); @@ -534,20 +540,20 @@ static void shutdown_pirq(unsigned int irq) info->evtchn = 0; } -static void enable_pirq(unsigned int irq) +static void enable_pirq(struct irq_data *data) { - startup_pirq(irq); + startup_pirq(data); } -static void disable_pirq(unsigned int irq) +static void disable_pirq(struct irq_data *data) { } -static void ack_pirq(unsigned int irq) +static void ack_pirq(struct irq_data *data) { - int evtchn = evtchn_from_irq(irq); + int evtchn = evtchn_from_irq(data->irq); - move_native_irq(irq); + irq_move_irq(data); if (VALID_EVTCHN(evtchn)) { mask_evtchn(evtchn); @@ -1215,11 +1221,12 @@ static int rebind_irq_to_cpu(unsigned irq, unsigned tcpu) return 0; } -static int set_affinity_irq(unsigned irq, const struct cpumask *dest) +static int set_affinity_irq(struct irq_data *data, const struct cpumask *dest, + bool force) { unsigned tcpu = cpumask_first(dest); - return rebind_irq_to_cpu(irq, tcpu); + return rebind_irq_to_cpu(data->irq, tcpu); } int resend_irq_on_evtchn(unsigned int irq) @@ -1238,35 +1245,35 @@ int resend_irq_on_evtchn(unsigned int irq) return 1; } -static void enable_dynirq(unsigned int irq) +static void enable_dynirq(struct irq_data *data) { - int evtchn = evtchn_from_irq(irq); + int evtchn = evtchn_from_irq(data->irq); if (VALID_EVTCHN(evtchn)) unmask_evtchn(evtchn); } -static void disable_dynirq(unsigned int irq) +static void disable_dynirq(struct irq_data *data) { - int evtchn = evtchn_from_irq(irq); + int evtchn = evtchn_from_irq(data->irq); if (VALID_EVTCHN(evtchn)) mask_evtchn(evtchn); } -static void ack_dynirq(unsigned int irq) +static void ack_dynirq(struct irq_data *data) { - int evtchn = evtchn_from_irq(irq); + int evtchn = evtchn_from_irq(data->irq); - move_masked_irq(irq); + move_masked_irq(data->irq); if (VALID_EVTCHN(evtchn)) unmask_evtchn(evtchn); } -static int retrigger_dynirq(unsigned int irq) +static int retrigger_dynirq(struct irq_data *data) { - int evtchn = evtchn_from_irq(irq); + int evtchn = evtchn_from_irq(data->irq); struct shared_info *sh = HYPERVISOR_shared_info; int ret = 0; @@ -1315,7 +1322,7 @@ static void restore_cpu_pirqs(void) printk(KERN_DEBUG "xen: --> irq=%d, pirq=%d\n", irq, map_irq.pirq); - startup_pirq(irq); + __startup_pirq(irq); } } @@ -1467,44 +1474,44 @@ void xen_irq_resume(void) } static struct irq_chip xen_dynamic_chip __read_mostly = { - .name = "xen-dyn", + .name = "xen-dyn", - .disable = disable_dynirq, - .mask = disable_dynirq, - .unmask = enable_dynirq, + .irq_disable = disable_dynirq, + .irq_mask = disable_dynirq, + .irq_unmask = enable_dynirq, - .eoi = ack_dynirq, - .set_affinity = set_affinity_irq, - .retrigger = retrigger_dynirq, + .irq_eoi = ack_dynirq, + .irq_set_affinity = set_affinity_irq, + .irq_retrigger = retrigger_dynirq, 
}; static struct irq_chip xen_pirq_chip __read_mostly = { - .name = "xen-pirq", + .name = "xen-pirq", - .startup = startup_pirq, - .shutdown = shutdown_pirq, + .irq_startup = startup_pirq, + .irq_shutdown = shutdown_pirq, - .enable = enable_pirq, - .unmask = enable_pirq, + .irq_enable = enable_pirq, + .irq_unmask = enable_pirq, - .disable = disable_pirq, - .mask = disable_pirq, + .irq_disable = disable_pirq, + .irq_mask = disable_pirq, - .ack = ack_pirq, + .irq_ack = ack_pirq, - .set_affinity = set_affinity_irq, + .irq_set_affinity = set_affinity_irq, - .retrigger = retrigger_dynirq, + .irq_retrigger = retrigger_dynirq, }; static struct irq_chip xen_percpu_chip __read_mostly = { - .name = "xen-percpu", + .name = "xen-percpu", - .disable = disable_dynirq, - .mask = disable_dynirq, - .unmask = enable_dynirq, + .irq_disable = disable_dynirq, + .irq_mask = disable_dynirq, + .irq_unmask = enable_dynirq, - .ack = ack_dynirq, + .irq_ack = ack_dynirq, }; int xen_set_callback_via(uint64_t via) From aa673c1cb3a66d0b37595251c4e8bb688efc8726 Mon Sep 17 00:00:00 2001 From: Ian Campbell Date: Mon, 7 Feb 2011 11:08:39 +0000 Subject: [PATCH 521/706] xen: Fix compile error introduced by "switch to new irq_chip functions" drivers/xen/events.c: In function 'ack_pirq': drivers/xen/events.c:568: error: implicit declaration of function 'irq_move_irq' Signed-off-by: Ian Campbell Signed-off-by: Konrad Rzeszutek Wilk --- drivers/xen/events.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/xen/events.c b/drivers/xen/events.c index cf1712fb1c46..5aa422a3c3cd 100644 --- a/drivers/xen/events.c +++ b/drivers/xen/events.c @@ -553,7 +553,7 @@ static void ack_pirq(struct irq_data *data) { int evtchn = evtchn_from_irq(data->irq); - irq_move_irq(data); + move_native_irq(data->irq); if (VALID_EVTCHN(evtchn)) { mask_evtchn(evtchn); From f611f2da99420abc973c32cdbddbf5c365d0a20c Mon Sep 17 00:00:00 2001 From: Ian Campbell Date: Tue, 8 Feb 2011 14:03:31 +0000 Subject: [PATCH 522/706] xen/timer: Missing IRQF_NO_SUSPEND in timer code broke suspend. The patches missed an indirect use of IRQF_NO_SUSPEND pulled in via IRQF_TIMER. The following patch fixes the issue. With this fixlet PV guest migration works just fine. I also booted the entire series as a dom0 kernel and it appeared fine. Signed-off-by: Ian Campbell Signed-off-by: Konrad Rzeszutek Wilk --- arch/x86/xen/time.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/arch/x86/xen/time.c b/arch/x86/xen/time.c index 067759e3d6a5..2e2d370a47b1 100644 --- a/arch/x86/xen/time.c +++ b/arch/x86/xen/time.c @@ -397,7 +397,9 @@ void xen_setup_timer(int cpu) name = ""; irq = bind_virq_to_irqhandler(VIRQ_TIMER, cpu, xen_timer_interrupt, - IRQF_DISABLED|IRQF_PERCPU|IRQF_NOBALANCING|IRQF_TIMER, + IRQF_DISABLED|IRQF_PERCPU| + IRQF_NOBALANCING|IRQF_TIMER| + IRQF_FORCE_RESUME, name, NULL); evt = &per_cpu(xen_clock_events, cpu); From 676dc3cf5bc36a9e129a3ad8fe3bd7b2ebf20f5d Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sat, 5 Feb 2011 20:08:59 +0000 Subject: [PATCH 523/706] xen: Use IRQF_FORCE_RESUME Mark the IRQF_NO_SUSPEND interrupts IRQF_FORCE_RESUME and remove the extra walk through the interrupt descriptors. 
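In driver terms the change amounts to (flags exactly as in the bind_ipi_to_irqhandler() hunk below):

	retval = request_irq(irq, handler,
			     irqflags | IRQF_NO_SUSPEND | IRQF_FORCE_RESUME,
			     devname, dev_id);

with the genirq core, rather than the open-coded loop removed from xen_irq_resume(), now responsible for re-enabling such lines on resume.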
Signed-off-by: Thomas Gleixner Signed-off-by: Konrad Rzeszutek Wilk --- drivers/xen/events.c | 20 +------------------- 1 file changed, 1 insertion(+), 19 deletions(-) diff --git a/drivers/xen/events.c b/drivers/xen/events.c index 5aa422a3c3cd..975e90fa6d5a 100644 --- a/drivers/xen/events.c +++ b/drivers/xen/events.c @@ -977,7 +977,7 @@ int bind_ipi_to_irqhandler(enum ipi_vector ipi, if (irq < 0) return irq; - irqflags |= IRQF_NO_SUSPEND; + irqflags |= IRQF_NO_SUSPEND | IRQF_FORCE_RESUME; retval = request_irq(irq, handler, irqflags, devname, dev_id); if (retval != 0) { unbind_from_irq(irq); @@ -1433,7 +1433,6 @@ void xen_poll_irq(int irq) void xen_irq_resume(void) { unsigned int cpu, irq, evtchn; - struct irq_desc *desc; init_evtchn_cpu_bindings(); @@ -1453,23 +1452,6 @@ void xen_irq_resume(void) restore_cpu_ipis(cpu); } - /* - * Unmask any IRQF_NO_SUSPEND IRQs which are enabled. These - * are not handled by the IRQ core. - */ - for_each_irq_desc(irq, desc) { - if (!desc->action || !(desc->action->flags & IRQF_NO_SUSPEND)) - continue; - if (desc->status & IRQ_DISABLED) - continue; - - evtchn = evtchn_from_irq(irq); - if (evtchn == -1) - continue; - - unmask_evtchn(evtchn); - } - restore_cpu_pirqs(); } From 1aa0b51a033d4a1ec6d29d06487e053398afa21b Mon Sep 17 00:00:00 2001 From: Konrad Rzeszutek Wilk Date: Thu, 17 Feb 2011 11:23:58 -0500 Subject: [PATCH 524/706] xen/irq: Clean up the pirq_to_irq for DomU PV PCI passthrough guests as well. We only did this for PV guests that are xen_initial_domain() but there is no reason not to do this for other cases. The other case is only exercised when you pass in a PCI device to a PV guest _and_ the device in question. Reviewed-by: Ian Campbell Signed-off-by: Konrad Rzeszutek Wilk --- drivers/xen/events.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/xen/events.c b/drivers/xen/events.c index 975e90fa6d5a..89987a7bf26f 100644 --- a/drivers/xen/events.c +++ b/drivers/xen/events.c @@ -766,8 +766,9 @@ int xen_destroy_irq(int irq) printk(KERN_WARNING "unmap irq failed %d\n", rc); goto out; } - pirq_to_irq[info->u.pirq.pirq] = -1; } + pirq_to_irq[info->u.pirq.pirq] = -1; + irq_info[irq] = mk_unbound_info(); xen_free_irq(irq); From ff9ae1babd8ce88c3f90db6278ea5f55bdcb4624 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Fri, 25 Feb 2011 21:57:04 +0100 Subject: [PATCH 525/706] perf: Fix missing strndup declaration MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit <ctype.h> is included first without _GNU_SOURCE, so it ends up including <string.h> without declaring strndup(). And further declarations, even with _GNU_SOURCE defined, are of course without effect. Therefore: util/strfilter.c: Dans la fonction «strfilter_node__new» : util/strfilter.c:134: attention : déclaration implicite de la fonction « «strndup» » util/strfilter.c:134: attention : incompatible implicit declaration of built-in function «strndup» make: *** [util/strfilter.o] Erreur 1 Just don't include ctype.h as it doesn't appear to be necessary anyway.
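The pitfall is generic and easy to reproduce outside perf (standalone sketch; assumes a glibc of that era, where strndup() was only exposed under _GNU_SOURCE):

	/* strndup-bug.c -- gcc -Wall -c strndup-bug.c
	 * Every glibc header pulls in <features.h>, which fixes the
	 * feature-test macros on its first inclusion.  Since <ctype.h> is
	 * seen before _GNU_SOURCE is defined, the later definition has no
	 * effect and <string.h> never declares strndup(). */
	#include <ctype.h>
	#define _GNU_SOURCE
	#include <string.h>

	char *first_three(const char *s)
	{
		return strndup(s, 3);	/* implicit-declaration warning */
	}

Defining _GNU_SOURCE before the first include (as perf's util.h does) or dropping the stray header, as done here, avoids the problem.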
Signed-off-by: Frederic Weisbecker Cc: Ingo Molnar Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Tom Zanussi Cc: Arnaldo Carvalho de Melo --- tools/perf/util/strfilter.c | 1 - 1 file changed, 1 deletion(-) diff --git a/tools/perf/util/strfilter.c b/tools/perf/util/strfilter.c index 4064b7d708b8..834c8ebfe38e 100644 --- a/tools/perf/util/strfilter.c +++ b/tools/perf/util/strfilter.c @@ -1,4 +1,3 @@ -#include #include "util.h" #include "string.h" #include "strfilter.h" From cfff2d909cbdaf8c467bd321aa0502a548ec8f7e Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Fri, 25 Feb 2011 21:30:16 +0100 Subject: [PATCH 526/706] perf: Fix undefined PyVarObject_HEAD_INIT in python 2.5 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit PyVarObject_HEAD_INIT is undefined in python 2.5, resulting in a build crash: util/python.c:81: attention : déclaration implicite de la fonction « «PyVarObject_HEAD_INIT» » util/python.c:82: erreur: request for member «tp_name» in something not a structure or union util/python.c:117: erreur: request for member «tp_name» in something not a structure or union util/python.c:146: erreur: request for member «tp_name» in something not a structure or union util/python.c:177: erreur: request for member «tp_name» in something not a structure or union util/python.c:290: erreur: request for member «tp_name» in something not a structure or union util/python.c:359: erreur: request for member «tp_name» in something not a structure or union util/python.c:532: erreur: request for member «tp_name» in something not a structure or union util/python.c:761: erreur: request for member «tp_name» in something not a structure or union error: command 'gcc' failed with exit status 1 make: *** [python/perf.so] Erreur 1 We can fix that by defining PyVarObject_HEAD_INIT as a wrapper on PyObject_HEAD_INIT, thanks to a trick found on biopython: https://github.com/biopython/biopython/commit/d4eaf57946c7b4c32eca8d18821edf32f83e300d Signed-off-by: Frederic Weisbecker Cc: Ingo Molnar Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Tom Zanussi Cc: Arnaldo Carvalho de Melo --- tools/perf/util/python.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/tools/perf/util/python.c b/tools/perf/util/python.c index 5317ef229041..a9f2d7e1204d 100644 --- a/tools/perf/util/python.c +++ b/tools/perf/util/python.c @@ -8,6 +8,11 @@ #include "cpumap.h" #include "thread_map.h" +/* Define PyVarObject_HEAD_INIT for python 2.5 */ +#ifndef PyVarObject_HEAD_INIT +# define PyVarObject_HEAD_INIT(type, size) PyObject_HEAD_INIT(type) size, +#endif + struct throttle_event { struct perf_event_header header; u64 time; From f89112502805c1f6a6955f90ad158e538edb319d Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 4 Mar 2011 10:26:36 +0100 Subject: [PATCH 527/706] x86-64, NUMA: Revert NUMA affine page table allocation This patch reverts NUMA affine page table allocation added by commit 1411e0ec31 (x86-64, numa: Put pgtable to local node memory). The commit made an undocumented change where the kernel linear mapping strictly follows intersection of e820 memory map and NUMA configuration. If the physical memory configuration has holes or NUMA nodes are not properly aligned, this leads to using unnecessarily smaller mapping size which leads to increased TLB pressure. 
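As a worked illustration (addresses made up): a node boundary at 0x7fe00000 is 2MB- but not 1GB-aligned, so the gigabyte containing it must be mapped with 2MB pages instead of a single 1GB page; a boundary that is not even 2MB-aligned further degrades the tail to 4kB pages, so covering the same RAM costs hundreds of extra TLB entries.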
For details, http://thread.gmane.org/gmane.linux.kernel/1104672 Patches to fix the problem have been proposed but the underlying code needs more cleanup and the approach itself seems a bit heavy handed and it has been determined to revert the feature for now and come back to it in the next developement cycle. http://thread.gmane.org/gmane.linux.kernel/1105959 As init_memory_mapping_high() callsites have been consolidated since the commit, reverting is done manually. Also, the RED-PEN comment in arch/x86/mm/init.c is not restored as the problem no longer exists with memblock based top-down early memory allocation. Signed-off-by: Tejun Heo Cc: Yinghai Lu Cc: Ingo Molnar Cc: "H. Peter Anvin" Cc: Thomas Gleixner --- arch/x86/include/asm/page_types.h | 2 -- arch/x86/kernel/setup.c | 8 +++++ arch/x86/mm/init_64.c | 54 ------------------------------- arch/x86/mm/numa_64.c | 2 -- 4 files changed, 8 insertions(+), 58 deletions(-) diff --git a/arch/x86/include/asm/page_types.h b/arch/x86/include/asm/page_types.h index 97e6007e4edd..bce688d54c12 100644 --- a/arch/x86/include/asm/page_types.h +++ b/arch/x86/include/asm/page_types.h @@ -54,8 +54,6 @@ static inline phys_addr_t get_max_mapped(void) extern unsigned long init_memory_mapping(unsigned long start, unsigned long end); -void init_memory_mapping_high(void); - extern void initmem_init(void); extern void free_initmem(void); diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 46e684f85b36..c3a606c41ce0 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -963,6 +963,14 @@ void __init setup_arch(char **cmdline_p) max_low_pfn_mapped = init_memory_mapping(0, max_low_pfn< max_low_pfn) { + max_pfn_mapped = init_memory_mapping(1UL<<32, + max_pfn<start); - final_end = min_t(unsigned long, end_pfn<end); - - if (final_end <= final_start) - return 0; - - pfn_mapped = init_memory_mapping(final_start, final_end); - - if (pfn_mapped > data->pfn_mapped) - data->pfn_mapped = pfn_mapped; - - return 0; -} - -static unsigned long __init_refok -init_memory_mapping_active_regions(unsigned long start, unsigned long end) -{ - struct mapping_work_data data; - - data.start = start; - data.end = end; - data.pfn_mapped = 0; - - work_with_active_regions(MAX_NUMNODES, mapping_work_fn, &data); - - return data.pfn_mapped; -} - -void __init_refok init_memory_mapping_high(void) -{ - if (max_pfn > max_low_pfn) { - max_pfn_mapped = init_memory_mapping_active_regions(1UL<<32, - max_pfn< Date: Tue, 22 Feb 2011 13:04:33 -0800 Subject: [PATCH 528/706] sched: Allow SCHED_BATCH to preempt SCHED_IDLE tasks Perform the test for SCHED_IDLE before testing for SCHED_BATCH (and ensure idle tasks don't preempt idle tasks) so the non-interactive, but still important, SCHED_BATCH tasks will run in favor of the very low priority SCHED_IDLE tasks. Signed-off-by: Darren Hart Signed-off-by: Peter Zijlstra Acked-by: Mike Galbraith Cc: Richard Purdie LKML-Reference: <1298408674-3130-2-git-send-email-dvhart@linux.intel.com> Signed-off-by: Ingo Molnar --- kernel/sched_fair.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 3a88dee165c0..1438e13cf8be 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -1870,16 +1870,18 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_ if (test_tsk_need_resched(curr)) return; + /* Idle tasks are by definition preempted by non-idle tasks. 
*/ + if (unlikely(curr->policy == SCHED_IDLE) && + likely(p->policy != SCHED_IDLE)) + goto preempt; + /* - * Batch and idle tasks do not preempt (their preemption is driven by - * the tick): + * Batch and idle tasks do not preempt non-idle tasks (their preemption + * is driven by the tick): */ if (unlikely(p->policy != SCHED_NORMAL)) return; - /* Idle tasks are by definition preempted by everybody. */ - if (unlikely(curr->policy == SCHED_IDLE)) - goto preempt; if (!sched_feat(WAKEUP_PREEMPT)) return; From c02aa73b1d18e43cfd79c2f193b225e84ca497c8 Mon Sep 17 00:00:00 2001 From: Darren Hart Date: Thu, 17 Feb 2011 15:37:07 -0800 Subject: [PATCH 529/706] sched: Allow users with sufficient RLIMIT_NICE to change from SCHED_IDLE policy The current scheduler implementation returns -EPERM when trying to change from SCHED_IDLE to SCHED_OTHER or SCHED_BATCH. Since SCHED_IDLE is considered to be a nice 20 on steroids, changing to another policy should be allowed provided the RLIMIT_NICE is accounted for. This patch allows the following test-case to pass with RLIMIT_NICE=40, but still fail with RLIMIT_NICE=10 when the calling process is run from a typical shell (nice 0, or 20 in rlimit terms). int main() { int ret; struct sched_param sp; sp.sched_priority = 0; /* switch to SCHED_IDLE */ ret = sched_setscheduler(0, SCHED_IDLE, &sp); printf("setscheduler IDLE: %d\n", ret); if (ret) return ret; /* switch back to SCHED_OTHER */ ret = sched_setscheduler(0, SCHED_OTHER, &sp); printf("setscheduler OTHER: %d\n", ret); return ret; } $ ulimit -e 40 $ ./test setscheduler IDLE: 0 setscheduler OTHER: 0 $ ulimit -e 10 $ ulimit -e 10 $ ./test setscheduler IDLE: 0 setscheduler OTHER: -1 Signed-off-by: Darren Hart Signed-off-by: Peter Zijlstra Cc: Richard Purdie LKML-Reference: <4D657BEE.4040608@linux.intel.com> Signed-off-by: Ingo Molnar --- kernel/sched.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/kernel/sched.c b/kernel/sched.c index 0c8712630f05..f3030709d826 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -4981,12 +4981,15 @@ recheck: param->sched_priority > rlim_rtprio) return -EPERM; } + /* - * Like positive nice levels, dont allow tasks to - * move out of SCHED_IDLE either: + * Treat SCHED_IDLE as nice 20. Only allow a switch to + * SCHED_NORMAL if the RLIMIT_NICE would normally permit it. */ - if (p->policy == SCHED_IDLE && policy != SCHED_IDLE) - return -EPERM; + if (p->policy == SCHED_IDLE && policy != SCHED_IDLE) { + if (!can_nice(p, TASK_NICE(p))) + return -EPERM; + } /* can't change other user's priorities */ if (!check_same_owner(p)) From 6d1cafd8b56ea726c10a5a104de57cc3ed8fa953 Mon Sep 17 00:00:00 2001 From: Venkatesh Pallipadi Date: Tue, 1 Mar 2011 16:28:21 -0800 Subject: [PATCH 530/706] sched: Resched proper CPU on yield_to() yield_to_task_fair() has code to resched the CPU of yielding task when the intention is to resched the CPU of the task that is being yielded to. Change here fixes the problem and also makes the resched conditional on rq != p_rq. 
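(In the rq == p_rq case no explicit kick is needed: the yield itself already sends that CPU back through pick_next_task(), hence the new conditional.)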
Signed-off-by: Venkatesh Pallipadi Reviewed-by: Rik van Riel Signed-off-by: Peter Zijlstra LKML-Reference: <1299025701-22168-1-git-send-email-venki@google.com> Signed-off-by: Ingo Molnar --- kernel/sched.c | 9 ++++++++- kernel/sched_fair.c | 4 ---- 2 files changed, 8 insertions(+), 5 deletions(-) diff --git a/kernel/sched.c b/kernel/sched.c index f3030709d826..61452e86c73b 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -5522,8 +5522,15 @@ again: goto out; yielded = curr->sched_class->yield_to_task(rq, p, preempt); - if (yielded) + if (yielded) { schedstat_inc(rq, yld_count); + /* + * Make p's CPU reschedule; pick_next_entity takes care of + * fairness. + */ + if (preempt && rq != p_rq) + resched_task(p_rq->curr); + } out: double_rq_unlock(rq, p_rq); diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 1438e13cf8be..3f7ec9e27ee1 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -1987,10 +1987,6 @@ static bool yield_to_task_fair(struct rq *rq, struct task_struct *p, bool preemp /* Tell the scheduler that we'd really like pse to run next. */ set_next_buddy(se); - /* Make p's CPU reschedule; pick_next_entity takes care of fairness. */ - if (preempt) - resched_task(rq->curr); - yield_task_fair(rq); return true; From 940c5b2971de443df22eed0441bc74fb0116e9f5 Mon Sep 17 00:00:00 2001 From: Lin Ming Date: Sun, 27 Feb 2011 21:13:31 +0800 Subject: [PATCH 531/706] perf: Fix the missing event initialization when pmu is found in idr Currently, the event is not initialized if pmu is found in idr. This never causes bug just because now no pmu is associated with the idr id. Signed-off-by: Lin Ming Signed-off-by: Peter Zijlstra LKML-Reference: <1298812411.2699.9.camel@localhost> Signed-off-by: Ingo Molnar --- kernel/perf_event.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/kernel/perf_event.c b/kernel/perf_event.c index 64a018e94fca..821ce8221974 100644 --- a/kernel/perf_event.c +++ b/kernel/perf_event.c @@ -6098,17 +6098,22 @@ struct pmu *perf_init_event(struct perf_event *event) { struct pmu *pmu = NULL; int idx; + int ret; idx = srcu_read_lock(&pmus_srcu); rcu_read_lock(); pmu = idr_find(&pmu_idr, event->attr.type); rcu_read_unlock(); - if (pmu) + if (pmu) { + ret = pmu->event_init(event); + if (ret) + pmu = ERR_PTR(ret); goto unlock; + } list_for_each_entry_rcu(pmu, &pmus, entry) { - int ret = pmu->event_init(event); + ret = pmu->event_init(event); if (!ret) goto unlock; From 3db272c0494900fcb905a201180a78cae3addd6e Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Thu, 3 Mar 2011 14:25:37 +0800 Subject: [PATCH 532/706] perf cgroup: Fix leak of file reference count In perf_cgroup_connect(), fput_light() is missing in a failure path. 
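The rule being restored is the usual fget_light()/fput_light() pairing (a generic sketch of the idiom, not the perf code itself):

	int fput_needed, ret = 0;
	struct file *file = fget_light(fd, &fput_needed);

	if (!file)
		return -EBADF;
	/* every later failure must 'goto out', never a bare return */
out:
	fput_light(file, fput_needed);	/* balances the fget_light() */
	return ret;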
Signed-off-by: Li Zefan Acked-by: Stephane Eranian Signed-off-by: Peter Zijlstra LKML-Reference: <4D6F3461.6060406@cn.fujitsu.com> Signed-off-by: Ingo Molnar --- kernel/perf_event.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/kernel/perf_event.c b/kernel/perf_event.c index 821ce8221974..7c999e809ba3 100644 --- a/kernel/perf_event.c +++ b/kernel/perf_event.c @@ -404,8 +404,10 @@ static inline int perf_cgroup_connect(int fd, struct perf_event *event, return -EBADF; css = cgroup_css_from_dir(file, perf_subsys_id); - if (IS_ERR(css)) - return PTR_ERR(css); + if (IS_ERR(css)) { + ret = PTR_ERR(css); + goto out; + } cgrp = container_of(css, struct perf_cgroup, css); event->cgrp = cgrp; @@ -422,6 +424,7 @@ static inline int perf_cgroup_connect(int fd, struct perf_event *event, /* must be done before we fput() the file */ perf_get_cgroup(event); } +out: fput_light(file, fput_needed); return ret; } From f75e18cb9627b1d3d752b83a0b5563da0042c50a Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Thu, 3 Mar 2011 14:25:50 +0800 Subject: [PATCH 533/706] perf cgroup: Fix unmatched call to perf_detach_cgroup() In the failure path, we call perf_detach_cgroup(), but we didn't call perf_get_cgroup() prior to it. Signed-off-by: Li Zefan Acked-by: Stephane Eranian Signed-off-by: Peter Zijlstra LKML-Reference: <4D6F346E.9070606@cn.fujitsu.com> Signed-off-by: Ingo Molnar --- kernel/perf_event.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/kernel/perf_event.c b/kernel/perf_event.c index 7c999e809ba3..b00209544d57 100644 --- a/kernel/perf_event.c +++ b/kernel/perf_event.c @@ -412,6 +412,9 @@ static inline int perf_cgroup_connect(int fd, struct perf_event *event, cgrp = container_of(css, struct perf_cgroup, css); event->cgrp = cgrp; + /* must be done before we fput() the file */ + perf_get_cgroup(event); + /* * all events in a group must monitor * the same cgroup because a task belongs @@ -420,9 +423,6 @@ static inline int perf_cgroup_connect(int fd, struct perf_event *event, if (group_leader && group_leader->cgrp != cgrp) { perf_detach_cgroup(event); ret = -EINVAL; - } else { - /* must be done before we fput() the file */ - perf_get_cgroup(event); } out: fput_light(file, fput_needed); From 1b15d0558e82df9b3659804ceb44187b98eda354 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Thu, 3 Mar 2011 14:26:06 +0800 Subject: [PATCH 534/706] perf cgroup: Clean up perf_cgroup_create() - Use kzalloc() to replace kmalloc() + memset(). - Remove redundant initialization, since alloc_percpu() returns zero-filled percpu memory.
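Both cleanups rest on well-defined zeroing semantics (two-line equivalence sketch):

	jc = kmalloc(sizeof(*jc), GFP_KERNEL);	/* + memset(jc, 0, sizeof(*jc)) */
	jc = kzalloc(sizeof(*jc), GFP_KERNEL);	/* same result in one call */

and alloc_percpu() likewise returns zero-filled per-cpu memory, which is what makes the explicit t->time = 0 / t->timestamp = 0 loop redundant.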
Signed-off-by: Li Zefan Acked-by: Stephane Eranian Signed-off-by: Peter Zijlstra LKML-Reference: <4D6F347E.2010806@cn.fujitsu.com> Signed-off-by: Ingo Molnar --- kernel/perf_event.c | 11 +---------- 1 file changed, 1 insertion(+), 10 deletions(-) diff --git a/kernel/perf_event.c b/kernel/perf_event.c index b00209544d57..193b1900e64f 100644 --- a/kernel/perf_event.c +++ b/kernel/perf_event.c @@ -7346,26 +7346,17 @@ static struct cgroup_subsys_state *perf_cgroup_create( struct cgroup_subsys *ss, struct cgroup *cont) { struct perf_cgroup *jc; - struct perf_cgroup_info *t; - int c; - jc = kmalloc(sizeof(*jc), GFP_KERNEL); + jc = kzalloc(sizeof(*jc), GFP_KERNEL); if (!jc) return ERR_PTR(-ENOMEM); - memset(jc, 0, sizeof(*jc)); - jc->info = alloc_percpu(struct perf_cgroup_info); if (!jc->info) { kfree(jc); return ERR_PTR(-ENOMEM); } - for_each_possible_cpu(c) { - t = per_cpu_ptr(jc->info, c); - t->time = 0; - t->timestamp = 0; - } return &jc->css; } From 2d0f25201ee210a0666ec9c41538ba05a07f8bc6 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Thu, 3 Mar 2011 14:26:20 +0800 Subject: [PATCH 535/706] perf cgroup: Fix a typo in kernel config s/specificied/specified Signed-off-by: Li Zefan Acked-by: Stephane Eranian Signed-off-by: Peter Zijlstra LKML-Reference: <4D6F348C.2050804@cn.fujitsu.com> Signed-off-by: Ingo Molnar --- init/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/init/Kconfig b/init/Kconfig index 20d6bd919b8d..4c4edf2ec4a9 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -688,7 +688,7 @@ config CGROUP_PERF depends on PERF_EVENTS && CGROUPS help This option extends the per-cpu mode to restrict monitoring to - threads which belong to the cgroup specificied and run on the + threads which belong to the cgroup specified and run on the designated cpu. Say N if unsure. 
From 08309379b7083a9ceec0f9bb96a629058fb623c4 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 3 Mar 2011 11:31:20 +0100 Subject: [PATCH 536/706] perf: Fix cgroup vs jump_label problem Li Zefan reported that the jump label code sleeps and we're calling it under a spinlock, *fail* ;-) Reported-by: Li Zefan Signed-off-by: Peter Zijlstra LKML-Reference: Signed-off-by: Ingo Molnar --- kernel/perf_event.c | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/kernel/perf_event.c b/kernel/perf_event.c index 193b1900e64f..ed253aa24ba4 100644 --- a/kernel/perf_event.c +++ b/kernel/perf_event.c @@ -820,16 +820,8 @@ list_add_event(struct perf_event *event, struct perf_event_context *ctx) list_add_tail(&event->group_entry, list); } - if (is_cgroup_event(event)) { + if (is_cgroup_event(event)) ctx->nr_cgroups++; - /* - * one more event: - * - that has cgroup constraint on event->cpu - * - that may need work on context switch - */ - atomic_inc(&per_cpu(perf_cgroup_events, event->cpu)); - jump_label_inc(&perf_sched_events); - } list_add_rcu(&event->event_entry, &ctx->event_list); if (!ctx->nr_events) @@ -957,11 +949,8 @@ list_del_event(struct perf_event *event, struct perf_event_context *ctx) event->attach_state &= ~PERF_ATTACH_CONTEXT; - if (is_cgroup_event(event)) { + if (is_cgroup_event(event)) ctx->nr_cgroups--; - atomic_dec(&per_cpu(perf_cgroup_events, event->cpu)); - jump_label_dec(&perf_sched_events); - } ctx->nr_events--; if (event->attr.inherit_stat) @@ -2903,6 +2892,10 @@ static void free_event(struct perf_event *event) atomic_dec(&nr_task_events); if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN) put_callchain_buffers(); + if (is_cgroup_event(event)) { + atomic_dec(&per_cpu(perf_cgroup_events, event->cpu)); + jump_label_dec(&perf_sched_events); + } } if (event->buffer) { @@ -6478,6 +6471,13 @@ SYSCALL_DEFINE5(perf_event_open, err = perf_cgroup_connect(pid, event, &attr, group_leader); if (err) goto err_alloc; + /* + * one more event: + * - that has cgroup constraint on event->cpu + * - that may need work on context switch + */ + atomic_inc(&per_cpu(perf_cgroup_events, event->cpu)); + jump_label_inc(&perf_sched_events); } /* From 17e3162972cbb9796035fff1e2fd30669b0eef65 Mon Sep 17 00:00:00 2001 From: Stephane Eranian Date: Wed, 2 Mar 2011 17:05:01 +0200 Subject: [PATCH 537/706] perf_events: Update PEBS event constraints This patch updates PEBS event constraints for Intel Atom, Nehalem, Westmere. This patch also reorganizes the PEBS format/constraint detection code. It is now based on processor model and not PEBS format. Two processors may use the same PEBS format without having the same list of PEBS events. In this second version, we simplified the initialization of the PEBS constraints by leveraging the existing switch() statement in perf_event_intel.c. We also renamed the constraint tables to be more consistent with regular constraints. In this 3rd version, we drop BR_INST_RETIRED.MISPRED from Intel Atom as it does not seem to work. Use MISPREDICTED_BRANCH_RETIRED instead. Also add FP_ASSIST.* to both Intel Nehalem and Westmere. I missed those in the earlier patches. Events were tested using libpfm4 perf_examples.
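Reading one entry of each kind as an example: INTEL_EVENT_CONSTRAINT(0xc5, 0xf) matches only the event-select byte, allowing every BR_MISP_RETIRED umask on counters 0-3 (counter bitmask 0xf), while PEBS_EVENT_CONSTRAINT(0x02c5, 0xf) also matches the umask byte (0x02), naming exactly BR_MISP_RETIRED.NEAR_CALL. The Core2 and Atom tables use a counter mask of 0x1 because PEBS is available only on counter 0 on those cores.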
Signed-off-by: Stephane Eranian Signed-off-by: Peter Zijlstra LKML-Reference: <4d6e6b02.815bdf0a.637b.07a7@mx.google.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event_intel.c | 4 ++ arch/x86/kernel/cpu/perf_event_intel_ds.c | 59 +++++++++++++++-------- 2 files changed, 42 insertions(+), 21 deletions(-) diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c index ba8aad157b87..c3ce053ecb46 100644 --- a/arch/x86/kernel/cpu/perf_event_intel.c +++ b/arch/x86/kernel/cpu/perf_event_intel.c @@ -1137,6 +1137,7 @@ static __init int intel_pmu_init(void) intel_pmu_lbr_init_core(); x86_pmu.event_constraints = intel_core2_event_constraints; + x86_pmu.pebs_constraints = intel_core2_pebs_event_constraints; pr_cont("Core2 events, "); break; @@ -1149,6 +1150,7 @@ static __init int intel_pmu_init(void) intel_pmu_lbr_init_nhm(); x86_pmu.event_constraints = intel_nehalem_event_constraints; + x86_pmu.pebs_constraints = intel_nehalem_pebs_event_constraints; x86_pmu.enable_all = intel_pmu_nhm_enable_all; pr_cont("Nehalem events, "); break; @@ -1160,6 +1162,7 @@ static __init int intel_pmu_init(void) intel_pmu_lbr_init_atom(); x86_pmu.event_constraints = intel_gen_event_constraints; + x86_pmu.pebs_constraints = intel_atom_pebs_event_constraints; pr_cont("Atom events, "); break; @@ -1172,6 +1175,7 @@ static __init int intel_pmu_init(void) x86_pmu.event_constraints = intel_westmere_event_constraints; x86_pmu.enable_all = intel_pmu_nhm_enable_all; + x86_pmu.pebs_constraints = intel_westmere_pebs_event_constraints; pr_cont("Westmere events, "); break; diff --git a/arch/x86/kernel/cpu/perf_event_intel_ds.c b/arch/x86/kernel/cpu/perf_event_intel_ds.c index 825199834885..b95c66ae4a2a 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_ds.c +++ b/arch/x86/kernel/cpu/perf_event_intel_ds.c @@ -361,30 +361,50 @@ static int intel_pmu_drain_bts_buffer(void) /* * PEBS */ - -static struct event_constraint intel_core_pebs_events[] = { - PEBS_EVENT_CONSTRAINT(0x00c0, 0x1), /* INSTR_RETIRED.ANY */ +static struct event_constraint intel_core2_pebs_event_constraints[] = { + PEBS_EVENT_CONSTRAINT(0x00c0, 0x1), /* INST_RETIRED.ANY */ PEBS_EVENT_CONSTRAINT(0xfec1, 0x1), /* X87_OPS_RETIRED.ANY */ PEBS_EVENT_CONSTRAINT(0x00c5, 0x1), /* BR_INST_RETIRED.MISPRED */ PEBS_EVENT_CONSTRAINT(0x1fc7, 0x1), /* SIMD_INST_RETURED.ANY */ - PEBS_EVENT_CONSTRAINT(0x01cb, 0x1), /* MEM_LOAD_RETIRED.L1D_MISS */ - PEBS_EVENT_CONSTRAINT(0x02cb, 0x1), /* MEM_LOAD_RETIRED.L1D_LINE_MISS */ - PEBS_EVENT_CONSTRAINT(0x04cb, 0x1), /* MEM_LOAD_RETIRED.L2_MISS */ - PEBS_EVENT_CONSTRAINT(0x08cb, 0x1), /* MEM_LOAD_RETIRED.L2_LINE_MISS */ - PEBS_EVENT_CONSTRAINT(0x10cb, 0x1), /* MEM_LOAD_RETIRED.DTLB_MISS */ + INTEL_EVENT_CONSTRAINT(0xcb, 0x1), /* MEM_LOAD_RETIRED.* */ EVENT_CONSTRAINT_END }; -static struct event_constraint intel_nehalem_pebs_events[] = { - PEBS_EVENT_CONSTRAINT(0x00c0, 0xf), /* INSTR_RETIRED.ANY */ - PEBS_EVENT_CONSTRAINT(0xfec1, 0xf), /* X87_OPS_RETIRED.ANY */ - PEBS_EVENT_CONSTRAINT(0x00c5, 0xf), /* BR_INST_RETIRED.MISPRED */ - PEBS_EVENT_CONSTRAINT(0x1fc7, 0xf), /* SIMD_INST_RETURED.ANY */ - PEBS_EVENT_CONSTRAINT(0x01cb, 0xf), /* MEM_LOAD_RETIRED.L1D_MISS */ - PEBS_EVENT_CONSTRAINT(0x02cb, 0xf), /* MEM_LOAD_RETIRED.L1D_LINE_MISS */ - PEBS_EVENT_CONSTRAINT(0x04cb, 0xf), /* MEM_LOAD_RETIRED.L2_MISS */ - PEBS_EVENT_CONSTRAINT(0x08cb, 0xf), /* MEM_LOAD_RETIRED.L2_LINE_MISS */ - PEBS_EVENT_CONSTRAINT(0x10cb, 0xf), /* MEM_LOAD_RETIRED.DTLB_MISS */ +static struct event_constraint 
intel_atom_pebs_event_constraints[] = { + PEBS_EVENT_CONSTRAINT(0x00c0, 0x1), /* INST_RETIRED.ANY */ + PEBS_EVENT_CONSTRAINT(0x00c5, 0x1), /* MISPREDICTED_BRANCH_RETIRED */ + INTEL_EVENT_CONSTRAINT(0xcb, 0x1), /* MEM_LOAD_RETIRED.* */ + EVENT_CONSTRAINT_END +}; + +static struct event_constraint intel_nehalem_pebs_event_constraints[] = { + INTEL_EVENT_CONSTRAINT(0x0b, 0xf), /* MEM_INST_RETIRED.* */ + INTEL_EVENT_CONSTRAINT(0x0f, 0xf), /* MEM_UNCORE_RETIRED.* */ + PEBS_EVENT_CONSTRAINT(0x010c, 0xf), /* MEM_STORE_RETIRED.DTLB_MISS */ + INTEL_EVENT_CONSTRAINT(0xc0, 0xf), /* INST_RETIRED.ANY */ + INTEL_EVENT_CONSTRAINT(0xc2, 0xf), /* UOPS_RETIRED.* */ + INTEL_EVENT_CONSTRAINT(0xc4, 0xf), /* BR_INST_RETIRED.* */ + PEBS_EVENT_CONSTRAINT(0x02c5, 0xf), /* BR_MISP_RETIRED.NEAR_CALL */ + INTEL_EVENT_CONSTRAINT(0xc7, 0xf), /* SSEX_UOPS_RETIRED.* */ + PEBS_EVENT_CONSTRAINT(0x20c8, 0xf), /* ITLB_MISS_RETIRED */ + INTEL_EVENT_CONSTRAINT(0xcb, 0xf), /* MEM_LOAD_RETIRED.* */ + INTEL_EVENT_CONSTRAINT(0xf7, 0xf), /* FP_ASSIST.* */ + EVENT_CONSTRAINT_END +}; + +static struct event_constraint intel_westmere_pebs_event_constraints[] = { + INTEL_EVENT_CONSTRAINT(0x0b, 0xf), /* MEM_INST_RETIRED.* */ + INTEL_EVENT_CONSTRAINT(0x0f, 0xf), /* MEM_UNCORE_RETIRED.* */ + PEBS_EVENT_CONSTRAINT(0x010c, 0xf), /* MEM_STORE_RETIRED.DTLB_MISS */ + INTEL_EVENT_CONSTRAINT(0xc0, 0xf), /* INSTR_RETIRED.* */ + INTEL_EVENT_CONSTRAINT(0xc2, 0xf), /* UOPS_RETIRED.* */ + + INTEL_EVENT_CONSTRAINT(0xc4, 0xf), /* BR_INST_RETIRED.* */ + INTEL_EVENT_CONSTRAINT(0xc5, 0xf), /* BR_MISP_RETIRED.* */ + INTEL_EVENT_CONSTRAINT(0xc7, 0xf), /* SSEX_UOPS_RETIRED.* */ + PEBS_EVENT_CONSTRAINT(0x20c8, 0xf), /* ITLB_MISS_RETIRED */ + INTEL_EVENT_CONSTRAINT(0xcb, 0xf), /* MEM_LOAD_RETIRED.* */ + INTEL_EVENT_CONSTRAINT(0xf7, 0xf), /* FP_ASSIST.* */ EVENT_CONSTRAINT_END }; @@ -733,20 +753,17 @@ static void intel_ds_init(void) printk(KERN_CONT "PEBS fmt0%c, ", pebs_type); x86_pmu.pebs_record_size = sizeof(struct pebs_record_core); x86_pmu.drain_pebs = intel_pmu_drain_pebs_core; - x86_pmu.pebs_constraints = intel_core_pebs_events; break; case 1: printk(KERN_CONT "PEBS fmt1%c, ", pebs_type); x86_pmu.pebs_record_size = sizeof(struct pebs_record_nhm); x86_pmu.drain_pebs = intel_pmu_drain_pebs_nhm; - x86_pmu.pebs_constraints = intel_nehalem_pebs_events; break; default: printk(KERN_CONT "no PEBS fmt%d%c, ", format, pebs_type); x86_pmu.pebs = 0; - break; } } } From a7e3ed1e470116c9d12c2f778431a481a6be8ab6 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Thu, 3 Mar 2011 10:34:47 +0800 Subject: [PATCH 538/706] perf: Add support for supplementary event registers Change logs against Andi's original version: - Extends perf_event_attr:config to config{,1,2} (Peter Zijlstra) - Fixed a major event scheduling issue. There cannot be a ref++ on an event that has already done ref++ once and without calling put_constraint() in between. (Stephane Eranian) - Use thread_cpumask for percore allocation. (Lin Ming) - Use MSR names in the extra reg lists. (Lin Ming) - Remove redundant "c = NULL" in intel_percore_constraints - Fix comment of perf_event_attr::config1 Intel Nehalem/Westmere have a special OFFCORE_RESPONSE event that can be used to monitor any offcore accesses from a core. This is a very useful event for various tunings, and it's also needed to implement the generic LLC-* events correctly. Unfortunately this event requires programming a mask in a separate register. And worse this separate register is per core, not per CPU thread. 
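From user space the extra mask travels in perf_event_attr::config1 (illustrative sketch; 0x01b7 is the OFFCORE_RESPONSE_0 event/umask encoding on these parts, while the config1 value is a made-up example mask):

	#include <linux/perf_event.h>
	#include <sys/syscall.h>
	#include <unistd.h>

	static int open_offcore(int cpu)
	{
		struct perf_event_attr attr = {
			.type    = PERF_TYPE_RAW,
			.size    = sizeof(attr),
			.config  = 0x01b7,	/* OFFCORE_RESPONSE_0 */
			.config1 = 0xff07,	/* hypothetical MSR_OFFCORE_RSP_0 mask */
		};

		/* one event per cpu; the kernel arbitrates the shared MSR
		 * between hyperthread siblings via the per-core state below */
		return syscall(__NR_perf_event_open, &attr, -1, cpu, -1, 0);
	}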
This patch: - Teaches perf_events that OFFCORE_RESPONSE needs extra parameters. The extra parameters are passed by user space in the perf_event_attr::config1 field. - Adds support to the Intel perf_event core to schedule per core resources. This adds fairly generic infrastructure that can also be used for other per core resources. The basic code is patterned after the similar AMD northbridge constraints code. Thanks to Stephane Eranian who pointed out some problems in the original version and suggested improvements. Signed-off-by: Andi Kleen Signed-off-by: Lin Ming Signed-off-by: Peter Zijlstra LKML-Reference: <1299119690-13991-2-git-send-email-ming.m.lin@intel.com> Signed-off-by: Ingo Molnar --- arch/x86/include/asm/msr-index.h | 3 + arch/x86/kernel/cpu/perf_event.c | 64 ++++++++ arch/x86/kernel/cpu/perf_event_intel.c | 198 +++++++++++++++++++++++++ include/linux/perf_event.h | 13 +- 4 files changed, 276 insertions(+), 2 deletions(-) diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h index 4d0dfa0d998e..d25e74cc1a50 100644 --- a/arch/x86/include/asm/msr-index.h +++ b/arch/x86/include/asm/msr-index.h @@ -47,6 +47,9 @@ #define MSR_IA32_MCG_STATUS 0x0000017a #define MSR_IA32_MCG_CTL 0x0000017b +#define MSR_OFFCORE_RSP_0 0x000001a6 +#define MSR_OFFCORE_RSP_1 0x000001a7 + #define MSR_IA32_PEBS_ENABLE 0x000003f1 #define MSR_IA32_DS_AREA 0x00000600 #define MSR_IA32_PERF_CAPABILITIES 0x00000345 diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index ea03c725e465..ec6a6db07332 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -93,6 +93,8 @@ struct amd_nb { struct event_constraint event_constraints[X86_PMC_IDX_MAX]; }; +struct intel_percore; + #define MAX_LBR_ENTRIES 16 struct cpu_hw_events { @@ -127,6 +129,13 @@ struct cpu_hw_events { struct perf_branch_stack lbr_stack; struct perf_branch_entry lbr_entries[MAX_LBR_ENTRIES]; + /* + * Intel percore register state. + * Coordinate shared resources between HT threads. + */ + int percore_used; /* Used by this CPU? */ + struct intel_percore *per_core; + /* * AMD specific bits */ @@ -177,6 +186,28 @@ struct cpu_hw_events { #define for_each_event_constraint(e, c) \ for ((e) = (c); (e)->weight; (e)++) +/* + * Extra registers for specific events. + * Some events need large masks and require external MSRs. + * Define a mapping to these extra registers. 
+ */ +struct extra_reg { + unsigned int event; + unsigned int msr; + u64 config_mask; + u64 valid_mask; +}; + +#define EVENT_EXTRA_REG(e, ms, m, vm) { \ + .event = (e), \ + .msr = (ms), \ + .config_mask = (m), \ + .valid_mask = (vm), \ + } +#define INTEL_EVENT_EXTRA_REG(event, msr, vm) \ + EVENT_EXTRA_REG(event, msr, ARCH_PERFMON_EVENTSEL_EVENT, vm) +#define EVENT_EXTRA_END EVENT_EXTRA_REG(0, 0, 0, 0) + union perf_capabilities { struct { u64 lbr_format : 6; @@ -221,6 +252,7 @@ struct x86_pmu { void (*put_event_constraints)(struct cpu_hw_events *cpuc, struct perf_event *event); struct event_constraint *event_constraints; + struct event_constraint *percore_constraints; void (*quirks)(void); int perfctr_second_write; @@ -249,6 +281,11 @@ struct x86_pmu { */ unsigned long lbr_tos, lbr_from, lbr_to; /* MSR base regs */ int lbr_nr; /* hardware stack size */ + + /* + * Extra registers for events + */ + struct extra_reg *extra_regs; }; static struct x86_pmu x86_pmu __read_mostly; @@ -341,6 +378,31 @@ static inline unsigned int x86_pmu_event_addr(int index) return x86_pmu.perfctr + x86_pmu_addr_offset(index); } +/* + * Find and validate any extra registers to set up. + */ +static int x86_pmu_extra_regs(u64 config, struct perf_event *event) +{ + struct extra_reg *er; + + event->hw.extra_reg = 0; + event->hw.extra_config = 0; + + if (!x86_pmu.extra_regs) + return 0; + + for (er = x86_pmu.extra_regs; er->msr; er++) { + if (er->event != (config & er->config_mask)) + continue; + if (event->attr.config1 & ~er->valid_mask) + return -EINVAL; + event->hw.extra_reg = er->msr; + event->hw.extra_config = event->attr.config1; + break; + } + return 0; +} + static atomic_t active_events; static DEFINE_MUTEX(pmc_reserve_mutex); @@ -665,6 +727,8 @@ static void x86_pmu_disable(struct pmu *pmu) static inline void __x86_pmu_enable_event(struct hw_perf_event *hwc, u64 enable_mask) { + if (hwc->extra_reg) + wrmsrl(hwc->extra_reg, hwc->extra_config); wrmsrl(hwc->config_base, hwc->config | enable_mask); } diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c index c3ce053ecb46..13cb6cf013f6 100644 --- a/arch/x86/kernel/cpu/perf_event_intel.c +++ b/arch/x86/kernel/cpu/perf_event_intel.c @@ -1,5 +1,27 @@ #ifdef CONFIG_CPU_SUP_INTEL +#define MAX_EXTRA_REGS 2 + +/* + * Per register state. + */ +struct er_account { + int ref; /* reference count */ + unsigned int extra_reg; /* extra MSR number */ + u64 extra_config; /* extra MSR config */ +}; + +/* + * Per core state + * This used to coordinate shared registers for HT threads. + */ +struct intel_percore { + raw_spinlock_t lock; /* protect structure */ + struct er_account regs[MAX_EXTRA_REGS]; + int refcnt; /* number of threads */ + unsigned core_id; +}; + /* * Intel PerfMon, used on Core and later. 
*/ @@ -64,6 +86,18 @@ static struct event_constraint intel_nehalem_event_constraints[] = EVENT_CONSTRAINT_END }; +static struct extra_reg intel_nehalem_extra_regs[] = +{ + INTEL_EVENT_EXTRA_REG(0xb7, MSR_OFFCORE_RSP_0, 0xffff), + EVENT_EXTRA_END +}; + +static struct event_constraint intel_nehalem_percore_constraints[] = +{ + INTEL_EVENT_CONSTRAINT(0xb7, 0), + EVENT_CONSTRAINT_END +}; + static struct event_constraint intel_westmere_event_constraints[] = { FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */ @@ -89,6 +123,20 @@ static struct event_constraint intel_snb_event_constraints[] = EVENT_CONSTRAINT_END }; +static struct extra_reg intel_westmere_extra_regs[] = +{ + INTEL_EVENT_EXTRA_REG(0xb7, MSR_OFFCORE_RSP_0, 0xffff), + INTEL_EVENT_EXTRA_REG(0xbb, MSR_OFFCORE_RSP_1, 0xffff), + EVENT_EXTRA_END +}; + +static struct event_constraint intel_westmere_percore_constraints[] = +{ + INTEL_EVENT_CONSTRAINT(0xb7, 0), + INTEL_EVENT_CONSTRAINT(0xbb, 0), + EVENT_CONSTRAINT_END +}; + static struct event_constraint intel_gen_event_constraints[] = { FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */ @@ -906,6 +954,67 @@ intel_bts_constraints(struct perf_event *event) return NULL; } +static struct event_constraint * +intel_percore_constraints(struct cpu_hw_events *cpuc, struct perf_event *event) +{ + struct hw_perf_event *hwc = &event->hw; + unsigned int e = hwc->config & ARCH_PERFMON_EVENTSEL_EVENT; + struct event_constraint *c; + struct intel_percore *pc; + struct er_account *era; + int i; + int free_slot; + int found; + + if (!x86_pmu.percore_constraints || hwc->extra_alloc) + return NULL; + + for (c = x86_pmu.percore_constraints; c->cmask; c++) { + if (e != c->code) + continue; + + /* + * Allocate resource per core. + */ + pc = cpuc->per_core; + if (!pc) + break; + c = &emptyconstraint; + raw_spin_lock(&pc->lock); + free_slot = -1; + found = 0; + for (i = 0; i < MAX_EXTRA_REGS; i++) { + era = &pc->regs[i]; + if (era->ref > 0 && hwc->extra_reg == era->extra_reg) { + /* Allow sharing same config */ + if (hwc->extra_config == era->extra_config) { + era->ref++; + cpuc->percore_used = 1; + hwc->extra_alloc = 1; + c = NULL; + } + /* else conflict */ + found = 1; + break; + } else if (era->ref == 0 && free_slot == -1) + free_slot = i; + } + if (!found && free_slot != -1) { + era = &pc->regs[free_slot]; + era->ref = 1; + era->extra_reg = hwc->extra_reg; + era->extra_config = hwc->extra_config; + cpuc->percore_used = 1; + hwc->extra_alloc = 1; + c = NULL; + } + raw_spin_unlock(&pc->lock); + return c; + } + + return NULL; +} + static struct event_constraint * intel_get_event_constraints(struct cpu_hw_events *cpuc, struct perf_event *event) { @@ -919,9 +1028,51 @@ intel_get_event_constraints(struct cpu_hw_events *cpuc, struct perf_event *event if (c) return c; + c = intel_percore_constraints(cpuc, event); + if (c) + return c; + return x86_get_event_constraints(cpuc, event); } +static void intel_put_event_constraints(struct cpu_hw_events *cpuc, + struct perf_event *event) +{ + struct extra_reg *er; + struct intel_percore *pc; + struct er_account *era; + struct hw_perf_event *hwc = &event->hw; + int i, allref; + + if (!cpuc->percore_used) + return; + + for (er = x86_pmu.extra_regs; er->msr; er++) { + if (er->event != (hwc->config & er->config_mask)) + continue; + + pc = cpuc->per_core; + raw_spin_lock(&pc->lock); + for (i = 0; i < MAX_EXTRA_REGS; i++) { + era = &pc->regs[i]; + if (era->ref > 0 && + era->extra_config == hwc->extra_config && + era->extra_reg == er->msr) { + era->ref--; + 
hwc->extra_alloc = 0; + break; + } + } + allref = 0; + for (i = 0; i < MAX_EXTRA_REGS; i++) + allref += pc->regs[i].ref; + if (allref == 0) + cpuc->percore_used = 0; + raw_spin_unlock(&pc->lock); + break; + } +} + static int intel_pmu_hw_config(struct perf_event *event) { int ret = x86_pmu_hw_config(event); @@ -993,11 +1144,43 @@ static __initconst const struct x86_pmu core_pmu = { */ .max_period = (1ULL << 31) - 1, .get_event_constraints = intel_get_event_constraints, + .put_event_constraints = intel_put_event_constraints, .event_constraints = intel_core_event_constraints, }; +static int intel_pmu_cpu_prepare(int cpu) +{ + struct cpu_hw_events *cpuc = &per_cpu(cpu_hw_events, cpu); + + cpuc->per_core = kzalloc_node(sizeof(struct intel_percore), + GFP_KERNEL, cpu_to_node(cpu)); + if (!cpuc->per_core) + return NOTIFY_BAD; + + raw_spin_lock_init(&cpuc->per_core->lock); + cpuc->per_core->core_id = -1; + return NOTIFY_OK; +} + static void intel_pmu_cpu_starting(int cpu) { + struct cpu_hw_events *cpuc = &per_cpu(cpu_hw_events, cpu); + int core_id = topology_core_id(cpu); + int i; + + for_each_cpu(i, topology_thread_cpumask(cpu)) { + struct intel_percore *pc = per_cpu(cpu_hw_events, i).per_core; + + if (pc && pc->core_id == core_id) { + kfree(cpuc->per_core); + cpuc->per_core = pc; + break; + } + } + + cpuc->per_core->core_id = core_id; + cpuc->per_core->refcnt++; + init_debug_store_on_cpu(cpu); /* * Deal with CPUs that don't clear their LBRs on power-up. @@ -1007,6 +1190,15 @@ static void intel_pmu_cpu_starting(int cpu) static void intel_pmu_cpu_dying(int cpu) { + struct cpu_hw_events *cpuc = &per_cpu(cpu_hw_events, cpu); + struct intel_percore *pc = cpuc->per_core; + + if (pc) { + if (pc->core_id == -1 || --pc->refcnt == 0) + kfree(pc); + cpuc->per_core = NULL; + } + fini_debug_store_on_cpu(cpu); } @@ -1031,7 +1223,9 @@ static __initconst const struct x86_pmu intel_pmu = { */ .max_period = (1ULL << 31) - 1, .get_event_constraints = intel_get_event_constraints, + .put_event_constraints = intel_put_event_constraints, + .cpu_prepare = intel_pmu_cpu_prepare, .cpu_starting = intel_pmu_cpu_starting, .cpu_dying = intel_pmu_cpu_dying, }; @@ -1151,7 +1345,9 @@ static __init int intel_pmu_init(void) x86_pmu.event_constraints = intel_nehalem_event_constraints; x86_pmu.pebs_constraints = intel_nehalem_pebs_event_constraints; + x86_pmu.percore_constraints = intel_nehalem_percore_constraints; x86_pmu.enable_all = intel_pmu_nhm_enable_all; + x86_pmu.extra_regs = intel_nehalem_extra_regs; pr_cont("Nehalem events, "); break; @@ -1174,8 +1370,10 @@ static __init int intel_pmu_init(void) intel_pmu_lbr_init_nhm(); x86_pmu.event_constraints = intel_westmere_event_constraints; + x86_pmu.percore_constraints = intel_westmere_percore_constraints; x86_pmu.enable_all = intel_pmu_nhm_enable_all; x86_pmu.pebs_constraints = intel_westmere_pebs_event_constraints; + x86_pmu.extra_regs = intel_westmere_extra_regs; pr_cont("Westmere events, "); break; diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 8ceb5a6fd9c9..614615b8d42b 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -225,8 +225,14 @@ struct perf_event_attr { }; __u32 bp_type; - __u64 bp_addr; - __u64 bp_len; + union { + __u64 bp_addr; + __u64 config1; /* extension of config */ + }; + union { + __u64 bp_len; + __u64 config2; /* extension of config1 */ + }; }; /* @@ -541,6 +547,9 @@ struct hw_perf_event { unsigned long event_base; int idx; int last_cpu; + unsigned int extra_reg; + u64 extra_config; + int extra_alloc; }; 
struct { /* software */ struct hrtimer hrtimer; From e994d7d23a0bae34cd28834e85522ed4e782faf7 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Thu, 3 Mar 2011 10:34:48 +0800 Subject: [PATCH 539/706] perf: Fix LLC-* events on Intel Nehalem/Westmere On Intel Nehalem and Westmere CPUs the generic perf LLC-* events count the L2 caches, not the real L3 LLC - this was inconsistent with behavior on other CPUs. Fixing this requires the use of the special OFFCORE_RESPONSE events which need a separate mask register. This has been implemented by the previous patch, now use this infrastructure to set correct events for the LLC-* on Nehalem and Westmere. Signed-off-by: Andi Kleen Signed-off-by: Lin Ming Signed-off-by: Peter Zijlstra LKML-Reference: <1299119690-13991-3-git-send-email-ming.m.lin@intel.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event.c | 15 +++-- arch/x86/kernel/cpu/perf_event_intel.c | 81 ++++++++++++++++++++++---- 2 files changed, 79 insertions(+), 17 deletions(-) diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index ec6a6db07332..4d6ce5d612da 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -310,6 +310,10 @@ static u64 __read_mostly hw_cache_event_ids [PERF_COUNT_HW_CACHE_MAX] [PERF_COUNT_HW_CACHE_OP_MAX] [PERF_COUNT_HW_CACHE_RESULT_MAX]; +static u64 __read_mostly hw_cache_extra_regs + [PERF_COUNT_HW_CACHE_MAX] + [PERF_COUNT_HW_CACHE_OP_MAX] + [PERF_COUNT_HW_CACHE_RESULT_MAX]; /* * Propagate event elapsed time into the generic event. @@ -524,8 +528,9 @@ static inline int x86_pmu_initialized(void) } static inline int -set_ext_hw_attr(struct hw_perf_event *hwc, struct perf_event_attr *attr) +set_ext_hw_attr(struct hw_perf_event *hwc, struct perf_event *event) { + struct perf_event_attr *attr = &event->attr; unsigned int cache_type, cache_op, cache_result; u64 config, val; @@ -552,8 +557,8 @@ set_ext_hw_attr(struct hw_perf_event *hwc, struct perf_event_attr *attr) return -EINVAL; hwc->config |= val; - - return 0; + attr->config1 = hw_cache_extra_regs[cache_type][cache_op][cache_result]; + return x86_pmu_extra_regs(val, event); } static int x86_setup_perfctr(struct perf_event *event) @@ -578,10 +583,10 @@ static int x86_setup_perfctr(struct perf_event *event) } if (attr->type == PERF_TYPE_RAW) - return 0; + return x86_pmu_extra_regs(event->attr.config, event); if (attr->type == PERF_TYPE_HW_CACHE) - return set_ext_hw_attr(hwc, attr); + return set_ext_hw_attr(hwc, event); if (attr->config >= x86_pmu.max_events) return -EINVAL; diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c index 13cb6cf013f6..6e9b6763ff48 100644 --- a/arch/x86/kernel/cpu/perf_event_intel.c +++ b/arch/x86/kernel/cpu/perf_event_intel.c @@ -285,16 +285,26 @@ static __initconst const u64 westmere_hw_cache_event_ids }, [ C(LL ) ] = { [ C(OP_READ) ] = { - [ C(RESULT_ACCESS) ] = 0x0324, /* L2_RQSTS.LOADS */ - [ C(RESULT_MISS) ] = 0x0224, /* L2_RQSTS.LD_MISS */ + /* OFFCORE_RESPONSE_0.ANY_DATA.LOCAL_CACHE */ + [ C(RESULT_ACCESS) ] = 0x01b7, + /* OFFCORE_RESPONSE_1.ANY_DATA.ANY_LLC_MISS */ + [ C(RESULT_MISS) ] = 0x01bb, }, + /* + * Use RFO, not WRITEBACK, because a write miss would typically occur + * on RFO. 
+ */ [ C(OP_WRITE) ] = { - [ C(RESULT_ACCESS) ] = 0x0c24, /* L2_RQSTS.RFOS */ - [ C(RESULT_MISS) ] = 0x0824, /* L2_RQSTS.RFO_MISS */ + /* OFFCORE_RESPONSE_1.ANY_RFO.LOCAL_CACHE */ + [ C(RESULT_ACCESS) ] = 0x01bb, + /* OFFCORE_RESPONSE_0.ANY_RFO.ANY_LLC_MISS */ + [ C(RESULT_MISS) ] = 0x01b7, }, [ C(OP_PREFETCH) ] = { - [ C(RESULT_ACCESS) ] = 0x4f2e, /* LLC Reference */ - [ C(RESULT_MISS) ] = 0x412e, /* LLC Misses */ + /* OFFCORE_RESPONSE_0.PREFETCH.LOCAL_CACHE */ + [ C(RESULT_ACCESS) ] = 0x01b7, + /* OFFCORE_RESPONSE_1.PREFETCH.ANY_LLC_MISS */ + [ C(RESULT_MISS) ] = 0x01bb, }, }, [ C(DTLB) ] = { @@ -341,6 +351,39 @@ static __initconst const u64 westmere_hw_cache_event_ids }, }; +/* + * OFFCORE_RESPONSE MSR bits (subset), See IA32 SDM Vol 3 30.6.1.3 + */ + +#define DMND_DATA_RD (1 << 0) +#define DMND_RFO (1 << 1) +#define DMND_WB (1 << 3) +#define PF_DATA_RD (1 << 4) +#define PF_DATA_RFO (1 << 5) +#define RESP_UNCORE_HIT (1 << 8) +#define RESP_MISS (0xf600) /* non uncore hit */ + +static __initconst const u64 nehalem_hw_cache_extra_regs + [PERF_COUNT_HW_CACHE_MAX] + [PERF_COUNT_HW_CACHE_OP_MAX] + [PERF_COUNT_HW_CACHE_RESULT_MAX] = +{ + [ C(LL ) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = DMND_DATA_RD|RESP_UNCORE_HIT, + [ C(RESULT_MISS) ] = DMND_DATA_RD|RESP_MISS, + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = DMND_RFO|DMND_WB|RESP_UNCORE_HIT, + [ C(RESULT_MISS) ] = DMND_RFO|DMND_WB|RESP_MISS, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = PF_DATA_RD|PF_DATA_RFO|RESP_UNCORE_HIT, + [ C(RESULT_MISS) ] = PF_DATA_RD|PF_DATA_RFO|RESP_MISS, + }, + } +}; + static __initconst const u64 nehalem_hw_cache_event_ids [PERF_COUNT_HW_CACHE_MAX] [PERF_COUNT_HW_CACHE_OP_MAX] @@ -376,16 +419,26 @@ static __initconst const u64 nehalem_hw_cache_event_ids }, [ C(LL ) ] = { [ C(OP_READ) ] = { - [ C(RESULT_ACCESS) ] = 0x0324, /* L2_RQSTS.LOADS */ - [ C(RESULT_MISS) ] = 0x0224, /* L2_RQSTS.LD_MISS */ + /* OFFCORE_RESPONSE.ANY_DATA.LOCAL_CACHE */ + [ C(RESULT_ACCESS) ] = 0x01b7, + /* OFFCORE_RESPONSE.ANY_DATA.ANY_LLC_MISS */ + [ C(RESULT_MISS) ] = 0x01b7, }, + /* + * Use RFO, not WRITEBACK, because a write miss would typically occur + * on RFO. 
+ */ [ C(OP_WRITE) ] = { - [ C(RESULT_ACCESS) ] = 0x0c24, /* L2_RQSTS.RFOS */ - [ C(RESULT_MISS) ] = 0x0824, /* L2_RQSTS.RFO_MISS */ + /* OFFCORE_RESPONSE.ANY_RFO.LOCAL_CACHE */ + [ C(RESULT_ACCESS) ] = 0x01b7, + /* OFFCORE_RESPONSE.ANY_RFO.ANY_LLC_MISS */ + [ C(RESULT_MISS) ] = 0x01b7, }, [ C(OP_PREFETCH) ] = { - [ C(RESULT_ACCESS) ] = 0x4f2e, /* LLC Reference */ - [ C(RESULT_MISS) ] = 0x412e, /* LLC Misses */ + /* OFFCORE_RESPONSE.PREFETCH.LOCAL_CACHE */ + [ C(RESULT_ACCESS) ] = 0x01b7, + /* OFFCORE_RESPONSE.PREFETCH.ANY_LLC_MISS */ + [ C(RESULT_MISS) ] = 0x01b7, }, }, [ C(DTLB) ] = { @@ -1340,6 +1393,8 @@ static __init int intel_pmu_init(void) case 46: /* 45 nm nehalem-ex, "Beckton" */ memcpy(hw_cache_event_ids, nehalem_hw_cache_event_ids, sizeof(hw_cache_event_ids)); + memcpy(hw_cache_extra_regs, nehalem_hw_cache_extra_regs, + sizeof(hw_cache_extra_regs)); intel_pmu_lbr_init_nhm(); @@ -1366,6 +1421,8 @@ case 44: /* 32 nm nehalem, "Gulftown" */ memcpy(hw_cache_event_ids, westmere_hw_cache_event_ids, sizeof(hw_cache_event_ids)); + memcpy(hw_cache_extra_regs, nehalem_hw_cache_extra_regs, + sizeof(hw_cache_extra_regs)); intel_pmu_lbr_init_nhm(); From 51b361b4009f4e19ae68d2bcbb35e254e91b6054 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Fri, 4 Mar 2011 14:49:28 +0100 Subject: [PATCH 540/706] x86-64, NUMA: Fix numa_emulation code with node0 without RAM On a system that has no RAM on node0, with numa_emulation compiled in, either 1. booting without numa=fake... or 2. booting with numa=fake=128, so that emulation fails, will produce: [ 0.092026] ------------[ cut here ]------------ [ 0.096005] kernel BUG at arch/x86/mm/numa_emulation.c:439! [ 0.096005] invalid opcode: 0000 [#1] SMP [ 0.096005] last sysfs file: [ 0.096005] CPU 0 [ 0.096005] Modules linked in: [ 0.096005] [ 0.096005] Pid: 0, comm: swapper Not tainted 2.6.38-rc6-tip-yh-03869-gcb0491d-dirty #684 Sun Microsystems Sun Fire X4240/Sun Fire X4240 [ 0.096005] RIP: 0010:[] [] numa_add_cpu+0x56/0xcf [ 0.096005] RSP: 0000:ffffffff82437ed8 EFLAGS: 00010246 ... [ 0.096005] Call Trace: [ 0.096005] [] identify_cpu+0x2d7/0x2df [ 0.096005] [] identify_boot_cpu+0x10/0x30 [ 0.096005] [] check_bugs+0x9/0x2d [ 0.096005] [] start_kernel+0x3d7/0x3f1 [ 0.096005] [] x86_64_start_reservations+0x9c/0xa0 [ 0.096005] [] x86_64_start_kernel+0x1dd/0x1e8 [ 0.096005] Code: 74 06 48 8d 04 90 eb 0f 48 c7 c0 30 d9 00 00 48 03 04 d5 90 0f 60 82 8b 00 83 f8 ff 74 0d 0f a3 05 8b 7e 92 00 19 d2 85 d2 75 02 <0f> 0b 48 98 be 00 01 00 00 48 c7 c7 e0 44 60 82 44 8b 2c 85 e0 [ 0.096005] RIP [] numa_add_cpu+0x56/0xcf [ 0.096005] RSP [ 0.096026] ---[ end trace a7919e7f17c0a725 ]--- We need to use early_cpu_to_node() directly, because numa_cpu_node() will return node0, which is not onlined. 
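[Editor's note] A condensed view of the failing path, reconstructed from the diff below; the comments are ours:

void __cpuinit numa_add_cpu(int cpu)
{
	int physnid, nid;

	nid = numa_cpu_node(cpu);	/* reports node0 here: valid-looking,
					 * but RAM-less and never onlined */
	if (nid == NUMA_NO_NODE)	/* not taken, so no fallback happens */
		nid = early_cpu_to_node(cpu);

	BUG_ON(nid == NUMA_NO_NODE || !node_online(nid));	/* fires: line 439 */

	physnid = emu_nid_to_phys[nid];
	/* ... */
}

Calling early_cpu_to_node() unconditionally sidesteps the stale node0 answer, as the one-liner in the diff below does.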
Signed-off-by: Yinghai Lu Signed-off-by: Tejun Heo --- arch/x86/mm/numa_emulation.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/arch/x86/mm/numa_emulation.c b/arch/x86/mm/numa_emulation.c index aeecea93820f..75b31dc0f329 100644 --- a/arch/x86/mm/numa_emulation.c +++ b/arch/x86/mm/numa_emulation.c @@ -417,9 +417,7 @@ void __cpuinit numa_add_cpu(int cpu) { int physnid, nid; - nid = numa_cpu_node(cpu); - if (nid == NUMA_NO_NODE) - nid = early_cpu_to_node(cpu); + nid = early_cpu_to_node(cpu); BUG_ON(nid == NUMA_NO_NODE || !node_online(nid)); physnid = emu_nid_to_phys[nid]; From c09cedf4f75f1e47ea17f55e18e9cfb81bec8575 Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Fri, 4 Mar 2011 15:17:21 +0100 Subject: [PATCH 541/706] x86-64, NUMA: Clean up initmem_init() This patch cleans initmem_init() so that it is more readable and doesn't use an unnecessary array of function pointers that convolutes the flow of the code. It also makes it obvious that dummy_numa_init() will always succeed (and documents that requirement) so that the existing BUG() is never actually reached. No functional change. -tj: Updated comment for dummy_numa_init() slightly. Signed-off-by: David Rientjes Signed-off-by: Tejun Heo --- arch/x86/mm/numa_64.c | 98 +++++++++++++++++++++++++------------------ 1 file changed, 57 insertions(+), 41 deletions(-) diff --git a/arch/x86/mm/numa_64.c b/arch/x86/mm/numa_64.c index 86491ba568d9..9ec0f209a6a4 100644 --- a/arch/x86/mm/numa_64.c +++ b/arch/x86/mm/numa_64.c @@ -562,6 +562,15 @@ static int __init numa_register_memblks(struct numa_meminfo *mi) return 0; } +/** + * dummy_numa_init - Fallback dummy NUMA init + * + * Used if there's no underlying NUMA architecture, NUMA initialization + * fails, or NUMA is disabled on the command line. + * + * Must online at least one node and add memory blocks that cover all + * allowed memory. This function must not fail. 
+ */ static int __init dummy_numa_init(void) { printk(KERN_INFO "%s\n", @@ -575,57 +584,64 @@ static int __init dummy_numa_init(void) return 0; } +static int __init numa_init(int (*init_func)(void)) +{ + int i; + int ret; + + for (i = 0; i < MAX_LOCAL_APIC; i++) + set_apicid_to_node(i, NUMA_NO_NODE); + + nodes_clear(numa_nodes_parsed); + nodes_clear(node_possible_map); + nodes_clear(node_online_map); + memset(&numa_meminfo, 0, sizeof(numa_meminfo)); + remove_all_active_ranges(); + numa_reset_distance(); + + ret = init_func(); + if (ret < 0) + return ret; + ret = numa_cleanup_meminfo(&numa_meminfo); + if (ret < 0) + return ret; + + numa_emulation(&numa_meminfo, numa_distance_cnt); + + ret = numa_register_memblks(&numa_meminfo); + if (ret < 0) + return ret; + + for (i = 0; i < nr_cpu_ids; i++) { + int nid = early_cpu_to_node(i); + + if (nid == NUMA_NO_NODE) + continue; + if (!node_online(nid)) + numa_clear_node(i); + } + numa_init_array(); + return 0; +} + void __init initmem_init(void) { - int (*numa_init[])(void) = { [2] = dummy_numa_init }; - int i, j; + int ret; if (!numa_off) { #ifdef CONFIG_ACPI_NUMA - numa_init[0] = x86_acpi_numa_init; + ret = numa_init(x86_acpi_numa_init); + if (!ret) + return; #endif #ifdef CONFIG_AMD_NUMA - numa_init[1] = amd_numa_init; + ret = numa_init(amd_numa_init); + if (!ret) + return; #endif } - for (i = 0; i < ARRAY_SIZE(numa_init); i++) { - if (!numa_init[i]) - continue; - - for (j = 0; j < MAX_LOCAL_APIC; j++) - set_apicid_to_node(j, NUMA_NO_NODE); - - nodes_clear(numa_nodes_parsed); - nodes_clear(node_possible_map); - nodes_clear(node_online_map); - memset(&numa_meminfo, 0, sizeof(numa_meminfo)); - remove_all_active_ranges(); - numa_reset_distance(); - - if (numa_init[i]() < 0) - continue; - - if (numa_cleanup_meminfo(&numa_meminfo) < 0) - continue; - - numa_emulation(&numa_meminfo, numa_distance_cnt); - - if (numa_register_memblks(&numa_meminfo) < 0) - continue; - - for (j = 0; j < nr_cpu_ids; j++) { - int nid = early_cpu_to_node(j); - - if (nid == NUMA_NO_NODE) - continue; - if (!node_online(nid)) - numa_clear_node(j); - } - numa_init_array(); - return; - } - BUG(); + numa_init(dummy_numa_init); } unsigned long __init numa_free_all_bootmem(void) From 078a198906c796981f93ff100c210506e91aade5 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 4 Mar 2011 16:32:02 +0100 Subject: [PATCH 542/706] x86-64, NUMA: Don't assume phys node 0 is always online in numa_emulation() Undetermined entries in emu_nid_to_phys[] are filled with zero assuming that physical node 0 is always online; however, this might not be true depending on hardware configuration. Find a physical node which is actually online and use it instead. 
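[Editor's note] The selection logic, sketched in simplified form (the real loop is in the diff that follows):

/* Any emulated nid that already has a physical mapping points at an
 * online physical node, so reuse the first such mapping as the default
 * for the undetermined entries instead of assuming node 0. */
static int pick_default_phys_nid(void)
{
	int i;

	for (i = 0; i < ARRAY_SIZE(emu_nid_to_phys); i++)
		if (emu_nid_to_phys[i] != NUMA_NO_NODE)
			return emu_nid_to_phys[i];

	return NUMA_NO_NODE;	/* caller disables emulation in this case */
}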
Signed-off-by: Tejun Heo Reported-by: David Rientjes LKML-Reference: --- arch/x86/mm/numa_emulation.c | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) diff --git a/arch/x86/mm/numa_emulation.c b/arch/x86/mm/numa_emulation.c index 75b31dc0f329..3696be0c2204 100644 --- a/arch/x86/mm/numa_emulation.c +++ b/arch/x86/mm/numa_emulation.c @@ -301,6 +301,7 @@ void __init numa_emulation(struct numa_meminfo *numa_meminfo, int numa_dist_cnt) const u64 max_addr = max_pfn << PAGE_SHIFT; u8 *phys_dist = NULL; size_t phys_size = numa_dist_cnt * numa_dist_cnt * sizeof(phys_dist[0]); + int dfl_phys_nid; int i, j, ret; if (!emu_cmdline) @@ -357,6 +358,19 @@ void __init numa_emulation(struct numa_meminfo *numa_meminfo, int numa_dist_cnt) node_distance(i, j); } + /* determine the default phys nid to use for unmapped nodes */ + dfl_phys_nid = NUMA_NO_NODE; + for (i = 0; i < ARRAY_SIZE(emu_nid_to_phys); i++) { + if (emu_nid_to_phys[i] != NUMA_NO_NODE) { + dfl_phys_nid = emu_nid_to_phys[i]; + break; + } + } + if (dfl_phys_nid == NUMA_NO_NODE) { + pr_warning("NUMA: Warning: can't determine default physical node, disabling emulation\n"); + goto no_emu; + } + /* commit */ *numa_meminfo = ei; @@ -377,7 +391,7 @@ /* make sure all emulated nodes are mapped to a physical node */ for (i = 0; i < ARRAY_SIZE(emu_nid_to_phys); i++) if (emu_nid_to_phys[i] == NUMA_NO_NODE) - emu_nid_to_phys[i] = 0; + emu_nid_to_phys[i] = dfl_phys_nid; /* * Transform distance table. numa_set_distance() ignores all From ba74f4d7e5125d04d453b4af69c53c533e6feb80 Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Sun, 9 Jan 2011 18:09:51 -0800 Subject: [PATCH 543/706] rcu: call __rcu_read_unlock() in exit_rcu for tiny RCU Using __rcu_read_unlock() in place of rcu_read_unlock() leaves any debug state as it really should be, namely with the lock still held. Signed-off-by: Lai Jiangshan Signed-off-by: Paul E. McKenney --- kernel/rcutiny_plugin.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/rcutiny_plugin.h b/kernel/rcutiny_plugin.h index 015abaea962a..3cb8e362e883 100644 --- a/kernel/rcutiny_plugin.h +++ b/kernel/rcutiny_plugin.h @@ -852,7 +852,7 @@ void exit_rcu(void) if (t->rcu_read_lock_nesting == 0) return; t->rcu_read_lock_nesting = 1; - rcu_read_unlock(); + __rcu_read_unlock(); } #else /* #ifdef CONFIG_TINY_PREEMPT_RCU */ From 37743de384951ef97c040e69039820c987849501 Mon Sep 17 00:00:00 2001 From: Jesper Juhl Date: Mon, 10 Jan 2011 21:43:13 +0100 Subject: [PATCH 544/706] rcutorture: Get rid of duplicate sched.h include linux/sched.h is included twice in kernel/rcutorture.c - once is enough. Signed-off-by: Jesper Juhl Reviewed-by: Josh Triplett Signed-off-by: Paul E. McKenney --- kernel/rcutorture.c | 1 - 1 file changed, 1 deletion(-) diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c index 89613f97ff26..c224da41890c 100644 --- a/kernel/rcutorture.c +++ b/kernel/rcutorture.c @@ -47,7 +47,6 @@ #include #include #include -#include <linux/sched.h> MODULE_LICENSE("GPL") MODULE_AUTHOR("Paul E. McKenney and " From fea651267e52a88e7b81e01b6717d968254b6ddb Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Sun, 23 Jan 2011 22:35:45 -0800 Subject: [PATCH 545/706] rcu: add documentation saying which RCU flavor to choose Reported-by: Paul Mackerras Signed-off-by: Paul E. McKenney
--- Documentation/RCU/whatisRCU.txt | 31 +++++++++++++++++++++++++++++++ 1 file changed, 31 insertions(+) diff --git a/Documentation/RCU/whatisRCU.txt b/Documentation/RCU/whatisRCU.txt index cfaac34c4557..6ef692667e2f 100644 --- a/Documentation/RCU/whatisRCU.txt +++ b/Documentation/RCU/whatisRCU.txt @@ -849,6 +849,37 @@ All: lockdep-checked RCU-protected pointer access See the comment headers in the source code (or the docbook generated from them) for more information. +However, given that there are no fewer than four families of RCU APIs +in the Linux kernel, how do you choose which one to use? The following +list can be helpful: + +a. Will readers need to block? If so, you need SRCU. + +b. What about the -rt patchset? If readers would need to block + in a non-rt kernel, you need SRCU. If readers would block + in a -rt kernel, but not in a non-rt kernel, SRCU is not + necessary. + +c. Do you need to treat NMI handlers, hardirq handlers, + and code segments with preemption disabled (whether + via preempt_disable(), local_irq_save(), local_bh_disable(), + or some other mechanism) as if they were explicit RCU readers? + If so, you need RCU-sched. + +d. Do you need RCU grace periods to complete even in the face + of softirq monopolization of one or more of the CPUs? For + example, is your code subject to network-based denial-of-service + attacks? If so, you need RCU-bh. + +e. Is your workload too update-intensive for normal use of + RCU, but inappropriate for other synchronization mechanisms? + If so, consider SLAB_DESTROY_BY_RCU. But please be careful! + +f. Otherwise, use RCU. + +Of course, this all assumes that you have determined that RCU is in fact +the right tool for your job. + 8. ANSWERS TO QUICK QUIZZES From fe8e64071a4ff5925ced4f33cb32ba090a04649c Mon Sep 17 00:00:00 2001 From: Amerigo Wang Date: Sun, 30 Jan 2011 18:23:08 +0800 Subject: [PATCH 546/706] rcupdate: remove dead code DEBUG_OBJECTS_RCU_HEAD depends on PREEMPT, so #ifndef CONFIG_PREEMPT is totally useless in kernel/rcupdate.c. Signed-off-by: WANG Cong Cc: Paul E. McKenney Signed-off-by: Paul E. McKenney --- kernel/rcupdate.c | 5 ----- 1 file changed, 5 deletions(-) diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c index a23a57a976d1..afd21d17c081 100644 --- a/kernel/rcupdate.c +++ b/kernel/rcupdate.c @@ -215,10 +215,6 @@ static int rcuhead_fixup_free(void *addr, enum debug_obj_state state) * If we detect that we are nested in a RCU read-side critical * section, we should simply fail, otherwise we would deadlock. */ -#ifndef CONFIG_PREEMPT - WARN_ON(1); - return 0; -#else if (rcu_preempt_depth() != 0 || preempt_count() != 0 || irqs_disabled()) { WARN_ON(1); @@ -229,7 +225,6 @@ static int rcuhead_fixup_free(void *addr, enum debug_obj_state state) rcu_barrier_bh(); debug_object_free(head, &rcuhead_debug_descr); return 1; -#endif default: return 0; } From e611eecd6f9f91d74beda8cfbb3b5e02abdeb5a1 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Sun, 30 Jan 2011 23:56:46 -0800 Subject: [PATCH 547/706] rcu: add comment saying why DEBUG_OBJECTS_RCU_HEAD depends on PREEMPT. The build will break if you change the Kconfig to allow DEBUG_OBJECTS_RCU_HEAD and !PREEMPT, so document the reasoning near where the breakage would occur. Signed-off-by: Paul E. McKenney
--- kernel/rcupdate.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c index afd21d17c081..f3240e987928 100644 --- a/kernel/rcupdate.c +++ b/kernel/rcupdate.c @@ -214,6 +214,11 @@ static int rcuhead_fixup_free(void *addr, enum debug_obj_state state) * Ensure that queued callbacks are all executed. * If we detect that we are nested in a RCU read-side critical * section, we should simply fail, otherwise we would deadlock. + * Note that the machinery to reliably determine whether + * or not we are in an RCU read-side critical section + * exists only in the preemptible RCU implementations + * (TINY_PREEMPT_RCU and TREE_PREEMPT_RCU), which is why + * DEBUG_OBJECTS_RCU_HEAD is disallowed if !PREEMPT. */ if (rcu_preempt_depth() != 0 || preempt_count() != 0 || irqs_disabled()) { From 241e6663b5151733294d1a230a3fd8a4d32e187f Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 10 Feb 2011 16:54:50 -0800 Subject: [PATCH 548/706] smp: Document transitivity for memory barriers. Transitivity is guaranteed only for full memory barriers (smp_mb()). Signed-off-by: Paul E. McKenney --- Documentation/memory-barriers.txt | 58 +++++++++++++++++++++++++++++++ 1 file changed, 58 insertions(+) diff --git a/Documentation/memory-barriers.txt b/Documentation/memory-barriers.txt index 631ad2f1b229..f0d3a8026a56 100644 --- a/Documentation/memory-barriers.txt +++ b/Documentation/memory-barriers.txt @@ -21,6 +21,7 @@ Contents: - SMP barrier pairing. - Examples of memory barrier sequences. - Read memory barriers vs load speculation. + - Transitivity (*) Explicit kernel barriers. @@ -959,6 +960,63 @@ the speculation will be cancelled and the value reloaded: retrieved : : +-------+ +TRANSITIVITY +------------ + +Transitivity is a deeply intuitive notion about ordering that is not +always provided by real computer systems. The following example +demonstrates transitivity (also called "cumulativity"): + + CPU 1 CPU 2 CPU 3 + ======================= ======================= ======================= + { X = 0, Y = 0 } + STORE X=1 LOAD X STORE Y=1 + <general barrier> <general barrier> + LOAD Y LOAD X + +Suppose that CPU 2's load from X returns 1 and its load from Y returns 0. +This indicates that CPU 2's load from X in some sense follows CPU 1's +store to X and that CPU 2's load from Y in some sense preceded CPU 3's +store to Y. The question is then "Can CPU 3's load from X return 0?" + +Because CPU 2's load from X in some sense came after CPU 1's store, it +is natural to expect that CPU 3's load from X must therefore return 1. +This expectation is an example of transitivity: if a load executing on +CPU A follows a load from the same variable executing on CPU B, then +CPU A's load must either return the same value that CPU B's load did, +or must return some later value. + +In the Linux kernel, use of general memory barriers guarantees +transitivity. Therefore, in the above example, if CPU 2's load from X +returns 1 and its load from Y returns 0, then CPU 3's load from X must +also return 1. + +However, transitivity is -not- guaranteed for read or write barriers. 
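[Editor's aside, not part of the patch] The same three-CPU example written as C with era-appropriate kernel primitives (ACCESS_ONCE() and smp_mb(); x and y are shared ints initialised to zero, and the per-CPU function framing is ours). The document resumes below with the read-barrier variant.

int x, y;

void cpu1(void)
{
	ACCESS_ONCE(x) = 1;		/* STORE X=1 */
}

void cpu2(int *r1, int *r2)
{
	*r1 = ACCESS_ONCE(x);		/* LOAD X */
	smp_mb();			/* general barrier */
	*r2 = ACCESS_ONCE(y);		/* LOAD Y */
}

void cpu3(int *r3)
{
	ACCESS_ONCE(y) = 1;		/* STORE Y=1 */
	smp_mb();			/* general barrier */
	*r3 = ACCESS_ONCE(x);		/* LOAD X */
}

/* With both general barriers in place, the outcome
 * r1 == 1 && r2 == 0 && r3 == 0 is forbidden: transitivity holds. */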
+For example, suppose that CPU 2's general barrier in the above example +is changed to a read barrier as shown below: + + CPU 1 CPU 2 CPU 3 + ======================= ======================= ======================= + { X = 0, Y = 0 } + STORE X=1 LOAD X STORE Y=1 + <read barrier> <general barrier> + LOAD Y LOAD X + +This substitution destroys transitivity: in this example, it is perfectly +legal for CPU 2's load from X to return 1, its load from Y to return 0, +and CPU 3's load from X to return 0. + +The key point is that although CPU 2's read barrier orders its pair +of loads, it does not guarantee to order CPU 1's store. Therefore, if +this example runs on a system where CPUs 1 and 2 share a store buffer +or a level of cache, CPU 2 might have early access to CPU 1's writes. +General barriers are therefore required to ensure that all CPUs agree +on the combined order of CPU 1's and CPU 2's accesses. + +To reiterate, if your code requires transitivity, use general barriers +throughout. + + ======================== EXPLICIT KERNEL BARRIERS ======================== From 6909262429b70a162e9e7053672cfd8024c9275d Mon Sep 17 00:00:00 2001 From: Lin Ming Date: Thu, 3 Mar 2011 10:34:50 +0800 Subject: [PATCH 549/706] perf: Avoid the percore allocations if the CPU is not HT capable Signed-off-by: Lin Ming Signed-off-by: Peter Zijlstra LKML-Reference: <1299119690-13991-5-git-send-email-ming.m.lin@intel.com> Signed-off-by: Ingo Molnar --- arch/x86/include/asm/smp.h | 10 ++++++++++ arch/x86/kernel/cpu/perf_event.c | 1 + arch/x86/kernel/cpu/perf_event_intel.c | 18 ++++++++++++------ 3 files changed, 23 insertions(+), 6 deletions(-) diff --git a/arch/x86/include/asm/smp.h b/arch/x86/include/asm/smp.h index 1f4695136776..c1bbfa89a0e2 100644 --- a/arch/x86/include/asm/smp.h +++ b/arch/x86/include/asm/smp.h @@ -17,10 +17,20 @@ #endif #include #include +#include <asm/cpufeature.h> extern int smp_num_siblings; extern unsigned int num_processors; +static inline bool cpu_has_ht_siblings(void) +{ + bool has_siblings = false; +#ifdef CONFIG_SMP + has_siblings = cpu_has_ht && smp_num_siblings > 1; +#endif + return has_siblings; +} + DECLARE_PER_CPU(cpumask_var_t, cpu_sibling_map); DECLARE_PER_CPU(cpumask_var_t, cpu_core_map); DECLARE_PER_CPU(u16, cpu_llc_id); diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index 4d6ce5d612da..26604188aa49 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -30,6 +30,7 @@ #include #include #include +#include <asm/smp.h> #if 0 #undef wrmsrl diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c index 6e9b6763ff48..8fc2b2cee1da 100644 --- a/arch/x86/kernel/cpu/perf_event_intel.c +++ b/arch/x86/kernel/cpu/perf_event_intel.c @@ -1205,6 +1205,9 @@ static int intel_pmu_cpu_prepare(int cpu) { struct cpu_hw_events *cpuc = &per_cpu(cpu_hw_events, cpu); + if (!cpu_has_ht_siblings()) + return NOTIFY_OK; + cpuc->per_core = kzalloc_node(sizeof(struct intel_percore), GFP_KERNEL, cpu_to_node(cpu)); if (!cpuc->per_core) @@ -1221,6 +1224,15 @@ static void intel_pmu_cpu_starting(int cpu) int core_id = topology_core_id(cpu); int i; + init_debug_store_on_cpu(cpu); + /* + * Deal with CPUs that don't clear their LBRs on power-up. 
+ */ + intel_pmu_lbr_reset(); + + if (!cpu_has_ht_siblings()) + return; + for_each_cpu(i, topology_thread_cpumask(cpu)) { struct intel_percore *pc = per_cpu(cpu_hw_events, i).per_core; @@ -1233,12 +1245,6 @@ static void intel_pmu_cpu_starting(int cpu) cpuc->per_core->core_id = core_id; cpuc->per_core->refcnt++; - - init_debug_store_on_cpu(cpu); - /* - * Deal with CPUs that don't clear their LBRs on power-up. - */ - intel_pmu_lbr_reset(); } static void intel_pmu_cpu_dying(int cpu) From ac23f25355ef53f3d14352fcff3c6817527a9749 Mon Sep 17 00:00:00 2001 From: Jan Beulich Date: Fri, 4 Mar 2011 15:52:35 +0000 Subject: [PATCH 550/706] x86: Really print supported CPUs if PROCESSOR_SELECT=y I'm sure it was a mere oversight that the CONFIG_ prefixes are missing. Signed-off-by: Jan Beulich Cc: Dave Jones LKML-Reference: <4D7118D30200007800034F79@vpn.id2.novell.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/common.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 1d59834396bd..5d98c46f876d 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -675,7 +675,7 @@ void __init early_cpu_init(void) const struct cpu_dev *const *cdev; int count = 0; -#ifdef PROCESSOR_SELECT +#ifdef CONFIG_PROCESSOR_SELECT printk(KERN_INFO "KERNEL supported cpus:\n"); #endif @@ -687,7 +687,7 @@ void __init early_cpu_init(void) cpu_devs[count] = cpudev; count++; -#ifdef PROCESSOR_SELECT +#ifdef CONFIG_PROCESSOR_SELECT { unsigned int j; From a03f35ceeb3d279da35c5a914ac01a4b1effb0a1 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Thu, 3 Mar 2011 16:43:03 -0300 Subject: [PATCH 551/706] perf report tui: Fix multi event switching TAB/UNTAB were not hotkeys, so they did not exit hists__browse back to hists__tui_browse_tree, and only the first event could be browsed. 
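[Editor's note] For readers not versed in the newt UI code, a sketch of the browser loop this fixes; ui_browser__getch() and ui_browser__handle_key() are hypothetical stand-ins for the real helpers:

/* Only keys listed in exit_keys[] are returned to the caller; anything
 * else is consumed inside the loop. Before this fix, TAB/UNTAB fell in
 * the consumed class, so hists__tui_browse_tree() never saw them and
 * could never switch to the next event's hists. */
static int browser_run_sketch(struct ui_browser *b, const int *exit_keys)
{
	for (;;) {
		int key = ui_browser__getch(b);		/* hypothetical */
		const int *k;

		for (k = exit_keys; *k; k++)
			if (key == *k)
				return key;		/* caller handles it */

		ui_browser__handle_key(b, key);		/* hypothetical */
	}
}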
Reported-by: William Cohen Cc: Frederic Weisbecker Cc: Ingo Molnar Cc: Mike Galbraith Cc: Paul Mackerras Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Tom Zanussi Cc: William Cohen LKML-Reference: Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/ui/browsers/hists.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/tools/perf/util/ui/browsers/hists.c b/tools/perf/util/ui/browsers/hists.c index 497b3c4076a3..c27ab35042f1 100644 --- a/tools/perf/util/ui/browsers/hists.c +++ b/tools/perf/util/ui/browsers/hists.c @@ -292,7 +292,8 @@ static int hist_browser__run(struct hist_browser *self, const char *title) { int key; int exit_keys[] = { 'a', '?', 'h', 'C', 'd', 'D', 'E', 't', - NEWT_KEY_ENTER, NEWT_KEY_RIGHT, NEWT_KEY_LEFT, 0, }; + NEWT_KEY_ENTER, NEWT_KEY_RIGHT, NEWT_KEY_LEFT, + NEWT_KEY_TAB, NEWT_KEY_UNTAB, 0, }; self->b.entries = &self->hists->entries; self->b.nr_entries = self->hists->nr_entries; @@ -859,6 +860,7 @@ int hists__browse(struct hists *self, const char *helpline, "E Expand all callchains\n" "d Zoom into current DSO\n" "t Zoom into current Thread\n" + "TAB/UNTAB Switch events\n" "q/CTRL+C Exit browser"); continue; case NEWT_KEY_ENTER: @@ -997,6 +999,7 @@ int hists__tui_browse_tree(struct rb_root *self, const char *help, int evidx) if (nd == first) continue; nd = rb_prev(nd); + break; default: return key; } From d7603d5122d9700fb8f36fa08b04f4e900fef059 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Fri, 4 Mar 2011 14:51:33 -0300 Subject: [PATCH 552/706] perf hists: Remove needless global col length calcs To support multiple events we need to do these calcs per 'struct hists' instance, and it turns out we already do that at: __hists__add_entry hists__inc_nr_entries hists__calc_col_len for all the unfiltered hist_entry instances we stash in the rb tree, so throw away the dead code. 
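[Editor's note] The per-instance path the message names, compressed into one sketch (the real functions live in tools/perf/util/hist.c; the body here is simplified):

/* Every entry added to a hists instance updates that instance's column
 * widths on the spot, via __hists__add_entry() calling this helper,
 * which is what makes a separate global pre-pass over DSO and comm
 * name lengths redundant. */
static void hists__inc_nr_entries_sketch(struct hists *hists,
					 struct hist_entry *h)
{
	++hists->nr_entries;
	hists__calc_col_len(hists, h);	/* per 'struct hists' widths */
}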
Cc: Frederic Weisbecker Cc: Ingo Molnar Cc: Mike Galbraith Cc: Paul Mackerras Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Tom Zanussi LKML-Reference: Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/event.c | 57 +--------------------------------------- tools/perf/util/hist.c | 9 +++++++ tools/perf/util/symbol.c | 1 - tools/perf/util/symbol.h | 1 - 4 files changed, 10 insertions(+), 58 deletions(-) diff --git a/tools/perf/util/event.c b/tools/perf/util/event.c index fbf5754c8866..2b15c362ef56 100644 --- a/tools/perf/util/event.c +++ b/tools/perf/util/event.c @@ -424,33 +424,6 @@ int perf_event__synthesize_kernel_mmap(perf_event__handler_t process, return err; } -static void thread__comm_adjust(struct thread *self, struct hists *hists) -{ - char *comm = self->comm; - - if (!symbol_conf.col_width_list_str && !symbol_conf.field_sep && - (!symbol_conf.comm_list || - strlist__has_entry(symbol_conf.comm_list, comm))) { - u16 slen = strlen(comm); - - if (hists__new_col_len(hists, HISTC_COMM, slen)) - hists__set_col_len(hists, HISTC_THREAD, slen + 6); - } -} - -static int thread__set_comm_adjust(struct thread *self, const char *comm, - struct hists *hists) -{ - int ret = thread__set_comm(self, comm); - - if (ret) - return ret; - - thread__comm_adjust(self, hists); - - return 0; -} - int perf_event__process_comm(union perf_event *event, struct perf_sample *sample __used, struct perf_session *session) @@ -459,8 +432,7 @@ int perf_event__process_comm(union perf_event *event, dump_printf(": %s:%d\n", event->comm.comm, event->comm.tid); - if (thread == NULL || thread__set_comm_adjust(thread, event->comm.comm, - &session->hists)) { + if (thread == NULL || thread__set_comm(thread, event->comm.comm)) { dump_printf("problem processing PERF_RECORD_COMM, skipping event.\n"); return -1; } @@ -760,18 +732,6 @@ void thread__find_addr_location(struct thread *self, al->sym = NULL; } -static void dso__calc_col_width(struct dso *self, struct hists *hists) -{ - if (!symbol_conf.col_width_list_str && !symbol_conf.field_sep && - (!symbol_conf.dso_list || - strlist__has_entry(symbol_conf.dso_list, self->name))) { - u16 slen = dso__name_len(self); - hists__new_col_len(hists, HISTC_DSO, slen); - } - - self->slen_calculated = 1; -} - int perf_event__preprocess_sample(const union perf_event *event, struct perf_session *session, struct addr_location *al, @@ -817,23 +777,8 @@ int perf_event__preprocess_sample(const union perf_event *event, strlist__has_entry(symbol_conf.dso_list, al->map->dso->long_name))))) goto out_filtered; - /* - * We have to do this here as we may have a dso with no symbol - * hit that has a name longer than the ones with symbols - * sampled. 
- */ - if (!sort_dso.elide && !al->map->dso->slen_calculated) - dso__calc_col_width(al->map->dso, &session->hists); al->sym = map__find_symbol(al->map, al->addr, filter); - } else { - const unsigned int unresolved_col_width = BITS_PER_LONG / 4; - - if (hists__col_len(&session->hists, HISTC_DSO) < unresolved_col_width && - !symbol_conf.col_width_list_str && !symbol_conf.field_sep && - !symbol_conf.dso_list) - hists__set_col_len(&session->hists, HISTC_DSO, - unresolved_col_width); } if (symbol_conf.sym_list && al->sym && diff --git a/tools/perf/util/hist.c b/tools/perf/util/hist.c index da2899e8c6f8..f7ad6bdbc667 100644 --- a/tools/perf/util/hist.c +++ b/tools/perf/util/hist.c @@ -50,6 +50,15 @@ static void hists__calc_col_len(struct hists *self, struct hist_entry *h) if (h->ms.sym) hists__new_col_len(self, HISTC_SYMBOL, h->ms.sym->namelen); + else { + const unsigned int unresolved_col_width = BITS_PER_LONG / 4; + + if (hists__col_len(self, HISTC_DSO) < unresolved_col_width && + !symbol_conf.col_width_list_str && !symbol_conf.field_sep && + !symbol_conf.dso_list) + hists__set_col_len(self, HISTC_DSO, + unresolved_col_width); + } len = thread__comm_len(h->thread); if (hists__new_col_len(self, HISTC_COMM, len)) diff --git a/tools/perf/util/symbol.c b/tools/perf/util/symbol.c index ba6d48949092..00014e32c288 100644 --- a/tools/perf/util/symbol.c +++ b/tools/perf/util/symbol.c @@ -207,7 +207,6 @@ struct dso *dso__new(const char *name) dso__set_short_name(self, self->name); for (i = 0; i < MAP__NR_TYPES; ++i) self->symbols[i] = self->symbol_names[i] = RB_ROOT; - self->slen_calculated = 0; self->origin = DSO__ORIG_NOT_FOUND; self->loaded = 0; self->sorted_by_name = 0; diff --git a/tools/perf/util/symbol.h b/tools/perf/util/symbol.h index 670cd1c88f54..4d7ed09fe332 100644 --- a/tools/perf/util/symbol.h +++ b/tools/perf/util/symbol.h @@ -132,7 +132,6 @@ struct dso { struct rb_root symbol_names[MAP__NR_TYPES]; enum dso_kernel_type kernel; u8 adjust_symbols:1; - u8 slen_calculated:1; u8 has_build_id:1; u8 hit:1; u8 annotate_warned:1; From 60098917c06d154d06ce030c125266eab9e60768 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Fri, 4 Mar 2011 21:19:21 -0300 Subject: [PATCH 553/706] perf hists browser: Handle browsing empty hists tree Cc: Frederic Weisbecker Cc: Ingo Molnar Cc: Mike Galbraith Cc: Paul Mackerras Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Tom Zanussi LKML-Reference: Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/ui/browsers/hists.c | 22 +++++++++++++++------- 1 file changed, 15 insertions(+), 7 deletions(-) diff --git a/tools/perf/util/ui/browsers/hists.c b/tools/perf/util/ui/browsers/hists.c index c27ab35042f1..c98e6f81d285 100644 --- a/tools/perf/util/ui/browsers/hists.c +++ b/tools/perf/util/ui/browsers/hists.c @@ -639,6 +639,9 @@ static void ui_browser__hists_seek(struct ui_browser *self, struct rb_node *nd; bool first = true; + if (self->nr_entries == 0) + return; + switch (whence) { case SEEK_SET: nd = hists__filter_entries(rb_first(self->entries)); @@ -820,8 +823,8 @@ int hists__browse(struct hists *self, const char *helpline, hists__browser_title(self, msg, sizeof(msg), ev_name, dso_filter, thread_filter); while (1) { - const struct thread *thread; - const struct dso *dso; + const struct thread *thread = NULL; + const struct dso *dso = NULL; char *options[16]; int nr_options = 0, choice = 0, i, annotate = -2, zoom_dso = -2, zoom_thread = -2, @@ -829,8 +832,10 @@ int hists__browse(struct hists *self, const char *helpline, key = hist_browser__run(browser, 
msg); - thread = hist_browser__selected_thread(browser); - dso = browser->selection->map ? browser->selection->map->dso : NULL; + if (browser->he_selection != NULL) { + thread = hist_browser__selected_thread(browser); + dso = browser->selection->map ? browser->selection->map->dso : NULL; + } switch (key) { case NEWT_KEY_TAB: @@ -841,7 +846,8 @@ int hists__browse(struct hists *self, const char *helpline, */ goto out_free_stack; case 'a': - if (browser->selection->map == NULL && + if (browser->selection == NULL || + browser->selection->map == NULL || browser->selection->map->dso->annotate_warned) continue; goto do_annotate; @@ -887,7 +893,8 @@ int hists__browse(struct hists *self, const char *helpline, goto out_free_stack; } - if (browser->selection->sym != NULL && + if (browser->selection != NULL && + browser->selection->sym != NULL && !browser->selection->map->dso->annotate_warned && asprintf(&options[nr_options], "Annotate %s", browser->selection->sym->name) > 0) @@ -906,7 +913,8 @@ int hists__browse(struct hists *self, const char *helpline, (dso->kernel ? "the Kernel" : dso->short_name)) > 0) zoom_dso = nr_options++; - if (browser->selection->map != NULL && + if (browser->selection != NULL && + browser->selection->map != NULL && asprintf(&options[nr_options], "Browse map details") > 0) browse_map = nr_options++; From 3d3b5e95997208067c963923db90ed1517565d14 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Fri, 4 Mar 2011 22:29:39 -0300 Subject: [PATCH 554/706] perf evlist: Split perf_evlist__id_hash The previous situation was to receive an fd from where to read the event ID. Spin off a routine for when we have the ID handy, not having to read it from some fd. Cc: Frederic Weisbecker Cc: Ingo Molnar Cc: Mike Galbraith Cc: Paul Mackerras Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Tom Zanussi LKML-Reference: Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/evlist.c | 28 ++++++++++++++++++---------- tools/perf/util/evlist.h | 3 +++ 2 files changed, 21 insertions(+), 10 deletions(-) diff --git a/tools/perf/util/evlist.c b/tools/perf/util/evlist.c index 030ae7f05e03..190c64c6e266 100644 --- a/tools/perf/util/evlist.c +++ b/tools/perf/util/evlist.c @@ -106,12 +106,24 @@ void perf_evlist__add_pollfd(struct perf_evlist *evlist, int fd) evlist->nr_fds++; } -static int perf_evlist__id_hash(struct perf_evlist *evlist, struct perf_evsel *evsel, - int cpu, int thread, int fd) +void perf_evlist__id_hash(struct perf_evlist *evlist, struct perf_evsel *evsel, + int cpu, int thread, u64 id) +{ + int hash; + struct perf_sample_id *sid = SID(evsel, cpu, thread); + + sid->id = id; + sid->evsel = evsel; + hash = hash_64(sid->id, PERF_EVLIST__HLIST_BITS); + hlist_add_head(&sid->node, &evlist->heads[hash]); +} + +static int perf_evlist__id_hash_fd(struct perf_evlist *evlist, + struct perf_evsel *evsel, + int cpu, int thread, int fd) { - struct perf_sample_id *sid; u64 read_data[4] = { 0, }; - int hash, id_idx = 1; /* The first entry is the counter value */ + int id_idx = 1; /* The first entry is the counter value */ if (!(evsel->attr.read_format & PERF_FORMAT_ID) || read(fd, &read_data, sizeof(read_data)) == -1) @@ -122,11 +134,7 @@ static int perf_evlist__id_hash(struct perf_evlist *evlist, struct perf_evsel *e if (evsel->attr.read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) ++id_idx; - sid = SID(evsel, cpu, thread); - sid->id = read_data[id_idx]; - sid->evsel = evsel; - hash = hash_64(sid->id, PERF_EVLIST__HLIST_BITS); - hlist_add_head(&sid->node, &evlist->heads[hash]); + 
perf_evlist__id_hash(evlist, evsel, cpu, thread, read_data[id_idx]); return 0; } @@ -300,7 +308,7 @@ int perf_evlist__mmap(struct perf_evlist *evlist, int pages, bool overwrite) goto out_unmap; if ((evsel->attr.read_format & PERF_FORMAT_ID) && - perf_evlist__id_hash(evlist, evsel, cpu, thread, fd) < 0) + perf_evlist__id_hash_fd(evlist, evsel, cpu, thread, fd) < 0) goto out_unmap; } } diff --git a/tools/perf/util/evlist.h b/tools/perf/util/evlist.h index b75805aeb7e4..078d51256085 100644 --- a/tools/perf/util/evlist.h +++ b/tools/perf/util/evlist.h @@ -38,6 +38,9 @@ void perf_evlist__delete(struct perf_evlist *evlist); void perf_evlist__add(struct perf_evlist *evlist, struct perf_evsel *entry); int perf_evlist__add_default(struct perf_evlist *evlist); +void perf_evlist__id_hash(struct perf_evlist *evlist, struct perf_evsel *evsel, + int cpu, int thread, u64 id); + int perf_evlist__alloc_pollfd(struct perf_evlist *evlist); void perf_evlist__add_pollfd(struct perf_evlist *evlist, int fd); From e248de331a452f8771eda6ed4bb30d92c82df28b Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Sat, 5 Mar 2011 21:40:06 -0300 Subject: [PATCH 555/706] perf tools: Improve support for sessions with multiple events By creating a perf_evlist out of the attributes in the perf.data file header, so that we can use evlists and evsels when reading recorded sessions in addition to when we record sessions. More work is needed to allow tools to allow the user to select which events are wanted when browsing sessions, be it just one or a subset of them, aggregated or shown at the same time but with different indications on the UI to allow seeing workloads through different views at the same time. But the overall goal/trend is to more uniformly use evsels and evlists. Cc: Frederic Weisbecker Cc: Ingo Molnar Cc: Mike Galbraith Cc: Paul Mackerras Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Tom Zanussi LKML-Reference: Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-annotate.c | 80 ++++++++++---- tools/perf/builtin-report.c | 160 +++++++++------------------- tools/perf/util/evsel.h | 2 + tools/perf/util/header.c | 31 ------ tools/perf/util/header.h | 2 - tools/perf/util/hist.c | 6 +- tools/perf/util/hist.h | 12 +-- tools/perf/util/session.c | 63 ++++++++++- tools/perf/util/session.h | 16 ++- tools/perf/util/ui/browsers/hists.c | 32 +++--- 10 files changed, 208 insertions(+), 196 deletions(-) diff --git a/tools/perf/builtin-annotate.c b/tools/perf/builtin-annotate.c index 427182953fd7..695de4b5ae63 100644 --- a/tools/perf/builtin-annotate.c +++ b/tools/perf/builtin-annotate.c @@ -19,6 +19,8 @@ #include "perf.h" #include "util/debug.h" +#include "util/evlist.h" +#include "util/evsel.h" #include "util/annotate.h" #include "util/event.h" #include "util/parse-options.h" @@ -38,9 +40,13 @@ static bool print_line; static const char *sym_hist_filter; -static int hists__add_entry(struct hists *self, struct addr_location *al) +static int perf_evlist__add_sample(struct perf_evlist *evlist, + struct perf_sample *sample, + struct addr_location *al) { + struct perf_evsel *evsel; struct hist_entry *he; + int ret; if (sym_hist_filter != NULL && (al->sym == NULL || strcmp(sym_hist_filter, al->sym->name) != 0)) { @@ -53,23 +59,35 @@ static int hists__add_entry(struct hists *self, struct addr_location *al) return 0; } - he = __hists__add_entry(self, al, NULL, 1); + evsel = perf_evlist__id2evsel(evlist, sample->id); + if (evsel == NULL) { + /* + * FIXME: Propagate this back, but at least we're in a builtin, + * where 
exit() is allowed. ;-) + */ + ui__warning("Invalid %s file, contains samples with id not in " + "its header!\n", input_name); + exit_browser(0); + exit(1); + } + + he = __hists__add_entry(&evsel->hists, al, NULL, 1); if (he == NULL) return -ENOMEM; + ret = 0; if (he->ms.sym != NULL) { - /* - * All aggregated on the first sym_hist. - */ struct annotation *notes = symbol__annotation(he->ms.sym); if (notes->src == NULL && - symbol__alloc_hist(he->ms.sym, 1) < 0) + symbol__alloc_hist(he->ms.sym, evlist->nr_entries) < 0) return -ENOMEM; - return hist_entry__inc_addr_samples(he, 0, al->addr); + ret = hist_entry__inc_addr_samples(he, evsel->idx, al->addr); } - return 0; + evsel->hists.stats.total_period += sample->period; + hists__inc_nr_events(&evsel->hists, PERF_RECORD_SAMPLE); + return ret; } static int process_sample_event(union perf_event *event, @@ -85,7 +103,7 @@ static int process_sample_event(union perf_event *event, return -1; } - if (!al.filtered && hists__add_entry(&session->hists, &al)) { + if (!al.filtered && perf_evlist__add_sample(session->evlist, sample, &al)) { pr_warning("problem incrementing symbol count, " "skipping event\n"); return -1; @@ -100,7 +118,7 @@ static int hist_entry__tty_annotate(struct hist_entry *he, int evidx) print_line, full_paths, 0, 0); } -static void hists__find_annotations(struct hists *self) +static void hists__find_annotations(struct hists *self, int evidx) { struct rb_node *nd = rb_first(&self->entries), *next; int key = KEY_RIGHT; @@ -123,8 +141,7 @@ find_next: } if (use_browser > 0) { - /* For now all is aggregated on the first */ - key = hist_entry__tui_annotate(he, 0); + key = hist_entry__tui_annotate(he, evidx); switch (key) { case KEY_RIGHT: next = rb_next(nd); @@ -139,8 +156,7 @@ find_next: if (next != NULL) nd = next; } else { - /* For now all is aggregated on the first */ - hist_entry__tty_annotate(he, 0); + hist_entry__tty_annotate(he, evidx); nd = rb_next(nd); /* * Since we have a hist_entry per IP for the same @@ -166,6 +182,8 @@ static int __cmd_annotate(void) { int ret; struct perf_session *session; + struct perf_evsel *pos; + u64 total_nr_samples; session = perf_session__new(input_name, O_RDONLY, force, false, &event_ops); if (session == NULL) @@ -186,12 +204,36 @@ static int __cmd_annotate(void) if (verbose > 2) perf_session__fprintf_dsos(session, stdout); - hists__collapse_resort(&session->hists); - hists__output_resort(&session->hists); - hists__find_annotations(&session->hists); -out_delete: - perf_session__delete(session); + total_nr_samples = 0; + list_for_each_entry(pos, &session->evlist->entries, node) { + struct hists *hists = &pos->hists; + u32 nr_samples = hists->stats.nr_events[PERF_RECORD_SAMPLE]; + if (nr_samples > 0) { + total_nr_samples += nr_samples; + hists__collapse_resort(hists); + hists__output_resort(hists); + hists__find_annotations(hists, pos->idx); + } + } + + if (total_nr_samples == 0) { + ui__warning("The %s file has no samples!\n", input_name); + goto out_delete; + } +out_delete: + /* + * Speed up the exit process, for large files this can + * take quite a while. + * + * XXX Enable this when using valgrind or if we ever + * librarize this command. + * + * Also experiment with obstacks to see how much speed + * up we'll get here. 
+ * + * perf_session__delete(session); + */ return ret; } diff --git a/tools/perf/builtin-report.c b/tools/perf/builtin-report.c index dddcc7ea2bec..1c399eae5f7b 100644 --- a/tools/perf/builtin-report.c +++ b/tools/perf/builtin-report.c @@ -21,6 +21,8 @@ #include "perf.h" #include "util/debug.h" +#include "util/evlist.h" +#include "util/evsel.h" #include "util/header.h" #include "util/session.h" @@ -46,39 +48,6 @@ static const char *pretty_printing_style = default_pretty_printing_style; static char callchain_default_opt[] = "fractal,0.5"; static symbol_filter_t annotate_init; -static struct hists *perf_session__hists_findnew(struct perf_session *self, - u64 event_stream, u32 type, - u64 config) -{ - struct rb_node **p = &self->hists_tree.rb_node; - struct rb_node *parent = NULL; - struct hists *iter, *new; - - while (*p != NULL) { - parent = *p; - iter = rb_entry(parent, struct hists, rb_node); - if (iter->config == config) - return iter; - - - if (config > iter->config) - p = &(*p)->rb_right; - else - p = &(*p)->rb_left; - } - - new = malloc(sizeof(struct hists)); - if (new == NULL) - return NULL; - memset(new, 0, sizeof(struct hists)); - new->event_stream = event_stream; - new->config = config; - new->type = type; - rb_link_node(&new->rb_node, parent, p); - rb_insert_color(&new->rb_node, &self->hists_tree); - return new; -} - static int perf_session__add_hist_entry(struct perf_session *session, struct addr_location *al, struct perf_sample *sample) @@ -86,8 +55,7 @@ static int perf_session__add_hist_entry(struct perf_session *session, struct symbol *parent = NULL; int err = 0; struct hist_entry *he; - struct hists *hists; - struct perf_event_attr *attr; + struct perf_evsel *evsel; if ((sort__has_parent || symbol_conf.use_callchain) && sample->callchain) { err = perf_session__resolve_callchain(session, al->thread, @@ -96,15 +64,19 @@ static int perf_session__add_hist_entry(struct perf_session *session, return err; } - attr = perf_header__find_attr(sample->id, &session->header); - if (attr) - hists = perf_session__hists_findnew(session, sample->id, attr->type, attr->config); - else - hists = perf_session__hists_findnew(session, sample->id, 0, 0); - if (hists == NULL) - return -ENOMEM; + evsel = perf_evlist__id2evsel(session->evlist, sample->id); + if (evsel == NULL) { + /* + * FIXME: Propagate this back, but at least we're in a builtin, + * where exit() is allowed. ;-) + */ + ui__warning("Invalid %s file, contains samples with id not in " + "its header!\n", input_name); + exit_browser(0); + exit(1); + } - he = __hists__add_entry(hists, al, parent, sample->period); + he = __hists__add_entry(&evsel->hists, al, parent, sample->period); if (he == NULL) return -ENOMEM; @@ -120,52 +92,30 @@ static int perf_session__add_hist_entry(struct perf_session *session, * code will not use it. */ if (al->sym != NULL && use_browser > 0) { - /* - * All aggregated on the first sym_hist. 
- */ struct annotation *notes = symbol__annotation(he->ms.sym); + + assert(evsel != NULL); + + err = -ENOMEM; if (notes->src == NULL && - symbol__alloc_hist(he->ms.sym, 1) < 0) - err = -ENOMEM; - else - err = hist_entry__inc_addr_samples(he, 0, al->addr); + symbol__alloc_hist(he->ms.sym, session->evlist->nr_entries) < 0) + goto out; + + err = hist_entry__inc_addr_samples(he, evsel->idx, al->addr); } + evsel->hists.stats.total_period += sample->period; + hists__inc_nr_events(&evsel->hists, PERF_RECORD_SAMPLE); +out: return err; } -static int add_event_total(struct perf_session *session, - struct perf_sample *sample, - struct perf_event_attr *attr) -{ - struct hists *hists; - - if (attr) - hists = perf_session__hists_findnew(session, sample->id, - attr->type, attr->config); - else - hists = perf_session__hists_findnew(session, sample->id, 0, 0); - - if (!hists) - return -ENOMEM; - - hists->stats.total_period += sample->period; - /* - * FIXME: add_event_total should be moved from here to - * perf_session__process_event so that the proper hist is passed to - * the event_op methods. - */ - hists__inc_nr_events(hists, PERF_RECORD_SAMPLE); - session->hists.stats.total_period += sample->period; - return 0; -} static int process_sample_event(union perf_event *event, struct perf_sample *sample, struct perf_session *session) { struct addr_location al; - struct perf_event_attr *attr; if (perf_event__preprocess_sample(event, session, &al, sample, annotate_init) < 0) { @@ -182,27 +132,17 @@ static int process_sample_event(union perf_event *event, return -1; } - attr = perf_header__find_attr(sample->id, &session->header); - - if (add_event_total(session, sample, attr)) { - pr_debug("problem adding event period\n"); - return -1; - } - return 0; } static int process_read_event(union perf_event *event, struct perf_sample *sample __used, - struct perf_session *session __used) + struct perf_session *session) { - struct perf_event_attr *attr; - - attr = perf_header__find_attr(event->read.id, &session->header); - + struct perf_evsel *evsel = perf_evlist__id2evsel(session->evlist, + event->read.id); if (show_threads) { - const char *name = attr ? __event_name(attr->type, attr->config) - : "unknown"; + const char *name = evsel ? event_name(evsel) : "unknown"; perf_read_values_add_value(&show_threads_values, event->read.pid, event->read.tid, event->read.id, @@ -211,7 +151,7 @@ static int process_read_event(union perf_event *event, } dump_printf(": %d %d %s %" PRIu64 "\n", event->read.pid, event->read.tid, - attr ? __event_name(attr->type, attr->config) : "FAIL", + evsel ? 
event_name(evsel) : "FAIL", event->read.value); return 0; @@ -282,21 +222,20 @@ static size_t hists__fprintf_nr_sample_events(struct hists *self, return ret + fprintf(fp, "\n#\n"); } -static int hists__tty_browse_tree(struct rb_root *tree, const char *help) +static int hists__tty_browse_tree(struct perf_evlist *evlist, const char *help) { - struct rb_node *next = rb_first(tree); + struct perf_evsel *pos; - while (next) { - struct hists *hists = rb_entry(next, struct hists, rb_node); + list_for_each_entry(pos, &evlist->entries, node) { + struct hists *hists = &pos->hists; const char *evname = NULL; if (rb_first(&hists->entries) != rb_last(&hists->entries)) - evname = __event_name(hists->type, hists->config); + evname = event_name(pos); hists__fprintf_nr_sample_events(hists, evname, stdout); hists__fprintf(hists, NULL, false, stdout); fprintf(stdout, "\n\n"); - next = rb_next(&hists->rb_node); } if (sort_order == default_sort_order && @@ -317,8 +256,9 @@ static int hists__tty_browse_tree(struct rb_root *tree, const char *help) static int __cmd_report(void) { int ret = -EINVAL; + u64 nr_samples; struct perf_session *session; - struct rb_node *next; + struct perf_evsel *pos; const char *help = "For a higher level overview, try: perf report --sort comm,dso"; signal(SIGINT, sig_handler); @@ -349,26 +289,24 @@ static int __cmd_report(void) if (verbose > 2) perf_session__fprintf_dsos(session, stdout); - next = rb_first(&session->hists_tree); + nr_samples = 0; + list_for_each_entry(pos, &session->evlist->entries, node) { + struct hists *hists = &pos->hists; - if (next == NULL) { + hists__collapse_resort(hists); + hists__output_resort(hists); + nr_samples += hists->stats.nr_events[PERF_RECORD_SAMPLE]; + } + + if (nr_samples == 0) { ui__warning("The %s file has no samples!\n", input_name); goto out_delete; } - while (next) { - struct hists *hists; - - hists = rb_entry(next, struct hists, rb_node); - hists__collapse_resort(hists); - hists__output_resort(hists); - next = rb_next(&hists->rb_node); - } - if (use_browser > 0) - hists__tui_browse_tree(&session->hists_tree, help, 0); + hists__tui_browse_tree(session->evlist, help); else - hists__tty_browse_tree(&session->hists_tree, help); + hists__tty_browse_tree(session->evlist, help); out_delete: /* diff --git a/tools/perf/util/evsel.h b/tools/perf/util/evsel.h index f6fc8f651a25..281b60e5fc7b 100644 --- a/tools/perf/util/evsel.h +++ b/tools/perf/util/evsel.h @@ -7,6 +7,7 @@ #include "types.h" #include "xyarray.h" #include "cgroup.h" +#include "hist.h" struct perf_counts_values { union { @@ -51,6 +52,7 @@ struct perf_evsel { struct xyarray *id; struct perf_counts *counts; int idx; + struct hists hists; char *name; void *priv; struct cgroup_sel *cgrp; diff --git a/tools/perf/util/header.c b/tools/perf/util/header.c index 72c124dc5781..108b0db7bbef 100644 --- a/tools/perf/util/header.c +++ b/tools/perf/util/header.c @@ -969,37 +969,6 @@ bool perf_header__sample_id_all(const struct perf_header *header) return value; } -struct perf_event_attr * -perf_header__find_attr(u64 id, struct perf_header *header) -{ - int i; - - /* - * We set id to -1 if the data file doesn't contain sample - * ids. This can happen when the data file contains one type - * of event and in that case, the header can still store the - * event attribute information. Check for this and avoid - * walking through the entire list of ids which may be large. 
- */ - if (id == -1ULL) { - if (header->attrs > 0) - return &header->attr[0]->attr; - return NULL; - } - - for (i = 0; i < header->attrs; i++) { - struct perf_header_attr *attr = header->attr[i]; - int j; - - for (j = 0; j < attr->ids; j++) { - if (attr->id[j] == id) - return &attr->attr; - } - } - - return NULL; -} - int perf_event__synthesize_attr(struct perf_event_attr *attr, u16 ids, u64 *id, perf_event__handler_t process, struct perf_session *session) diff --git a/tools/perf/util/header.h b/tools/perf/util/header.h index f042cebcec1e..2fab13348aab 100644 --- a/tools/perf/util/header.h +++ b/tools/perf/util/header.h @@ -85,8 +85,6 @@ int perf_header_attr__add_id(struct perf_header_attr *self, u64 id); u64 perf_header__sample_type(struct perf_header *header); bool perf_header__sample_id_all(const struct perf_header *header); -struct perf_event_attr * -perf_header__find_attr(u64 id, struct perf_header *header); void perf_header__set_feat(struct perf_header *self, int feat); void perf_header__clear_feat(struct perf_header *self, int feat); bool perf_header__has_feat(const struct perf_header *self, int feat); diff --git a/tools/perf/util/hist.c b/tools/perf/util/hist.c index f7ad6bdbc667..627a02e03c57 100644 --- a/tools/perf/util/hist.c +++ b/tools/perf/util/hist.c @@ -984,8 +984,12 @@ size_t hists__fprintf_nr_events(struct hists *self, FILE *fp) size_t ret = 0; for (i = 0; i < PERF_RECORD_HEADER_MAX; ++i) { - const char *name = perf_event__name(i); + const char *name; + if (self->stats.nr_events[i] == 0) + continue; + + name = perf_event__name(i); if (!strcmp(name, "UNKNOWN")) continue; diff --git a/tools/perf/util/hist.h b/tools/perf/util/hist.h index 37c79089de09..0d38b435827b 100644 --- a/tools/perf/util/hist.h +++ b/tools/perf/util/hist.h @@ -42,13 +42,10 @@ enum hist_column { }; struct hists { - struct rb_node rb_node; struct rb_root entries; u64 nr_entries; struct events_stats stats; - u64 config; u64 event_stream; - u32 type; u16 col_len[HISTC_NR_COLS]; /* Best would be to reuse the session callchain cursor */ struct callchain_cursor callchain_cursor; @@ -87,6 +84,8 @@ u16 hists__col_len(struct hists *self, enum hist_column col); void hists__set_col_len(struct hists *self, enum hist_column col, u16 len); bool hists__new_col_len(struct hists *self, enum hist_column col, u16 len); +struct perf_evlist; + #ifdef NO_NEWT_SUPPORT static inline int hists__browse(struct hists *self __used, const char *helpline __used, @@ -95,9 +94,8 @@ static inline int hists__browse(struct hists *self __used, return 0; } -static inline int hists__tui_browse_tree(struct rb_root *self __used, - const char *help __used, - int evidx __used) +static inline int hists__tui_browse_tree(struct perf_evlist *evlist __used, + const char *help __used) { return 0; } @@ -118,7 +116,7 @@ int hist_entry__tui_annotate(struct hist_entry *self, int evidx); #define KEY_LEFT NEWT_KEY_LEFT #define KEY_RIGHT NEWT_KEY_RIGHT -int hists__tui_browse_tree(struct rb_root *self, const char *help, int evidx); +int hists__tui_browse_tree(struct perf_evlist *evlist, const char *help); #endif unsigned int hists__sort_list_width(struct hists *self); diff --git a/tools/perf/util/session.c b/tools/perf/util/session.c index a3a871f7bda3..0d414199889d 100644 --- a/tools/perf/util/session.c +++ b/tools/perf/util/session.c @@ -7,10 +7,52 @@ #include #include +#include "evlist.h" +#include "evsel.h" #include "session.h" #include "sort.h" #include "util.h" +static int perf_session__read_evlist(struct perf_session *session) +{ + int i, j; + + 
session->evlist = perf_evlist__new(NULL, NULL); + if (session->evlist == NULL) + return -ENOMEM; + + for (i = 0; i < session->header.attrs; ++i) { + struct perf_header_attr *hattr = session->header.attr[i]; + struct perf_evsel *evsel = perf_evsel__new(&hattr->attr, i); + + if (evsel == NULL) + goto out_delete_evlist; + /* + * Do it before so that if perf_evsel__alloc_id fails, this + * entry gets purged too at perf_evlist__delete(). + */ + perf_evlist__add(session->evlist, evsel); + /* + * We don't have the cpu and thread maps on the header, so + * for allocating the perf_sample_id table we fake 1 cpu and + * hattr->ids threads. + */ + if (perf_evsel__alloc_id(evsel, 1, hattr->ids)) + goto out_delete_evlist; + + for (j = 0; j < hattr->ids; ++j) + perf_evlist__id_hash(session->evlist, evsel, 0, j, + hattr->id[j]); + } + + return 0; + +out_delete_evlist: + perf_evlist__delete(session->evlist); + session->evlist = NULL; + return -ENOMEM; +} + static int perf_session__open(struct perf_session *self, bool force) { struct stat input_stat; @@ -56,6 +98,11 @@ static int perf_session__open(struct perf_session *self, bool force) goto out_close; } + if (perf_session__read_evlist(self) < 0) { + pr_err("Not enough memory to read the event selector list\n"); + goto out_close; + } + self->size = input_stat.st_size; return 0; @@ -141,7 +188,6 @@ struct perf_session *perf_session__new(const char *filename, int mode, memcpy(self->filename, filename, len); self->threads = RB_ROOT; INIT_LIST_HEAD(&self->dead_threads); - self->hists_tree = RB_ROOT; self->last_match = NULL; /* * On 64bit we can mmap the data file in one go. No need for tiny mmap @@ -1137,3 +1183,18 @@ size_t perf_session__fprintf_dsos_buildid(struct perf_session *self, FILE *fp, size_t ret = machine__fprintf_dsos_buildid(&self->host_machine, fp, with_hits); return ret + machines__fprintf_dsos_buildid(&self->machines, fp, with_hits); } + +size_t perf_session__fprintf_nr_events(struct perf_session *session, FILE *fp) +{ + struct perf_evsel *pos; + size_t ret = fprintf(fp, "Aggregated stats:\n"); + + ret += hists__fprintf_nr_events(&session->hists, fp); + + list_for_each_entry(pos, &session->evlist->entries, node) { + ret += fprintf(fp, "%s stats:\n", event_name(pos)); + ret += hists__fprintf_nr_events(&pos->hists, fp); + } + + return ret; +} diff --git a/tools/perf/util/session.h b/tools/perf/util/session.h index 977b3a1b14aa..05dd7bcb9453 100644 --- a/tools/perf/util/session.h +++ b/tools/perf/util/session.h @@ -34,12 +34,12 @@ struct perf_session { struct thread *last_match; struct machine host_machine; struct rb_root machines; - struct rb_root hists_tree; + struct perf_evlist *evlist; /* - * FIXME: should point to the first entry in hists_tree and - * be a hists instance. Right now its only 'report' - * that is using ->hists_tree while all the rest use - * ->hists. + * FIXME: Need to split this up further, we need global + * stats + per event stats. 'perf diff' also needs + * to properly support multiple events in a single + * perf.data file. 
*/ struct hists hists; u64 sample_type; @@ -151,11 +151,7 @@ size_t perf_session__fprintf_dsos(struct perf_session *self, FILE *fp); size_t perf_session__fprintf_dsos_buildid(struct perf_session *self, FILE *fp, bool with_hits); -static inline -size_t perf_session__fprintf_nr_events(struct perf_session *self, FILE *fp) -{ - return hists__fprintf_nr_events(&self->hists, fp); -} +size_t perf_session__fprintf_nr_events(struct perf_session *session, FILE *fp); static inline int perf_session__parse_sample(struct perf_session *session, const union perf_event *event, diff --git a/tools/perf/util/ui/browsers/hists.c b/tools/perf/util/ui/browsers/hists.c index c98e6f81d285..f3af4fe5cdc4 100644 --- a/tools/perf/util/ui/browsers/hists.c +++ b/tools/perf/util/ui/browsers/hists.c @@ -7,6 +7,8 @@ #include #include +#include "../../evsel.h" +#include "../../evlist.h" #include "../../hist.h" #include "../../pstack.h" #include "../../sort.h" @@ -987,31 +989,33 @@ out: return key; } -int hists__tui_browse_tree(struct rb_root *self, const char *help, int evidx) +int hists__tui_browse_tree(struct perf_evlist *evlist, const char *help) { - struct rb_node *first = rb_first(self), *nd = first, *next; - int key = 0; + struct perf_evsel *pos; - while (nd) { - struct hists *hists = rb_entry(nd, struct hists, rb_node); - const char *ev_name = __event_name(hists->type, hists->config); + pos = list_entry(evlist->entries.next, struct perf_evsel, node); + while (pos) { + struct hists *hists = &pos->hists; + const char *ev_name = event_name(pos); + int key = hists__browse(hists, help, ev_name, pos->idx); - key = hists__browse(hists, help, ev_name, evidx); switch (key) { case NEWT_KEY_TAB: - next = rb_next(nd); - if (next) - nd = next; + if (pos->node.next == &evlist->entries) + pos = list_entry(evlist->entries.next, struct perf_evsel, node); + else + pos = list_entry(pos->node.next, struct perf_evsel, node); break; case NEWT_KEY_UNTAB: - if (nd == first) - continue; - nd = rb_prev(nd); + if (pos->node.prev == &evlist->entries) + pos = list_entry(evlist->entries.prev, struct perf_evsel, node); + else + pos = list_entry(pos->node.prev, struct perf_evsel, node); break; default: return key; } } - return key; + return 0; } From 7f0030b211579939461468f25b80c73e293c46e0 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Sun, 6 Mar 2011 13:07:30 -0300 Subject: [PATCH 556/706] perf report tui: Improve multi event session support When multiple events were used in 'perf record', allow the user to choose which one is wanted before showing the per event histograms. Annotations will be performed on the chosen event. Allow going back and forth from event to event quickly using just the arrow keys and enter. 
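As a rough sketch of the cycling this enables (the helper name here is illustrative, not part of the patch; the wrap-around mirrors the TAB/UNTAB handling in the hunks below):

  static struct perf_evsel *evsel__next_cyclic(struct perf_evlist *evlist,
                                               struct perf_evsel *pos)
  {
          /* the list head carries no evsel, so skip it when wrapping */
          if (pos->node.next == &evlist->entries)
                  return list_entry(evlist->entries.next, struct perf_evsel, node);
          return list_entry(pos->node.next, struct perf_evsel, node);
  }

With a single event this always hands back the same evsel, which is why the one-event case below bypasses the menu and goes straight to the histogram browser.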
Cc: Frederic Weisbecker Cc: Ingo Molnar Cc: Mike Galbraith Cc: Paul Mackerras Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Tom Zanussi Cc: William Cohen LKML-Reference: Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-report.c | 7 +- tools/perf/util/hist.h | 16 +-- tools/perf/util/ui/browsers/hists.c | 151 ++++++++++++++++++++++++---- 3 files changed, 142 insertions(+), 32 deletions(-) diff --git a/tools/perf/builtin-report.c b/tools/perf/builtin-report.c index 1c399eae5f7b..e9b5d513333a 100644 --- a/tools/perf/builtin-report.c +++ b/tools/perf/builtin-report.c @@ -222,7 +222,8 @@ static size_t hists__fprintf_nr_sample_events(struct hists *self, return ret + fprintf(fp, "\n#\n"); } -static int hists__tty_browse_tree(struct perf_evlist *evlist, const char *help) +static int perf_evlist__tty_browse_hists(struct perf_evlist *evlist, + const char *help) { struct perf_evsel *pos; @@ -304,9 +305,9 @@ static int __cmd_report(void) } if (use_browser > 0) - hists__tui_browse_tree(session->evlist, help); + perf_evlist__tui_browse_hists(session->evlist, help); else - hists__tty_browse_tree(session->evlist, help); + perf_evlist__tty_browse_hists(session->evlist, help); out_delete: /* diff --git a/tools/perf/util/hist.h b/tools/perf/util/hist.h index 0d38b435827b..cb6858a2f9a3 100644 --- a/tools/perf/util/hist.h +++ b/tools/perf/util/hist.h @@ -87,15 +87,9 @@ bool hists__new_col_len(struct hists *self, enum hist_column col, u16 len); struct perf_evlist; #ifdef NO_NEWT_SUPPORT -static inline int hists__browse(struct hists *self __used, - const char *helpline __used, - const char *ev_name __used, int evidx __used) -{ - return 0; -} - -static inline int hists__tui_browse_tree(struct perf_evlist *evlist __used, - const char *help __used) +static inline +int perf_evlist__tui_browse_hists(struct perf_evlist *evlist __used, + const char *help __used) { return 0; } @@ -109,14 +103,12 @@ static inline int hist_entry__tui_annotate(struct hist_entry *self __used, #define KEY_RIGHT -2 #else #include -int hists__browse(struct hists *self, const char *helpline, - const char *ev_name, int evidx); int hist_entry__tui_annotate(struct hist_entry *self, int evidx); #define KEY_LEFT NEWT_KEY_LEFT #define KEY_RIGHT NEWT_KEY_RIGHT -int hists__tui_browse_tree(struct perf_evlist *evlist, const char *help); +int perf_evlist__tui_browse_hists(struct perf_evlist *evlist, const char *help); #endif unsigned int hists__sort_list_width(struct hists *self); diff --git a/tools/perf/util/ui/browsers/hists.c b/tools/perf/util/ui/browsers/hists.c index f3af4fe5cdc4..798efdca3ead 100644 --- a/tools/perf/util/ui/browsers/hists.c +++ b/tools/perf/util/ui/browsers/hists.c @@ -803,9 +803,11 @@ static int hists__browser_title(struct hists *self, char *bf, size_t size, return printed; } -int hists__browse(struct hists *self, const char *helpline, - const char *ev_name, int evidx) +static int perf_evsel__hists_browse(struct perf_evsel *evsel, + const char *helpline, const char *ev_name, + bool left_exits) { + struct hists *self = &evsel->hists; struct hist_browser *browser = hist_browser__new(self); struct pstack *fstack; const struct thread *thread_filter = NULL; @@ -878,8 +880,14 @@ int hists__browse(struct hists *self, const char *helpline, case NEWT_KEY_LEFT: { const void *top; - if (pstack__empty(fstack)) + if (pstack__empty(fstack)) { + /* + * Go back to the perf_evsel_menu__run or other user + */ + if (left_exits) + goto out_free_stack; continue; + } top = pstack__pop(fstack); if (top == &dso_filter) goto zoom_out_dso; @@ 
-888,7 +896,8 @@ int hists__browse(struct hists *self, const char *helpline, continue; } case NEWT_KEY_ESCAPE: - if (!ui__dialog_yesno("Do you really want to exit?")) + if (!left_exits && + !ui__dialog_yesno("Do you really want to exit?")) continue; /* Fall thru */ default: @@ -940,7 +949,7 @@ do_annotate: if (he == NULL) continue; - hist_entry__tui_annotate(he, evidx); + hist_entry__tui_annotate(he, evsel->idx); } else if (choice == browse_map) map__browse(browser->selection->map); else if (choice == zoom_dso) { @@ -989,15 +998,71 @@ out: return key; } -int hists__tui_browse_tree(struct perf_evlist *evlist, const char *help) -{ - struct perf_evsel *pos; +struct perf_evsel_menu { + struct ui_browser b; + struct perf_evsel *selection; +}; - pos = list_entry(evlist->entries.next, struct perf_evsel, node); - while (pos) { - struct hists *hists = &pos->hists; - const char *ev_name = event_name(pos); - int key = hists__browse(hists, help, ev_name, pos->idx); +static void perf_evsel_menu__write(struct ui_browser *browser, + void *entry, int row) +{ + struct perf_evsel_menu *menu = container_of(browser, + struct perf_evsel_menu, b); + struct perf_evsel *evsel = list_entry(entry, struct perf_evsel, node); + bool current_entry = ui_browser__is_current_entry(browser, row); + unsigned long nr_events = evsel->hists.stats.nr_events[PERF_RECORD_SAMPLE]; + const char *ev_name = event_name(evsel); + char bf[256], unit; + + ui_browser__set_color(browser, current_entry ? HE_COLORSET_SELECTED : + HE_COLORSET_NORMAL); + + nr_events = convert_unit(nr_events, &unit); + snprintf(bf, sizeof(bf), "%lu%c%s%s", nr_events, + unit, unit == ' ' ? "" : " ", ev_name); + slsmg_write_nstring(bf, browser->width); + + if (current_entry) + menu->selection = evsel; +} + +static int perf_evsel_menu__run(struct perf_evsel_menu *menu, const char *help) +{ + int exit_keys[] = { NEWT_KEY_ENTER, NEWT_KEY_RIGHT, 0, }; + struct perf_evlist *evlist = menu->b.priv; + struct perf_evsel *pos; + const char *ev_name, *title = "Available samples"; + int key; + + if (ui_browser__show(&menu->b, title, + "ESC: exit, ENTER|->: Browse histograms") < 0) + return -1; + + ui_browser__add_exit_keys(&menu->b, exit_keys); + + while (1) { + key = ui_browser__run(&menu->b); + + switch (key) { + case NEWT_KEY_RIGHT: + case NEWT_KEY_ENTER: + if (!menu->selection) + continue; + pos = menu->selection; +browse_hists: + ev_name = event_name(pos); + key = perf_evsel__hists_browse(pos, help, ev_name, true); + ui_browser__show_title(&menu->b, title); + break; + case NEWT_KEY_LEFT: + continue; + case NEWT_KEY_ESCAPE: + if (!ui__dialog_yesno("Do you really want to exit?")) + continue; + /* Fall thru */ + default: + goto out; + } switch (key) { case NEWT_KEY_TAB: @@ -1005,17 +1070,69 @@ int hists__tui_browse_tree(struct perf_evlist *evlist, const char *help) pos = list_entry(evlist->entries.next, struct perf_evsel, node); else pos = list_entry(pos->node.next, struct perf_evsel, node); - break; + goto browse_hists; case NEWT_KEY_UNTAB: if (pos->node.prev == &evlist->entries) pos = list_entry(evlist->entries.prev, struct perf_evsel, node); else pos = list_entry(pos->node.prev, struct perf_evsel, node); - break; + goto browse_hists; + case 'q': + case CTRL('c'): + goto out; default: - return key; + break; } } - return 0; +out: + ui_browser__hide(&menu->b); + return key; +} + +static int __perf_evlist__tui_browse_hists(struct perf_evlist *evlist, + const char *help) +{ + struct perf_evsel *pos; + struct perf_evsel_menu menu = { + .b = { + .entries = &evlist->entries, + 
.refresh = ui_browser__list_head_refresh, + .seek = ui_browser__list_head_seek, + .write = perf_evsel_menu__write, + .nr_entries = evlist->nr_entries, + .priv = evlist, + }, + }; + + ui_helpline__push("Press ESC to exit"); + + list_for_each_entry(pos, &evlist->entries, node) { + const char *ev_name = event_name(pos); + size_t line_len = strlen(ev_name) + 7; + + if (menu.b.width < line_len) + menu.b.width = line_len; + /* + * Cache the evsel name, tracepoints have a _high_ cost per + * event_name() call. + */ + if (pos->name == NULL) + pos->name = strdup(ev_name); + } + + return perf_evsel_menu__run(&menu, help); +} + +int perf_evlist__tui_browse_hists(struct perf_evlist *evlist, const char *help) +{ + + if (evlist->nr_entries == 1) { + struct perf_evsel *first = list_entry(evlist->entries.next, + struct perf_evsel, node); + const char *ev_name = event_name(first); + return perf_evsel__hists_browse(first, help, ev_name, false); + } + + return __perf_evlist__tui_browse_hists(evlist, help); } From 997772884036e6e121de39322179989154437d9f Mon Sep 17 00:00:00 2001 From: Stanislaw Gruszka Date: Mon, 7 Mar 2011 09:58:33 +0100 Subject: [PATCH 557/706] debugobjects: Add hint for better object identification In complex subsystems like mac80211 structures can contain several timers and work structs, so identifying a specific instance from the call trace and object type output of debugobjects can be hard. Allow the subsystems which support debugobjects to provide a hint function. This function returns a pointer to a kernel address (preferrably the objects callback function) which is printed along with the debugobjects type. Add hint methods for timer_list, work_struct and hrtimer. [ tglx: Massaged changelog, made it compile ] Signed-off-by: Stanislaw Gruszka LKML-Reference: <20110307085809.GA9334@redhat.com> Signed-off-by: Thomas Gleixner --- include/linux/debugobjects.h | 5 ++++- kernel/hrtimer.c | 6 ++++++ kernel/timer.c | 6 ++++++ kernel/workqueue.c | 6 ++++++ lib/debugobjects.c | 9 ++++++--- 5 files changed, 28 insertions(+), 4 deletions(-) diff --git a/include/linux/debugobjects.h b/include/linux/debugobjects.h index 597692f1fc8d..65970b811e22 100644 --- a/include/linux/debugobjects.h +++ b/include/linux/debugobjects.h @@ -34,7 +34,10 @@ struct debug_obj { /** * struct debug_obj_descr - object type specific debug description structure + * * @name: name of the object typee + * @debug_hint: function returning address, which have associated + * kernel symbol, to allow identify the object * @fixup_init: fixup function, which is called when the init check * fails * @fixup_activate: fixup function, which is called when the activate check @@ -46,7 +49,7 @@ struct debug_obj { */ struct debug_obj_descr { const char *name; - + void *(*debug_hint) (void *addr); int (*fixup_init) (void *addr, enum debug_obj_state state); int (*fixup_activate) (void *addr, enum debug_obj_state state); int (*fixup_destroy) (void *addr, enum debug_obj_state state); diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c index 0c8d7c048615..e38f5a073d01 100644 --- a/kernel/hrtimer.c +++ b/kernel/hrtimer.c @@ -334,6 +334,11 @@ EXPORT_SYMBOL_GPL(ktime_add_safe); static struct debug_obj_descr hrtimer_debug_descr; +static void *hrtimer_debug_hint(void *addr) +{ + return ((struct hrtimer *) addr)->function; +} + /* * fixup_init is called when: * - an active object is initialized @@ -393,6 +398,7 @@ static int hrtimer_fixup_free(void *addr, enum debug_obj_state state) static struct debug_obj_descr hrtimer_debug_descr = { .name = "hrtimer", + 
.debug_hint = hrtimer_debug_hint, .fixup_init = hrtimer_fixup_init, .fixup_activate = hrtimer_fixup_activate, .fixup_free = hrtimer_fixup_free, diff --git a/kernel/timer.c b/kernel/timer.c index d6459923d245..33a67925d900 100644 --- a/kernel/timer.c +++ b/kernel/timer.c @@ -404,6 +404,11 @@ static void timer_stats_account_timer(struct timer_list *timer) {} static struct debug_obj_descr timer_debug_descr; +static void *timer_debug_hint(void *addr) +{ + return ((struct timer_list *) addr)->function; +} + /* * fixup_init is called when: * - an active object is initialized @@ -477,6 +482,7 @@ static int timer_fixup_free(void *addr, enum debug_obj_state state) static struct debug_obj_descr timer_debug_descr = { .name = "timer_list", + .debug_hint = timer_debug_hint, .fixup_init = timer_fixup_init, .fixup_activate = timer_fixup_activate, .fixup_free = timer_fixup_free, diff --git a/kernel/workqueue.c b/kernel/workqueue.c index ee6578b578ad..b5fe4c00eb3c 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -316,6 +316,11 @@ static inline int __next_wq_cpu(int cpu, const struct cpumask *mask, static struct debug_obj_descr work_debug_descr; +static void *work_debug_hint(void *addr) +{ + return ((struct work_struct *) addr)->func; +} + /* * fixup_init is called when: * - an active object is initialized @@ -387,6 +392,7 @@ static int work_fixup_free(void *addr, enum debug_obj_state state) static struct debug_obj_descr work_debug_descr = { .name = "work_struct", + .debug_hint = work_debug_hint, .fixup_init = work_fixup_init, .fixup_activate = work_fixup_activate, .fixup_free = work_fixup_free, diff --git a/lib/debugobjects.c b/lib/debugobjects.c index deebcc57d4e6..9d86e45086f5 100644 --- a/lib/debugobjects.c +++ b/lib/debugobjects.c @@ -249,14 +249,17 @@ static struct debug_bucket *get_bucket(unsigned long addr) static void debug_print_object(struct debug_obj *obj, char *msg) { + struct debug_obj_descr *descr = obj->descr; static int limit; - if (limit < 5 && obj->descr != descr_test) { + if (limit < 5 && descr != descr_test) { + void *hint = descr->debug_hint ? + descr->debug_hint(obj->object) : NULL; limit++; WARN(1, KERN_ERR "ODEBUG: %s %s (active state %u) " - "object type: %s\n", + "object type: %s hint: %pS\n", msg, obj_states[obj->state], obj->astate, - obj->descr->name); + descr->name, hint); } debug_objects_warnings++; } From ea7145477a461e09d8d194cac4b996dc4f449107 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Mon, 7 Mar 2011 19:10:39 +0100 Subject: [PATCH 558/706] x86: Separate out entry text section Put x86 entry code into a separate link section: .entry.text. Separating the entry text section seems to have performance benefits - caused by more efficient instruction cache usage. Running hackbench with perf stat --repeat showed that the change compresses the icache footprint. The icache load miss rate went down by about 15%: before patch: 19417627 L1-icache-load-misses ( +- 0.147% ) after patch: 16490788 L1-icache-load-misses ( +- 0.180% ) The motivation of the patch was to fix a particular kprobes bug that relates to the entry text section, the performance advantage was discovered accidentally. 
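A measurement of this kind can be reproduced with a perf stat invocation along these lines (the exact command is not recorded in the changelog, so this is a reconstruction from the counters quoted below):

  perf stat --repeat 500 -e L1-icache-load-misses -e instructions -e cycles \
          ./hackbench/hackbench 10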
Whole perf output follows: - results for current tip tree: Performance counter stats for './hackbench/hackbench 10' (500 runs): 19417627 L1-icache-load-misses ( +- 0.147% ) 2676914223 instructions # 0.497 IPC ( +- 0.079% ) 5389516026 cycles ( +- 0.144% ) 0.206267711 seconds time elapsed ( +- 0.138% ) - results for current tip tree with the patch applied: Performance counter stats for './hackbench/hackbench 10' (500 runs): 16490788 L1-icache-load-misses ( +- 0.180% ) 2717734941 instructions # 0.502 IPC ( +- 0.079% ) 5414756975 cycles ( +- 0.148% ) 0.206747566 seconds time elapsed ( +- 0.137% ) Signed-off-by: Jiri Olsa Cc: Arnaldo Carvalho de Melo Cc: Frederic Weisbecker Cc: Peter Zijlstra Cc: Linus Torvalds Cc: Andrew Morton Cc: Nick Piggin Cc: Eric Dumazet Cc: masami.hiramatsu.pt@hitachi.com Cc: ananth@in.ibm.com Cc: davem@davemloft.net Cc: 2nddept-manager@sdl.hitachi.co.jp LKML-Reference: <20110307181039.GB15197@jolsa.redhat.com> Signed-off-by: Ingo Molnar --- arch/x86/ia32/ia32entry.S | 2 ++ arch/x86/kernel/entry_32.S | 6 ++++-- arch/x86/kernel/entry_64.S | 6 ++++-- arch/x86/kernel/vmlinux.lds.S | 1 + include/asm-generic/sections.h | 1 + include/asm-generic/vmlinux.lds.h | 6 ++++++ 6 files changed, 18 insertions(+), 4 deletions(-) diff --git a/arch/x86/ia32/ia32entry.S b/arch/x86/ia32/ia32entry.S index 518bb99c3394..f729b2e9679c 100644 --- a/arch/x86/ia32/ia32entry.S +++ b/arch/x86/ia32/ia32entry.S @@ -25,6 +25,8 @@ #define sysretl_audit ia32_ret_from_sys_call #endif + .section .entry.text, "ax" + #define IA32_NR_syscalls ((ia32_syscall_end - ia32_sys_call_table)/8) .macro IA32_ARG_FIXUP noebp=0 diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S index c8b4efad7ebb..f5accf8eaa78 100644 --- a/arch/x86/kernel/entry_32.S +++ b/arch/x86/kernel/entry_32.S @@ -65,6 +65,8 @@ #define sysexit_audit syscall_exit_work #endif + .section .entry.text, "ax" + /* * We use macros for low-level operations which need to be overridden * for paravirtualization. 
The following will never clobber any registers: @@ -788,7 +790,7 @@ ENDPROC(ptregs_clone) */ .section .init.rodata,"a" ENTRY(interrupt) -.text +.section .entry.text, "ax" .p2align 5 .p2align CONFIG_X86_L1_CACHE_SHIFT ENTRY(irq_entries_start) @@ -807,7 +809,7 @@ vector=FIRST_EXTERNAL_VECTOR .endif .previous .long 1b - .text + .section .entry.text, "ax" vector=vector+1 .endif .endr diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index aed1ffbeb0c9..0a0ed794edb2 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -61,6 +61,8 @@ #define __AUDIT_ARCH_LE 0x40000000 .code64 + .section .entry.text, "ax" + #ifdef CONFIG_FUNCTION_TRACER #ifdef CONFIG_DYNAMIC_FTRACE ENTRY(mcount) @@ -744,7 +746,7 @@ END(stub_rt_sigreturn) */ .section .init.rodata,"a" ENTRY(interrupt) - .text + .section .entry.text .p2align 5 .p2align CONFIG_X86_L1_CACHE_SHIFT ENTRY(irq_entries_start) @@ -763,7 +765,7 @@ vector=FIRST_EXTERNAL_VECTOR .endif .previous .quad 1b - .text + .section .entry.text vector=vector+1 .endif .endr diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S index bf4700755184..6d4341d5c52a 100644 --- a/arch/x86/kernel/vmlinux.lds.S +++ b/arch/x86/kernel/vmlinux.lds.S @@ -105,6 +105,7 @@ SECTIONS SCHED_TEXT LOCK_TEXT KPROBES_TEXT + ENTRY_TEXT IRQENTRY_TEXT *(.fixup) *(.gnu.warning) diff --git a/include/asm-generic/sections.h b/include/asm-generic/sections.h index b3bfabc258f3..c1a1216e29ce 100644 --- a/include/asm-generic/sections.h +++ b/include/asm-generic/sections.h @@ -11,6 +11,7 @@ extern char _sinittext[], _einittext[]; extern char _end[]; extern char __per_cpu_load[], __per_cpu_start[], __per_cpu_end[]; extern char __kprobes_text_start[], __kprobes_text_end[]; +extern char __entry_text_start[], __entry_text_end[]; extern char __initdata_begin[], __initdata_end[]; extern char __start_rodata[], __end_rodata[]; diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h index fe77e3395b40..906c3ceca9a2 100644 --- a/include/asm-generic/vmlinux.lds.h +++ b/include/asm-generic/vmlinux.lds.h @@ -424,6 +424,12 @@ *(.kprobes.text) \ VMLINUX_SYMBOL(__kprobes_text_end) = .; +#define ENTRY_TEXT \ + ALIGN_FUNCTION(); \ + VMLINUX_SYMBOL(__entry_text_start) = .; \ + *(.entry.text) \ + VMLINUX_SYMBOL(__entry_text_end) = .; + #ifdef CONFIG_FUNCTION_GRAPH_TRACER #define IRQENTRY_TEXT \ ALIGN_FUNCTION(); \ From 2a8247a2600c3e087a568fc68a6ec4eedac27ef1 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Mon, 21 Feb 2011 15:25:13 +0100 Subject: [PATCH 559/706] kprobes: Disabling optimized kprobes for entry text section You can crash the kernel (with root/admin privileges) using kprobe tracer by running: echo "p system_call_after_swapgs" > ./kprobe_events echo 1 > ./events/kprobes/enable The reason is that at the system_call_after_swapgs label, the kernel stack is not set up. If optimized kprobes are enabled, the user space stack is being used in this case (see optimized kprobe template) and this might result in a crash. There are several places like this over the entry code (entry_$BIT). As it seems there's no any reasonable/maintainable way to disable only those places where the stack is not ready, I switched off the whole entry code from kprobe optimizing. 
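The guard added below is the usual linker-symbol range test against the .entry.text bounds exported by the previous patch; pulled out as a helper it would look roughly like this (the helper form is illustrative, the patch open-codes the comparison in can_optimize()):

  #include <asm/sections.h>

  static bool in_entry_text(unsigned long paddr)
  {
          /* __entry_text_start/__entry_text_end delimit .entry.text */
          return paddr >= (unsigned long)__entry_text_start &&
                 paddr < (unsigned long)__entry_text_end;
  }

Probes on such addresses are still accepted, they just stay ordinary breakpoint-based kprobes instead of being jump-optimized.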
Signed-off-by: Jiri Olsa Acked-by: Masami Hiramatsu Cc: acme@redhat.com Cc: fweisbec@gmail.com Cc: ananth@in.ibm.com Cc: davem@davemloft.net Cc: a.p.zijlstra@chello.nl Cc: eric.dumazet@gmail.com Cc: 2nddept-manager@sdl.hitachi.co.jp LKML-Reference: <1298298313-5980-3-git-send-email-jolsa@redhat.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/kprobes.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/arch/x86/kernel/kprobes.c b/arch/x86/kernel/kprobes.c index d91c477b3f62..c969fd9d1566 100644 --- a/arch/x86/kernel/kprobes.c +++ b/arch/x86/kernel/kprobes.c @@ -1276,6 +1276,14 @@ static int __kprobes can_optimize(unsigned long paddr) if (!kallsyms_lookup_size_offset(paddr, &size, &offset)) return 0; + /* + * Do not optimize in the entry code due to the unstable + * stack handling. + */ + if ((paddr >= (unsigned long )__entry_text_start) && + (paddr < (unsigned long )__entry_text_end)) + return 0; + /* Check there is enough space for a relative jump. */ if (size - offset < RELATIVEJUMP_SIZE) return 0; From c68fd4f3ca90de7d18c567e70b2c164078aefadf Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 8 Mar 2011 19:52:55 +0100 Subject: [PATCH 560/706] genirq: Add comments to Kconfig switches Signed-off-by: Thomas Gleixner Cc: Sam Ravnborg --- kernel/irq/Kconfig | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/kernel/irq/Kconfig b/kernel/irq/Kconfig index 144db9dcfcde..09bef82d74cb 100644 --- a/kernel/irq/Kconfig +++ b/kernel/irq/Kconfig @@ -1,3 +1,4 @@ +# Select this to activate the generic irq options below config HAVE_GENERIC_HARDIRQS bool @@ -17,27 +18,36 @@ config GENERIC_HARDIRQS_NO_COMPAT bool # Options selectable by the architecture code + +# Make sparse irq Kconfig switch below available config HAVE_SPARSE_IRQ bool +# Enable the generic irq autoprobe mechanism config GENERIC_IRQ_PROBE bool +# Use the generic /proc/interrupts implementation config GENERIC_IRQ_SHOW bool +# Support for delayed migration from interrupt context config GENERIC_PENDING_IRQ bool +# Alpha specific irq affinity mechanism config AUTO_IRQ_AFFINITY bool +# Tasklet based software resend for pending interrupts on enable_irq() config HARDIRQS_SW_RESEND bool +# Preflow handler support for fasteoi (sparc64) config IRQ_PREFLOW_FASTEOI bool +# Support forced irq threading config IRQ_FORCED_THREADING bool From 51de69523ffe1c17994dc2f260369f29dfdce71c Mon Sep 17 00:00:00 2001 From: Owen Smith Date: Wed, 22 Dec 2010 15:05:00 +0000 Subject: [PATCH 561/706] xen: Union the blkif_request request specific fields Prepare for extending the block device ring to allow request specific fields, by moving the request specific fields for reads, writes and barrier requests to a union member. Acked-by: Jens Axboe Signed-off-by: Owen Smith Signed-off-by: Konrad Rzeszutek Wilk --- drivers/block/xen-blkfront.c | 8 ++++---- include/xen/interface/io/blkif.h | 16 +++++++++++----- 2 files changed, 15 insertions(+), 9 deletions(-) diff --git a/drivers/block/xen-blkfront.c b/drivers/block/xen-blkfront.c index d7aa39e349a6..cc4514c9d8a6 100644 --- a/drivers/block/xen-blkfront.c +++ b/drivers/block/xen-blkfront.c @@ -281,7 +281,7 @@ static int blkif_queue_request(struct request *req) info->shadow[id].request = req; ring_req->id = id; - ring_req->sector_number = (blkif_sector_t)blk_rq_pos(req); + ring_req->u.rw.sector_number = (blkif_sector_t)blk_rq_pos(req); ring_req->handle = info->handle; ring_req->operation = rq_data_dir(req) ? 
@@ -317,7 +317,7 @@ static int blkif_queue_request(struct request *req) rq_data_dir(req) ); info->shadow[id].frame[i] = mfn_to_pfn(buffer_mfn); - ring_req->seg[i] = + ring_req->u.rw.seg[i] = (struct blkif_request_segment) { .gref = ref, .first_sect = fsect, @@ -615,7 +615,7 @@ static void blkif_completion(struct blk_shadow *s) { int i; for (i = 0; i < s->req.nr_segments; i++) - gnttab_end_foreign_access(s->req.seg[i].gref, 0, 0UL); + gnttab_end_foreign_access(s->req.u.rw.seg[i].gref, 0, 0UL); } static irqreturn_t blkif_interrupt(int irq, void *dev_id) @@ -932,7 +932,7 @@ static int blkif_recover(struct blkfront_info *info) /* Rewrite any grant references invalidated by susp/resume. */ for (j = 0; j < req->nr_segments; j++) gnttab_grant_foreign_access_ref( - req->seg[j].gref, + req->u.rw.seg[j].gref, info->xbdev->otherend_id, pfn_to_mfn(info->shadow[req->id].frame[j]), rq_data_dir(info->shadow[req->id].request)); diff --git a/include/xen/interface/io/blkif.h b/include/xen/interface/io/blkif.h index c2d1fa4dc1ee..e4f743cfa151 100644 --- a/include/xen/interface/io/blkif.h +++ b/include/xen/interface/io/blkif.h @@ -51,11 +51,7 @@ typedef uint64_t blkif_sector_t; */ #define BLKIF_MAX_SEGMENTS_PER_REQUEST 11 -struct blkif_request { - uint8_t operation; /* BLKIF_OP_??? */ - uint8_t nr_segments; /* number of segments */ - blkif_vdev_t handle; /* only for read/write requests */ - uint64_t id; /* private guest value, echoed in resp */ +struct blkif_request_rw { blkif_sector_t sector_number;/* start sector idx on disk (r/w only) */ struct blkif_request_segment { grant_ref_t gref; /* reference to I/O buffer frame */ @@ -65,6 +61,16 @@ struct blkif_request { } seg[BLKIF_MAX_SEGMENTS_PER_REQUEST]; }; +struct blkif_request { + uint8_t operation; /* BLKIF_OP_??? */ + uint8_t nr_segments; /* number of segments */ + blkif_vdev_t handle; /* only for read/write requests */ + uint64_t id; /* private guest value, echoed in resp */ + union { + struct blkif_request_rw rw; + } u; +}; + struct blkif_response { uint64_t id; /* copied from request */ uint8_t operation; /* copied from request */ From c49aa5bd1376939b40759a6da5ba6cf701702721 Mon Sep 17 00:00:00 2001 From: Jan Beulich Date: Tue, 8 Mar 2011 09:24:26 +0000 Subject: [PATCH 562/706] x86: Remove dead config option X86_CPU This isn't being referenced anywhere, and the selects done from it can be easily done together with all the other X86 ones. v2: Also adjust UML's Kconfig.x86. 
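The two selects gate the generic bit-search helpers in lib/, which callers use roughly like this (a sketch, assuming the declarations usually pulled in via <linux/bitops.h>):

  #include <linux/bitops.h>

  static unsigned long first_pending(const unsigned long *mask, unsigned long nbits)
  {
          /* generic find_first_bit(); returns nbits when no bit is set */
          return find_first_bit(mask, nbits);
  }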
Signed-off-by: Jan Beulich LKML-Reference: <4D7603DA02000078000351C1@vpn.id2.novell.com> Signed-off-by: Ingo Molnar --- arch/um/Kconfig.x86 | 2 ++ arch/x86/Kconfig | 2 ++ arch/x86/Kconfig.cpu | 5 ----- 3 files changed, 4 insertions(+), 5 deletions(-) diff --git a/arch/um/Kconfig.x86 b/arch/um/Kconfig.x86 index 62f21156aaa4..02fb017fed47 100644 --- a/arch/um/Kconfig.x86 +++ b/arch/um/Kconfig.x86 @@ -10,6 +10,8 @@ endmenu config UML_X86 def_bool y + select GENERIC_FIND_FIRST_BIT + select GENERIC_FIND_NEXT_BIT config 64BIT bool diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index d5ed94d30aad..43214e303294 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -64,6 +64,8 @@ config X86 select HAVE_TEXT_POKE_SMP select HAVE_GENERIC_HARDIRQS select HAVE_SPARSE_IRQ + select GENERIC_FIND_FIRST_BIT + select GENERIC_FIND_NEXT_BIT select GENERIC_IRQ_PROBE select GENERIC_PENDING_IRQ if SMP select USE_GENERIC_SMP_HELPERS if SMP diff --git a/arch/x86/Kconfig.cpu b/arch/x86/Kconfig.cpu index 283c5a6a03a6..ed47e6e1747f 100644 --- a/arch/x86/Kconfig.cpu +++ b/arch/x86/Kconfig.cpu @@ -294,11 +294,6 @@ config X86_GENERIC endif -config X86_CPU - def_bool y - select GENERIC_FIND_FIRST_BIT - select GENERIC_FIND_NEXT_BIT - # # Define implied options from the CPU selection here config X86_INTERNODE_CACHE_SHIFT From 750912fa366312e9c5bc83eab352898a26750401 Mon Sep 17 00:00:00 2001 From: David Sharp Date: Wed, 8 Dec 2010 13:46:47 -0800 Subject: [PATCH 563/706] tracing: Add an 'overwrite' trace_option. Add an "overwrite" trace_option for ftrace to control whether the buffer should be overwritten on overflow or not. The default remains to overwrite old events when the buffer is full. This patch adds the option to instead discard newest events when the buffer is full. This is useful to get a snapshot of traces just after enabling traces. Dropping the current event is also a simpler code path. Signed-off-by: David Sharp LKML-Reference: <1291844807-15481-1-git-send-email-dhsharp@google.com> Signed-off-by: Steven Rostedt --- Documentation/trace/ftrace.txt | 5 +++++ include/linux/ring_buffer.h | 2 ++ kernel/trace/ring_buffer.c | 11 +++++++++++ kernel/trace/trace.c | 17 +++++++++++------ kernel/trace/trace.h | 1 + 5 files changed, 30 insertions(+), 6 deletions(-) diff --git a/Documentation/trace/ftrace.txt b/Documentation/trace/ftrace.txt index 67f1cc473257..1ebc24cf9a55 100644 --- a/Documentation/trace/ftrace.txt +++ b/Documentation/trace/ftrace.txt @@ -454,6 +454,11 @@ x494] <- /root/a.out[+0x4a8] <- /lib/libc-2.7.so[+0x1e1a6] latencies, as described in "Latency trace format". + overwrite - This controls what happens when the trace buffer is + full. If "1" (default), the oldest events are + discarded and overwritten. If "0", then the newest + events are discarded. 
+ ftrace_enabled -------------- diff --git a/include/linux/ring_buffer.h b/include/linux/ring_buffer.h index 8d3a2486544d..ab38ac80b0f9 100644 --- a/include/linux/ring_buffer.h +++ b/include/linux/ring_buffer.h @@ -100,6 +100,8 @@ void ring_buffer_free(struct ring_buffer *buffer); int ring_buffer_resize(struct ring_buffer *buffer, unsigned long size); +void ring_buffer_change_overwrite(struct ring_buffer *buffer, int val); + struct ring_buffer_event *ring_buffer_lock_reserve(struct ring_buffer *buffer, unsigned long length); int ring_buffer_unlock_commit(struct ring_buffer *buffer, diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index bd1c35a4fbcc..269db80a961e 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -1429,6 +1429,17 @@ int ring_buffer_resize(struct ring_buffer *buffer, unsigned long size) } EXPORT_SYMBOL_GPL(ring_buffer_resize); +void ring_buffer_change_overwrite(struct ring_buffer *buffer, int val) +{ + mutex_lock(&buffer->mutex); + if (val) + buffer->flags |= RB_FL_OVERWRITE; + else + buffer->flags &= ~RB_FL_OVERWRITE; + mutex_unlock(&buffer->mutex); +} +EXPORT_SYMBOL_GPL(ring_buffer_change_overwrite); + static inline void * __rb_data_page_index(struct buffer_data_page *bpage, unsigned index) { diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 8dc8da6733f9..85e3ee1e474e 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -41,8 +41,6 @@ #include "trace.h" #include "trace_output.h" -#define TRACE_BUFFER_FLAGS (RB_FL_OVERWRITE) - /* * On boot up, the ring buffer is set to the minimum size, so that * we do not waste memory on systems that are not using tracing. @@ -340,7 +338,7 @@ static DECLARE_WAIT_QUEUE_HEAD(trace_wait); /* trace_flags holds trace_options default values */ unsigned long trace_flags = TRACE_ITER_PRINT_PARENT | TRACE_ITER_PRINTK | TRACE_ITER_ANNOTATE | TRACE_ITER_CONTEXT_INFO | TRACE_ITER_SLEEP_TIME | - TRACE_ITER_GRAPH_TIME | TRACE_ITER_RECORD_CMD; + TRACE_ITER_GRAPH_TIME | TRACE_ITER_RECORD_CMD | TRACE_ITER_OVERWRITE; static int trace_stop_count; static DEFINE_SPINLOCK(tracing_start_lock); @@ -425,6 +423,7 @@ static const char *trace_options[] = { "sleep-time", "graph-time", "record-cmd", + "overwrite", NULL }; @@ -2529,6 +2528,9 @@ static void set_tracer_flags(unsigned int mask, int enabled) if (mask == TRACE_ITER_RECORD_CMD) trace_event_enable_cmd_record(enabled); + + if (mask == TRACE_ITER_OVERWRITE) + ring_buffer_change_overwrite(global_trace.buffer, enabled); } static ssize_t @@ -4555,9 +4557,11 @@ void ftrace_dump(enum ftrace_dump_mode oops_dump_mode) __init static int tracer_alloc_buffers(void) { int ring_buf_size; + enum ring_buffer_flags rb_flags; int i; int ret = -ENOMEM; + if (!alloc_cpumask_var(&tracing_buffer_mask, GFP_KERNEL)) goto out; @@ -4570,12 +4574,13 @@ __init static int tracer_alloc_buffers(void) else ring_buf_size = 1; + rb_flags = trace_flags & TRACE_ITER_OVERWRITE ? 
RB_FL_OVERWRITE : 0;
+
 cpumask_copy(tracing_buffer_mask, cpu_possible_mask);
 cpumask_copy(tracing_cpumask, cpu_all_mask);
 /* TODO: make the number of buffers hot pluggable with CPUS */
- global_trace.buffer = ring_buffer_alloc(ring_buf_size,
- TRACE_BUFFER_FLAGS);
+ global_trace.buffer = ring_buffer_alloc(ring_buf_size, rb_flags);
 if (!global_trace.buffer) {
 printk(KERN_ERR "tracer: failed to allocate ring buffer!\n");
 WARN_ON(1);
@@ -4585,7 +4590,7 @@ __init static int tracer_alloc_buffers(void)
 #ifdef CONFIG_TRACER_MAX_TRACE
- max_tr.buffer = ring_buffer_alloc(1, TRACE_BUFFER_FLAGS);
+ max_tr.buffer = ring_buffer_alloc(1, rb_flags);
 if (!max_tr.buffer) {
 printk(KERN_ERR "tracer: failed to allocate max ring buffer!\n");
 WARN_ON(1);
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 856e73cc1d3f..951d0b7e7062 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -606,6 +606,7 @@ enum trace_iterator_flags {
 TRACE_ITER_SLEEP_TIME = 0x40000,
 TRACE_ITER_GRAPH_TIME = 0x80000,
 TRACE_ITER_RECORD_CMD = 0x100000,
+ TRACE_ITER_OVERWRITE = 0x200000,
 };
 /*

From de29be5e712dc8b7eef2bef9417af3bb6a88e47a Mon Sep 17 00:00:00 2001
From: David Sharp
Date: Fri, 3 Dec 2010 16:13:16 -0800
Subject: [PATCH 564/706] ring-buffer: Remove unused #include

Signed-off-by: David Sharp
LKML-Reference: <1291421609-14665-3-git-send-email-dhsharp@google.com>
Signed-off-by: Steven Rostedt
---
 kernel/trace/ring_buffer.c | 1 -
 1 file changed, 1 deletion(-)

diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 269db80a961e..3237d961d905 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -5,7 +5,6 @@
 */
 #include
 #include
-#include
 #include
 #include
 #include

From f44f7f96a20af16f6f12e1c995576d6becf5f57b Mon Sep 17 00:00:00 2001
From: John Stultz
Date: Mon, 21 Feb 2011 22:58:51 -0800
Subject: [PATCH 565/706] RTC: Initialize kernel state from RTC

Mark Brown pointed out a corner case: that RTC alarms should be allowed to be persistent across reboots if the hardware supported it. The rework of the generic layer to virtualize the RTC alarm virtualized much of the alarm handling, and removed the code used to read the alarm time from the hardware. Mark noted that if we want the alarm to be persistent across reboots, we need to re-read the alarm value into the virtualized generic layer at boot up, so that the generic layer properly exposes that value.

This patch restores much of the earlier removed rtc_read_alarm code and wires it in so that we set the kernel's alarm value to what we find in the hardware at boot time.

NOTE: Not all hardware supports persistent RTC alarm state across system reset. rtc-cmos for example will keep the alarm time, but disables the AIE mode irq. Applications should not expect the RTC alarm to be valid after a system reset. We will preserve what we can, to represent the hardware state at boot, but it's not guaranteed.

Further, in the future, with multiplexed RTC alarms, the soonest alarm to fire may not be the one set via the /dev/rt ioctls. So an application may set the alarm with RTC_ALM_SET, but after a reset find that RTC_ALM_READ returns an earlier time. Again, we preserve what we can, but applications should not expect the RTC alarm state to persist across a system reset.

Big thanks to Mark for pointing out the issue! Thanks also to Marcelo for helping think through the solution.
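From userspace the restored state can be observed with a sketch like this (the device node and the minimal error handling are illustrative only):

  #include <fcntl.h>
  #include <stdio.h>
  #include <sys/ioctl.h>
  #include <linux/rtc.h>

  int main(void)
  {
          struct rtc_wkalrm alrm;
          int fd = open("/dev/rtc0", O_RDONLY);

          if (fd < 0 || ioctl(fd, RTC_WKALM_RD, &alrm) < 0)
                  return 1;
          /* after a reset this may differ from what was last set, as noted above */
          printf("alarm %02d:%02d:%02d enabled=%d pending=%d\n",
                 alrm.time.tm_hour, alrm.time.tm_min, alrm.time.tm_sec,
                 alrm.enabled, alrm.pending);
          return 0;
  }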
CC: Mark Brown CC: Marcelo Roberto Jimenez CC: Thomas Gleixner CC: Alessandro Zummo CC: rtc-linux@googlegroups.com Reported-by: Mark Brown Signed-off-by: John Stultz --- drivers/rtc/class.c | 7 ++ drivers/rtc/interface.c | 180 ++++++++++++++++++++++++++++++++++++++++ include/linux/rtc.h | 1 + 3 files changed, 188 insertions(+) diff --git a/drivers/rtc/class.c b/drivers/rtc/class.c index c404b61386bf..09b4437b3e61 100644 --- a/drivers/rtc/class.c +++ b/drivers/rtc/class.c @@ -117,6 +117,7 @@ struct rtc_device *rtc_device_register(const char *name, struct device *dev, struct module *owner) { struct rtc_device *rtc; + struct rtc_wkalrm alrm; int id, err; if (idr_pre_get(&rtc_idr, GFP_KERNEL) == 0) { @@ -166,6 +167,12 @@ struct rtc_device *rtc_device_register(const char *name, struct device *dev, rtc->pie_timer.function = rtc_pie_update_irq; rtc->pie_enabled = 0; + /* Check to see if there is an ALARM already set in hw */ + err = __rtc_read_alarm(rtc, &alrm); + + if (!err && !rtc_valid_tm(&alrm.time)) + rtc_set_alarm(rtc, &alrm); + strlcpy(rtc->name, name, RTC_DEVICE_NAME_SIZE); dev_set_name(&rtc->dev, "rtc%d", id); diff --git a/drivers/rtc/interface.c b/drivers/rtc/interface.c index cb2f0728fd70..8ec6b069a7f5 100644 --- a/drivers/rtc/interface.c +++ b/drivers/rtc/interface.c @@ -116,6 +116,186 @@ int rtc_set_mmss(struct rtc_device *rtc, unsigned long secs) } EXPORT_SYMBOL_GPL(rtc_set_mmss); +static int rtc_read_alarm_internal(struct rtc_device *rtc, struct rtc_wkalrm *alarm) +{ + int err; + + err = mutex_lock_interruptible(&rtc->ops_lock); + if (err) + return err; + + if (rtc->ops == NULL) + err = -ENODEV; + else if (!rtc->ops->read_alarm) + err = -EINVAL; + else { + memset(alarm, 0, sizeof(struct rtc_wkalrm)); + err = rtc->ops->read_alarm(rtc->dev.parent, alarm); + } + + mutex_unlock(&rtc->ops_lock); + return err; +} + +int __rtc_read_alarm(struct rtc_device *rtc, struct rtc_wkalrm *alarm) +{ + int err; + struct rtc_time before, now; + int first_time = 1; + unsigned long t_now, t_alm; + enum { none, day, month, year } missing = none; + unsigned days; + + /* The lower level RTC driver may return -1 in some fields, + * creating invalid alarm->time values, for reasons like: + * + * - The hardware may not be capable of filling them in; + * many alarms match only on time-of-day fields, not + * day/month/year calendar data. + * + * - Some hardware uses illegal values as "wildcard" match + * values, which non-Linux firmware (like a BIOS) may try + * to set up as e.g. "alarm 15 minutes after each hour". + * Linux uses only oneshot alarms. + * + * When we see that here, we deal with it by using values from + * a current RTC timestamp for any missing (-1) values. The + * RTC driver prevents "periodic alarm" modes. + * + * But this can be racey, because some fields of the RTC timestamp + * may have wrapped in the interval since we read the RTC alarm, + * which would lead to us inserting inconsistent values in place + * of the -1 fields. + * + * Reading the alarm and timestamp in the reverse sequence + * would have the same race condition, and not solve the issue. + * + * So, we must first read the RTC timestamp, + * then read the RTC alarm value, + * and then read a second RTC timestamp. + * + * If any fields of the second timestamp have changed + * when compared with the first timestamp, then we know + * our timestamp may be inconsistent with that used by + * the low-level rtc_read_alarm_internal() function. 
+ * + * So, when the two timestamps disagree, we just loop and do + * the process again to get a fully consistent set of values. + * + * This could all instead be done in the lower level driver, + * but since more than one lower level RTC implementation needs it, + * then it's probably best best to do it here instead of there.. + */ + + /* Get the "before" timestamp */ + err = rtc_read_time(rtc, &before); + if (err < 0) + return err; + do { + if (!first_time) + memcpy(&before, &now, sizeof(struct rtc_time)); + first_time = 0; + + /* get the RTC alarm values, which may be incomplete */ + err = rtc_read_alarm_internal(rtc, alarm); + if (err) + return err; + + /* full-function RTCs won't have such missing fields */ + if (rtc_valid_tm(&alarm->time) == 0) + return 0; + + /* get the "after" timestamp, to detect wrapped fields */ + err = rtc_read_time(rtc, &now); + if (err < 0) + return err; + + /* note that tm_sec is a "don't care" value here: */ + } while ( before.tm_min != now.tm_min + || before.tm_hour != now.tm_hour + || before.tm_mon != now.tm_mon + || before.tm_year != now.tm_year); + + /* Fill in the missing alarm fields using the timestamp; we + * know there's at least one since alarm->time is invalid. + */ + if (alarm->time.tm_sec == -1) + alarm->time.tm_sec = now.tm_sec; + if (alarm->time.tm_min == -1) + alarm->time.tm_min = now.tm_min; + if (alarm->time.tm_hour == -1) + alarm->time.tm_hour = now.tm_hour; + + /* For simplicity, only support date rollover for now */ + if (alarm->time.tm_mday == -1) { + alarm->time.tm_mday = now.tm_mday; + missing = day; + } + if (alarm->time.tm_mon == -1) { + alarm->time.tm_mon = now.tm_mon; + if (missing == none) + missing = month; + } + if (alarm->time.tm_year == -1) { + alarm->time.tm_year = now.tm_year; + if (missing == none) + missing = year; + } + + /* with luck, no rollover is needed */ + rtc_tm_to_time(&now, &t_now); + rtc_tm_to_time(&alarm->time, &t_alm); + if (t_now < t_alm) + goto done; + + switch (missing) { + + /* 24 hour rollover ... if it's now 10am Monday, an alarm that + * that will trigger at 5am will do so at 5am Tuesday, which + * could also be in the next month or year. This is a common + * case, especially for PCs. + */ + case day: + dev_dbg(&rtc->dev, "alarm rollover: %s\n", "day"); + t_alm += 24 * 60 * 60; + rtc_time_to_tm(t_alm, &alarm->time); + break; + + /* Month rollover ... if it's the 31th, an alarm on the 3rd will + * be next month. An alarm matching on the 30th, 29th, or 28th + * may end up in the month after that! Many newer PCs support + * this type of alarm. + */ + case month: + dev_dbg(&rtc->dev, "alarm rollover: %s\n", "month"); + do { + if (alarm->time.tm_mon < 11) + alarm->time.tm_mon++; + else { + alarm->time.tm_mon = 0; + alarm->time.tm_year++; + } + days = rtc_month_days(alarm->time.tm_mon, + alarm->time.tm_year); + } while (days < alarm->time.tm_mday); + break; + + /* Year rollover ... easy except for leap years! 
*/ + case year: + dev_dbg(&rtc->dev, "alarm rollover: %s\n", "year"); + do { + alarm->time.tm_year++; + } while (rtc_valid_tm(&alarm->time) != 0); + break; + + default: + dev_warn(&rtc->dev, "alarm rollover not handled\n"); + } + +done: + return 0; +} + int rtc_read_alarm(struct rtc_device *rtc, struct rtc_wkalrm *alarm) { int err; diff --git a/include/linux/rtc.h b/include/linux/rtc.h index 89c3e5182991..db3832d5f280 100644 --- a/include/linux/rtc.h +++ b/include/linux/rtc.h @@ -227,6 +227,7 @@ extern void rtc_device_unregister(struct rtc_device *rtc); extern int rtc_read_time(struct rtc_device *rtc, struct rtc_time *tm); extern int rtc_set_time(struct rtc_device *rtc, struct rtc_time *tm); extern int rtc_set_mmss(struct rtc_device *rtc, unsigned long secs); +int __rtc_read_alarm(struct rtc_device *rtc, struct rtc_wkalrm *alarm); extern int rtc_read_alarm(struct rtc_device *rtc, struct rtc_wkalrm *alrm); extern int rtc_set_alarm(struct rtc_device *rtc, From 80d4bb515b78f38738f3378fd1be6039063ab040 Mon Sep 17 00:00:00 2001 From: John Stultz Date: Thu, 3 Feb 2011 11:34:50 -0800 Subject: [PATCH 566/706] RTC: Cleanup rtc_class_ops->irq_set_state With PIE mode interrupts now emulated in generic code via an hrtimer, no one calls rtc_class_ops->irq_set_state(), so this patch removes it along with driver implementations. CC: Thomas Gleixner CC: Alessandro Zummo CC: Marcelo Roberto Jimenez CC: rtc-linux@googlegroups.com Signed-off-by: John Stultz --- drivers/rtc/rtc-cmos.c | 20 -------------------- drivers/rtc/rtc-davinci.c | 34 ---------------------------------- drivers/rtc/rtc-mrst.c | 20 -------------------- drivers/rtc/rtc-pl031.c | 34 ---------------------------------- drivers/rtc/rtc-pxa.c | 13 ------------- drivers/rtc/rtc-rx8025.c | 25 ------------------------- drivers/rtc/rtc-s3c.c | 32 -------------------------------- drivers/rtc/rtc-sa1100.c | 18 ------------------ drivers/rtc/rtc-sh.c | 1 - drivers/rtc/rtc-vr41xx.c | 11 ----------- include/linux/rtc.h | 1 - 11 files changed, 209 deletions(-) diff --git a/drivers/rtc/rtc-cmos.c b/drivers/rtc/rtc-cmos.c index c7ff8df347e7..de632e793d46 100644 --- a/drivers/rtc/rtc-cmos.c +++ b/drivers/rtc/rtc-cmos.c @@ -400,25 +400,6 @@ static int cmos_irq_set_freq(struct device *dev, int freq) return 0; } -static int cmos_irq_set_state(struct device *dev, int enabled) -{ - struct cmos_rtc *cmos = dev_get_drvdata(dev); - unsigned long flags; - - if (!is_valid_irq(cmos->irq)) - return -ENXIO; - - spin_lock_irqsave(&rtc_lock, flags); - - if (enabled) - cmos_irq_enable(cmos, RTC_PIE); - else - cmos_irq_disable(cmos, RTC_PIE); - - spin_unlock_irqrestore(&rtc_lock, flags); - return 0; -} - static int cmos_alarm_irq_enable(struct device *dev, unsigned int enabled) { struct cmos_rtc *cmos = dev_get_drvdata(dev); @@ -502,7 +483,6 @@ static const struct rtc_class_ops cmos_rtc_ops = { .set_alarm = cmos_set_alarm, .proc = cmos_procfs, .irq_set_freq = cmos_irq_set_freq, - .irq_set_state = cmos_irq_set_state, .alarm_irq_enable = cmos_alarm_irq_enable, .update_irq_enable = cmos_update_irq_enable, }; diff --git a/drivers/rtc/rtc-davinci.c b/drivers/rtc/rtc-davinci.c index 34647fc1ee98..92da73d40e13 100644 --- a/drivers/rtc/rtc-davinci.c +++ b/drivers/rtc/rtc-davinci.c @@ -473,39 +473,6 @@ static int davinci_rtc_set_alarm(struct device *dev, struct rtc_wkalrm *alm) return 0; } -static int davinci_rtc_irq_set_state(struct device *dev, int enabled) -{ - struct davinci_rtc *davinci_rtc = dev_get_drvdata(dev); - unsigned long flags; - u8 rtc_ctrl; - - 
spin_lock_irqsave(&davinci_rtc_lock, flags); - - rtc_ctrl = rtcss_read(davinci_rtc, PRTCSS_RTC_CTRL); - - if (enabled) { - while (rtcss_read(davinci_rtc, PRTCSS_RTC_CTRL) - & PRTCSS_RTC_CTRL_WDTBUS) - cpu_relax(); - - rtc_ctrl |= PRTCSS_RTC_CTRL_TE; - rtcss_write(davinci_rtc, rtc_ctrl, PRTCSS_RTC_CTRL); - - rtcss_write(davinci_rtc, 0x0, PRTCSS_RTC_CLKC_CNT); - - rtc_ctrl |= PRTCSS_RTC_CTRL_TIEN | - PRTCSS_RTC_CTRL_TMMD | - PRTCSS_RTC_CTRL_TMRFLG; - } else - rtc_ctrl &= ~PRTCSS_RTC_CTRL_TIEN; - - rtcss_write(davinci_rtc, rtc_ctrl, PRTCSS_RTC_CTRL); - - spin_unlock_irqrestore(&davinci_rtc_lock, flags); - - return 0; -} - static int davinci_rtc_irq_set_freq(struct device *dev, int freq) { struct davinci_rtc *davinci_rtc = dev_get_drvdata(dev); @@ -529,7 +496,6 @@ static struct rtc_class_ops davinci_rtc_ops = { .alarm_irq_enable = davinci_rtc_alarm_irq_enable, .read_alarm = davinci_rtc_read_alarm, .set_alarm = davinci_rtc_set_alarm, - .irq_set_state = davinci_rtc_irq_set_state, .irq_set_freq = davinci_rtc_irq_set_freq, }; diff --git a/drivers/rtc/rtc-mrst.c b/drivers/rtc/rtc-mrst.c index 1db62db8469d..4db96fa306fc 100644 --- a/drivers/rtc/rtc-mrst.c +++ b/drivers/rtc/rtc-mrst.c @@ -236,25 +236,6 @@ static int mrst_set_alarm(struct device *dev, struct rtc_wkalrm *t) return 0; } -static int mrst_irq_set_state(struct device *dev, int enabled) -{ - struct mrst_rtc *mrst = dev_get_drvdata(dev); - unsigned long flags; - - if (!mrst->irq) - return -ENXIO; - - spin_lock_irqsave(&rtc_lock, flags); - - if (enabled) - mrst_irq_enable(mrst, RTC_PIE); - else - mrst_irq_disable(mrst, RTC_PIE); - - spin_unlock_irqrestore(&rtc_lock, flags); - return 0; -} - /* Currently, the vRTC doesn't support UIE ON/OFF */ static int mrst_rtc_alarm_irq_enable(struct device *dev, unsigned int enabled) { @@ -301,7 +282,6 @@ static const struct rtc_class_ops mrst_rtc_ops = { .read_alarm = mrst_read_alarm, .set_alarm = mrst_set_alarm, .proc = mrst_procfs, - .irq_set_state = mrst_irq_set_state, .alarm_irq_enable = mrst_rtc_alarm_irq_enable, }; diff --git a/drivers/rtc/rtc-pl031.c b/drivers/rtc/rtc-pl031.c index b7a6690e5b35..0e7c15b24c1c 100644 --- a/drivers/rtc/rtc-pl031.c +++ b/drivers/rtc/rtc-pl031.c @@ -293,38 +293,6 @@ static int pl031_set_alarm(struct device *dev, struct rtc_wkalrm *alarm) return ret; } -/* Periodic interrupt is only available in ST variants. */ -static int pl031_irq_set_state(struct device *dev, int enabled) -{ - struct pl031_local *ldata = dev_get_drvdata(dev); - - if (enabled == 1) { - /* Clear any pending timer interrupt. */ - writel(RTC_BIT_PI, ldata->base + RTC_ICR); - - writel(readl(ldata->base + RTC_IMSC) | RTC_BIT_PI, - ldata->base + RTC_IMSC); - - /* Now start the timer */ - writel(readl(ldata->base + RTC_TCR) | RTC_TCR_EN, - ldata->base + RTC_TCR); - - } else { - writel(readl(ldata->base + RTC_IMSC) & (~RTC_BIT_PI), - ldata->base + RTC_IMSC); - - /* Also stop the timer */ - writel(readl(ldata->base + RTC_TCR) & (~RTC_TCR_EN), - ldata->base + RTC_TCR); - } - /* Wait at least 1 RTC32 clock cycle to ensure next access - * to RTC_TCR will succeed. 
- */ - udelay(40); - - return 0; -} - static int pl031_irq_set_freq(struct device *dev, int freq) { struct pl031_local *ldata = dev_get_drvdata(dev); @@ -440,7 +408,6 @@ static struct rtc_class_ops stv1_pl031_ops = { .read_alarm = pl031_read_alarm, .set_alarm = pl031_set_alarm, .alarm_irq_enable = pl031_alarm_irq_enable, - .irq_set_state = pl031_irq_set_state, .irq_set_freq = pl031_irq_set_freq, }; @@ -451,7 +418,6 @@ static struct rtc_class_ops stv2_pl031_ops = { .read_alarm = pl031_stv2_read_alarm, .set_alarm = pl031_stv2_set_alarm, .alarm_irq_enable = pl031_alarm_irq_enable, - .irq_set_state = pl031_irq_set_state, .irq_set_freq = pl031_irq_set_freq, }; diff --git a/drivers/rtc/rtc-pxa.c b/drivers/rtc/rtc-pxa.c index 29e867a1aaa8..b216ae5389c8 100644 --- a/drivers/rtc/rtc-pxa.c +++ b/drivers/rtc/rtc-pxa.c @@ -223,18 +223,6 @@ static int pxa_periodic_irq_set_freq(struct device *dev, int freq) return 0; } -static int pxa_periodic_irq_set_state(struct device *dev, int enabled) -{ - struct pxa_rtc *pxa_rtc = dev_get_drvdata(dev); - - if (enabled) - rtsr_set_bits(pxa_rtc, RTSR_PIALE | RTSR_PICE); - else - rtsr_clear_bits(pxa_rtc, RTSR_PIALE | RTSR_PICE); - - return 0; -} - static int pxa_alarm_irq_enable(struct device *dev, unsigned int enabled) { struct pxa_rtc *pxa_rtc = dev_get_drvdata(dev); @@ -348,7 +336,6 @@ static const struct rtc_class_ops pxa_rtc_ops = { .alarm_irq_enable = pxa_alarm_irq_enable, .update_irq_enable = pxa_update_irq_enable, .proc = pxa_rtc_proc, - .irq_set_state = pxa_periodic_irq_set_state, .irq_set_freq = pxa_periodic_irq_set_freq, }; diff --git a/drivers/rtc/rtc-rx8025.c b/drivers/rtc/rtc-rx8025.c index af32a62e12a8..fde172fb2abe 100644 --- a/drivers/rtc/rtc-rx8025.c +++ b/drivers/rtc/rtc-rx8025.c @@ -424,37 +424,12 @@ static int rx8025_alarm_irq_enable(struct device *dev, unsigned int enabled) return 0; } -static int rx8025_irq_set_state(struct device *dev, int enabled) -{ - struct i2c_client *client = to_i2c_client(dev); - struct rx8025_data *rx8025 = i2c_get_clientdata(client); - int ctrl1; - int err; - - if (client->irq <= 0) - return -ENXIO; - - ctrl1 = rx8025->ctrl1 & ~RX8025_BIT_CTRL1_CT; - if (enabled) - ctrl1 |= RX8025_BIT_CTRL1_CT_1HZ; - if (ctrl1 != rx8025->ctrl1) { - rx8025->ctrl1 = ctrl1; - err = rx8025_write_reg(rx8025->client, RX8025_REG_CTRL1, - rx8025->ctrl1); - if (err) - return err; - } - - return 0; -} - static struct rtc_class_ops rx8025_rtc_ops = { .read_time = rx8025_get_time, .set_time = rx8025_set_time, .read_alarm = rx8025_read_alarm, .set_alarm = rx8025_set_alarm, .alarm_irq_enable = rx8025_alarm_irq_enable, - .irq_set_state = rx8025_irq_set_state, }; /* diff --git a/drivers/rtc/rtc-s3c.c b/drivers/rtc/rtc-s3c.c index b80fa2882408..80fb7e72f9d9 100644 --- a/drivers/rtc/rtc-s3c.c +++ b/drivers/rtc/rtc-s3c.c @@ -93,37 +93,6 @@ static int s3c_rtc_setaie(struct device *dev, unsigned int enabled) return 0; } -static int s3c_rtc_setpie(struct device *dev, int enabled) -{ - unsigned int tmp; - - pr_debug("%s: pie=%d\n", __func__, enabled); - - spin_lock_irq(&s3c_rtc_pie_lock); - - if (s3c_rtc_cpu_type == TYPE_S3C64XX) { - tmp = readw(s3c_rtc_base + S3C2410_RTCCON); - tmp &= ~S3C64XX_RTCCON_TICEN; - - if (enabled) - tmp |= S3C64XX_RTCCON_TICEN; - - writew(tmp, s3c_rtc_base + S3C2410_RTCCON); - } else { - tmp = readb(s3c_rtc_base + S3C2410_TICNT); - tmp &= ~S3C2410_TICNT_ENABLE; - - if (enabled) - tmp |= S3C2410_TICNT_ENABLE; - - writeb(tmp, s3c_rtc_base + S3C2410_TICNT); - } - - spin_unlock_irq(&s3c_rtc_pie_lock); - - return 0; -} - static int 
s3c_rtc_setfreq(struct device *dev, int freq) { struct platform_device *pdev = to_platform_device(dev); @@ -380,7 +349,6 @@ static const struct rtc_class_ops s3c_rtcops = { .read_alarm = s3c_rtc_getalarm, .set_alarm = s3c_rtc_setalarm, .irq_set_freq = s3c_rtc_setfreq, - .irq_set_state = s3c_rtc_setpie, .proc = s3c_rtc_proc, .alarm_irq_enable = s3c_rtc_setaie, }; diff --git a/drivers/rtc/rtc-sa1100.c b/drivers/rtc/rtc-sa1100.c index 5dfe5ffcb0d3..d47b3fc9830f 100644 --- a/drivers/rtc/rtc-sa1100.c +++ b/drivers/rtc/rtc-sa1100.c @@ -171,23 +171,6 @@ static int sa1100_irq_set_freq(struct device *dev, int freq) static int rtc_timer1_count; -static int sa1100_irq_set_state(struct device *dev, int enabled) -{ - spin_lock_irq(&sa1100_rtc_lock); - if (enabled) { - struct rtc_device *rtc = (struct rtc_device *)dev; - - OSMR1 = timer_freq / rtc->irq_freq + OSCR; - OIER |= OIER_E1; - rtc_timer1_count = 1; - } else { - OIER &= ~OIER_E1; - } - spin_unlock_irq(&sa1100_rtc_lock); - - return 0; -} - static inline int sa1100_timer1_retrigger(struct rtc_device *rtc) { unsigned long diff; @@ -410,7 +393,6 @@ static const struct rtc_class_ops sa1100_rtc_ops = { .set_alarm = sa1100_rtc_set_alarm, .proc = sa1100_rtc_proc, .irq_set_freq = sa1100_irq_set_freq, - .irq_set_state = sa1100_irq_set_state, .alarm_irq_enable = sa1100_rtc_alarm_irq_enable, }; diff --git a/drivers/rtc/rtc-sh.c b/drivers/rtc/rtc-sh.c index 93314a9e7fa9..ff50a8bc13f6 100644 --- a/drivers/rtc/rtc-sh.c +++ b/drivers/rtc/rtc-sh.c @@ -603,7 +603,6 @@ static struct rtc_class_ops sh_rtc_ops = { .set_time = sh_rtc_set_time, .read_alarm = sh_rtc_read_alarm, .set_alarm = sh_rtc_set_alarm, - .irq_set_state = sh_rtc_irq_set_state, .irq_set_freq = sh_rtc_irq_set_freq, .proc = sh_rtc_proc, .alarm_irq_enable = sh_rtc_alarm_irq_enable, diff --git a/drivers/rtc/rtc-vr41xx.c b/drivers/rtc/rtc-vr41xx.c index 769190ac6d11..86f14909f9db 100644 --- a/drivers/rtc/rtc-vr41xx.c +++ b/drivers/rtc/rtc-vr41xx.c @@ -227,16 +227,6 @@ static int vr41xx_rtc_irq_set_freq(struct device *dev, int freq) return 0; } -static int vr41xx_rtc_irq_set_state(struct device *dev, int enabled) -{ - if (enabled) - enable_irq(pie_irq); - else - disable_irq(pie_irq); - - return 0; -} - static int vr41xx_rtc_ioctl(struct device *dev, unsigned int cmd, unsigned long arg) { switch (cmd) { @@ -309,7 +299,6 @@ static const struct rtc_class_ops vr41xx_rtc_ops = { .read_alarm = vr41xx_rtc_read_alarm, .set_alarm = vr41xx_rtc_set_alarm, .irq_set_freq = vr41xx_rtc_irq_set_freq, - .irq_set_state = vr41xx_rtc_irq_set_state, }; static int __devinit rtc_probe(struct platform_device *pdev) diff --git a/include/linux/rtc.h b/include/linux/rtc.h index db3832d5f280..0e2063acc260 100644 --- a/include/linux/rtc.h +++ b/include/linux/rtc.h @@ -148,7 +148,6 @@ struct rtc_class_ops { int (*set_alarm)(struct device *, struct rtc_wkalrm *); int (*proc)(struct device *, struct seq_file *); int (*set_mmss)(struct device *, unsigned long secs); - int (*irq_set_state)(struct device *, int enabled); int (*irq_set_freq)(struct device *, int freq); int (*read_callback)(struct device *, int data); int (*alarm_irq_enable)(struct device *, unsigned int enabled); From 696160fec162601d06940862b5b3aa4460344c1b Mon Sep 17 00:00:00 2001 From: John Stultz Date: Thu, 3 Feb 2011 12:02:07 -0800 Subject: [PATCH 567/706] RTC: Cleanup rtc_class_ops->irq_set_freq() With the generic rtc code now emulating PIE mode irqs via an hrtimer, no one calls the rtc_class_ops->irq_set_freq call. 
This patch removes the hook and deletes the driver functions if no one else calls them. CC: Thomas Gleixner CC: Alessandro Zummo CC: Marcelo Roberto Jimenez CC: rtc-linux@googlegroups.com Signed-off-by: John Stultz --- drivers/rtc/rtc-cmos.c | 26 -------------------------- drivers/rtc/rtc-davinci.c | 17 ----------------- drivers/rtc/rtc-pl031.c | 21 --------------------- drivers/rtc/rtc-pxa.c | 15 --------------- drivers/rtc/rtc-s3c.c | 1 - drivers/rtc/rtc-sa1100.c | 1 - drivers/rtc/rtc-sh.c | 1 - drivers/rtc/rtc-vr41xx.c | 21 --------------------- include/linux/rtc.h | 2 -- 9 files changed, 105 deletions(-) diff --git a/drivers/rtc/rtc-cmos.c b/drivers/rtc/rtc-cmos.c index de632e793d46..bdb1f8e2042a 100644 --- a/drivers/rtc/rtc-cmos.c +++ b/drivers/rtc/rtc-cmos.c @@ -375,31 +375,6 @@ static int cmos_set_alarm(struct device *dev, struct rtc_wkalrm *t) return 0; } -static int cmos_irq_set_freq(struct device *dev, int freq) -{ - struct cmos_rtc *cmos = dev_get_drvdata(dev); - int f; - unsigned long flags; - - if (!is_valid_irq(cmos->irq)) - return -ENXIO; - - if (!is_power_of_2(freq)) - return -EINVAL; - /* 0 = no irqs; 1 = 2^15 Hz ... 15 = 2^0 Hz */ - f = ffs(freq); - if (f-- > 16) - return -EINVAL; - f = 16 - f; - - spin_lock_irqsave(&rtc_lock, flags); - hpet_set_periodic_freq(freq); - CMOS_WRITE(RTC_REF_CLCK_32KHZ | f, RTC_FREQ_SELECT); - spin_unlock_irqrestore(&rtc_lock, flags); - - return 0; -} - static int cmos_alarm_irq_enable(struct device *dev, unsigned int enabled) { struct cmos_rtc *cmos = dev_get_drvdata(dev); @@ -482,7 +457,6 @@ static const struct rtc_class_ops cmos_rtc_ops = { .read_alarm = cmos_read_alarm, .set_alarm = cmos_set_alarm, .proc = cmos_procfs, - .irq_set_freq = cmos_irq_set_freq, .alarm_irq_enable = cmos_alarm_irq_enable, .update_irq_enable = cmos_update_irq_enable, }; diff --git a/drivers/rtc/rtc-davinci.c b/drivers/rtc/rtc-davinci.c index 92da73d40e13..dfd98a235ad1 100644 --- a/drivers/rtc/rtc-davinci.c +++ b/drivers/rtc/rtc-davinci.c @@ -473,22 +473,6 @@ static int davinci_rtc_set_alarm(struct device *dev, struct rtc_wkalrm *alm) return 0; } -static int davinci_rtc_irq_set_freq(struct device *dev, int freq) -{ - struct davinci_rtc *davinci_rtc = dev_get_drvdata(dev); - unsigned long flags; - u16 tmr_counter = (0x8000 >> (ffs(freq) - 1)); - - spin_lock_irqsave(&davinci_rtc_lock, flags); - - rtcss_write(davinci_rtc, tmr_counter & 0xFF, PRTCSS_RTC_TMR0); - rtcss_write(davinci_rtc, (tmr_counter & 0xFF00) >> 8, PRTCSS_RTC_TMR1); - - spin_unlock_irqrestore(&davinci_rtc_lock, flags); - - return 0; -} - static struct rtc_class_ops davinci_rtc_ops = { .ioctl = davinci_rtc_ioctl, .read_time = davinci_rtc_read_time, @@ -496,7 +480,6 @@ static struct rtc_class_ops davinci_rtc_ops = { .alarm_irq_enable = davinci_rtc_alarm_irq_enable, .read_alarm = davinci_rtc_read_alarm, .set_alarm = davinci_rtc_set_alarm, - .irq_set_freq = davinci_rtc_irq_set_freq, }; static int __init davinci_rtc_probe(struct platform_device *pdev) diff --git a/drivers/rtc/rtc-pl031.c b/drivers/rtc/rtc-pl031.c index 0e7c15b24c1c..d829ea63c4fb 100644 --- a/drivers/rtc/rtc-pl031.c +++ b/drivers/rtc/rtc-pl031.c @@ -293,25 +293,6 @@ static int pl031_set_alarm(struct device *dev, struct rtc_wkalrm *alarm) return ret; } -static int pl031_irq_set_freq(struct device *dev, int freq) -{ - struct pl031_local *ldata = dev_get_drvdata(dev); - - /* Cant set timer if it is already enabled */ - if (readl(ldata->base + RTC_TCR) & RTC_TCR_EN) { - dev_err(dev, "can't change frequency while timer enabled\n"); - return 
-EINVAL; - } - - /* If self start bit in RTC_TCR is set timer will start here, - * but we never set that bit. Instead we start the timer when - * set_state is called with enabled == 1. - */ - writel(RTC_TIMER_FREQ / freq, ldata->base + RTC_TLR); - - return 0; -} - static int pl031_remove(struct amba_device *adev) { struct pl031_local *ldata = dev_get_drvdata(&adev->dev); @@ -408,7 +389,6 @@ static struct rtc_class_ops stv1_pl031_ops = { .read_alarm = pl031_read_alarm, .set_alarm = pl031_set_alarm, .alarm_irq_enable = pl031_alarm_irq_enable, - .irq_set_freq = pl031_irq_set_freq, }; /* And the second ST derivative */ @@ -418,7 +398,6 @@ static struct rtc_class_ops stv2_pl031_ops = { .read_alarm = pl031_stv2_read_alarm, .set_alarm = pl031_stv2_set_alarm, .alarm_irq_enable = pl031_alarm_irq_enable, - .irq_set_freq = pl031_irq_set_freq, }; static struct amba_id pl031_ids[] = { diff --git a/drivers/rtc/rtc-pxa.c b/drivers/rtc/rtc-pxa.c index b216ae5389c8..a1fdc802598a 100644 --- a/drivers/rtc/rtc-pxa.c +++ b/drivers/rtc/rtc-pxa.c @@ -209,20 +209,6 @@ static void pxa_rtc_release(struct device *dev) free_irq(pxa_rtc->irq_1Hz, dev); } -static int pxa_periodic_irq_set_freq(struct device *dev, int freq) -{ - struct pxa_rtc *pxa_rtc = dev_get_drvdata(dev); - int period_ms; - - if (freq < 1 || freq > MAXFREQ_PERIODIC) - return -EINVAL; - - period_ms = 1000 / freq; - rtc_writel(pxa_rtc, PIAR, period_ms); - - return 0; -} - static int pxa_alarm_irq_enable(struct device *dev, unsigned int enabled) { struct pxa_rtc *pxa_rtc = dev_get_drvdata(dev); @@ -336,7 +322,6 @@ static const struct rtc_class_ops pxa_rtc_ops = { .alarm_irq_enable = pxa_alarm_irq_enable, .update_irq_enable = pxa_update_irq_enable, .proc = pxa_rtc_proc, - .irq_set_freq = pxa_periodic_irq_set_freq, }; static int __init pxa_rtc_probe(struct platform_device *pdev) diff --git a/drivers/rtc/rtc-s3c.c b/drivers/rtc/rtc-s3c.c index 80fb7e72f9d9..714964913e5e 100644 --- a/drivers/rtc/rtc-s3c.c +++ b/drivers/rtc/rtc-s3c.c @@ -348,7 +348,6 @@ static const struct rtc_class_ops s3c_rtcops = { .set_time = s3c_rtc_settime, .read_alarm = s3c_rtc_getalarm, .set_alarm = s3c_rtc_setalarm, - .irq_set_freq = s3c_rtc_setfreq, .proc = s3c_rtc_proc, .alarm_irq_enable = s3c_rtc_setaie, }; diff --git a/drivers/rtc/rtc-sa1100.c b/drivers/rtc/rtc-sa1100.c index d47b3fc9830f..d1a2b0bc3b24 100644 --- a/drivers/rtc/rtc-sa1100.c +++ b/drivers/rtc/rtc-sa1100.c @@ -392,7 +392,6 @@ static const struct rtc_class_ops sa1100_rtc_ops = { .read_alarm = sa1100_rtc_read_alarm, .set_alarm = sa1100_rtc_set_alarm, .proc = sa1100_rtc_proc, - .irq_set_freq = sa1100_irq_set_freq, .alarm_irq_enable = sa1100_rtc_alarm_irq_enable, }; diff --git a/drivers/rtc/rtc-sh.c b/drivers/rtc/rtc-sh.c index ff50a8bc13f6..148544979a54 100644 --- a/drivers/rtc/rtc-sh.c +++ b/drivers/rtc/rtc-sh.c @@ -603,7 +603,6 @@ static struct rtc_class_ops sh_rtc_ops = { .set_time = sh_rtc_set_time, .read_alarm = sh_rtc_read_alarm, .set_alarm = sh_rtc_set_alarm, - .irq_set_freq = sh_rtc_irq_set_freq, .proc = sh_rtc_proc, .alarm_irq_enable = sh_rtc_alarm_irq_enable, }; diff --git a/drivers/rtc/rtc-vr41xx.c b/drivers/rtc/rtc-vr41xx.c index 86f14909f9db..c5698cda366a 100644 --- a/drivers/rtc/rtc-vr41xx.c +++ b/drivers/rtc/rtc-vr41xx.c @@ -207,26 +207,6 @@ static int vr41xx_rtc_set_alarm(struct device *dev, struct rtc_wkalrm *wkalrm) return 0; } -static int vr41xx_rtc_irq_set_freq(struct device *dev, int freq) -{ - u64 count; - - if (!is_power_of_2(freq)) - return -EINVAL; - count = RTC_FREQUENCY; - do_div(count, 
freq);
-
-	spin_lock_irq(&rtc_lock);
-
-	periodic_count = count;
-	rtc1_write(RTCL1LREG, periodic_count);
-	rtc1_write(RTCL1HREG, periodic_count >> 16);
-
-	spin_unlock_irq(&rtc_lock);
-
-	return 0;
-}
-
 static int vr41xx_rtc_ioctl(struct device *dev, unsigned int cmd, unsigned long arg)
 {
 	switch (cmd) {
@@ -298,7 +278,6 @@ static const struct rtc_class_ops vr41xx_rtc_ops = {
 	.set_time	= vr41xx_rtc_set_time,
 	.read_alarm	= vr41xx_rtc_read_alarm,
 	.set_alarm	= vr41xx_rtc_set_alarm,
-	.irq_set_freq	= vr41xx_rtc_irq_set_freq,
 };
 
 static int __devinit rtc_probe(struct platform_device *pdev)
diff --git a/include/linux/rtc.h b/include/linux/rtc.h
index 0e2063acc260..741a51cc4460 100644
--- a/include/linux/rtc.h
+++ b/include/linux/rtc.h
@@ -133,7 +133,6 @@ extern struct class *rtc_class;
  * The (current) exceptions are mostly filesystem hooks:
  *   - the proc() hook for procfs
  *   - non-ioctl() chardev hooks:  open(), release(), read_callback()
- *   - periodic irq calls: irq_set_state(), irq_set_freq()
  *
  * REVISIT those periodic irq calls *do* have ops_lock when they're
  * issued through ioctl() ...
@@ -148,7 +147,6 @@ struct rtc_class_ops {
 	int (*set_alarm)(struct device *, struct rtc_wkalrm *);
 	int (*proc)(struct device *, struct seq_file *);
 	int (*set_mmss)(struct device *, unsigned long secs);
-	int (*irq_set_freq)(struct device *, int freq);
 	int (*read_callback)(struct device *, int data);
 	int (*alarm_irq_enable)(struct device *, unsigned int enabled);
 	int (*update_irq_enable)(struct device *, unsigned int enabled);

From 51ba60c5bb3b0f71bee26404ddc22d8e4109e88a Mon Sep 17 00:00:00 2001
From: John Stultz
Date: Thu, 3 Feb 2011 12:13:50 -0800
Subject: [PATCH 568/706] RTC: Cleanup rtc_class_ops->update_irq_enable()

Now that the generic code handles UIE mode irqs via periodic
alarm interrupts, no one calls the rtc_class_ops->update_irq_enable()
method anymore.

This patch removes the driver hooks and the driver implementations
of update_irq_enable(), since nothing else calls them.
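For background, user space is unaffected by the cleanup: update IRQs
still arrive through the same chardev interface, now fed by the
emulated path. A minimal consumer sketch (assuming an RTC exposed as
/dev/rtc0; this program is illustrative only, not part of the patch):

#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <linux/rtc.h>

int main(void)
{
	unsigned long data;
	int fd = open("/dev/rtc0", O_RDONLY);

	if (fd < 0 || ioctl(fd, RTC_UIE_ON, 0) < 0)
		return 1;
	/* read() blocks until the next (now emulated) update interrupt;
	 * the low byte holds event flags, the rest an event count. */
	if (read(fd, &data, sizeof(data)) == sizeof(data))
		printf("update irqs since last read: %lu\n", data >> 8);
	ioctl(fd, RTC_UIE_OFF, 0);
	close(fd);
	return 0;
}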
CC: Thomas Gleixner CC: Alessandro Zummo CC: Marcelo Roberto Jimenez CC: rtc-linux@googlegroups.com Signed-off-by: John Stultz --- drivers/rtc/rtc-cmos.c | 20 -------------------- drivers/rtc/rtc-ds1511.c | 17 ----------------- drivers/rtc/rtc-ds1553.c | 17 ----------------- drivers/rtc/rtc-ds3232.c | 18 ------------------ drivers/rtc/rtc-jz4740.c | 7 ------- drivers/rtc/rtc-mc13xxx.c | 7 ------- drivers/rtc/rtc-mpc5121.c | 20 -------------------- drivers/rtc/rtc-mxc.c | 7 ------- drivers/rtc/rtc-nuc900.c | 15 --------------- drivers/rtc/rtc-pcap.c | 6 ------ drivers/rtc/rtc-pcf50633.c | 22 +--------------------- drivers/rtc/rtc-pxa.c | 16 ---------------- drivers/rtc/rtc-stmp3xxx.c | 15 --------------- drivers/rtc/rtc-twl.c | 13 ------------- drivers/rtc/rtc-wm831x.c | 16 ---------------- drivers/rtc/rtc-wm8350.c | 21 --------------------- include/linux/rtc.h | 1 - 17 files changed, 1 insertion(+), 237 deletions(-) diff --git a/drivers/rtc/rtc-cmos.c b/drivers/rtc/rtc-cmos.c index bdb1f8e2042a..dc2a0ba969ce 100644 --- a/drivers/rtc/rtc-cmos.c +++ b/drivers/rtc/rtc-cmos.c @@ -394,25 +394,6 @@ static int cmos_alarm_irq_enable(struct device *dev, unsigned int enabled) return 0; } -static int cmos_update_irq_enable(struct device *dev, unsigned int enabled) -{ - struct cmos_rtc *cmos = dev_get_drvdata(dev); - unsigned long flags; - - if (!is_valid_irq(cmos->irq)) - return -EINVAL; - - spin_lock_irqsave(&rtc_lock, flags); - - if (enabled) - cmos_irq_enable(cmos, RTC_UIE); - else - cmos_irq_disable(cmos, RTC_UIE); - - spin_unlock_irqrestore(&rtc_lock, flags); - return 0; -} - #if defined(CONFIG_RTC_INTF_PROC) || defined(CONFIG_RTC_INTF_PROC_MODULE) static int cmos_procfs(struct device *dev, struct seq_file *seq) @@ -458,7 +439,6 @@ static const struct rtc_class_ops cmos_rtc_ops = { .set_alarm = cmos_set_alarm, .proc = cmos_procfs, .alarm_irq_enable = cmos_alarm_irq_enable, - .update_irq_enable = cmos_update_irq_enable, }; /*----------------------------------------------------------------*/ diff --git a/drivers/rtc/rtc-ds1511.c b/drivers/rtc/rtc-ds1511.c index 37268e97de49..3fffd708711f 100644 --- a/drivers/rtc/rtc-ds1511.c +++ b/drivers/rtc/rtc-ds1511.c @@ -397,29 +397,12 @@ static int ds1511_rtc_alarm_irq_enable(struct device *dev, unsigned int enabled) return 0; } -static int ds1511_rtc_update_irq_enable(struct device *dev, - unsigned int enabled) -{ - struct platform_device *pdev = to_platform_device(dev); - struct rtc_plat_data *pdata = platform_get_drvdata(pdev); - - if (pdata->irq <= 0) - return -EINVAL; - if (enabled) - pdata->irqen |= RTC_UF; - else - pdata->irqen &= ~RTC_UF; - ds1511_rtc_update_alarm(pdata); - return 0; -} - static const struct rtc_class_ops ds1511_rtc_ops = { .read_time = ds1511_rtc_read_time, .set_time = ds1511_rtc_set_time, .read_alarm = ds1511_rtc_read_alarm, .set_alarm = ds1511_rtc_set_alarm, .alarm_irq_enable = ds1511_rtc_alarm_irq_enable, - .update_irq_enable = ds1511_rtc_update_irq_enable, }; static ssize_t diff --git a/drivers/rtc/rtc-ds1553.c b/drivers/rtc/rtc-ds1553.c index ff432e2ca275..fee41b97c9e8 100644 --- a/drivers/rtc/rtc-ds1553.c +++ b/drivers/rtc/rtc-ds1553.c @@ -227,29 +227,12 @@ static int ds1553_rtc_alarm_irq_enable(struct device *dev, unsigned int enabled) return 0; } -static int ds1553_rtc_update_irq_enable(struct device *dev, - unsigned int enabled) -{ - struct platform_device *pdev = to_platform_device(dev); - struct rtc_plat_data *pdata = platform_get_drvdata(pdev); - - if (pdata->irq <= 0) - return -EINVAL; - if (enabled) - pdata->irqen |= 
RTC_UF; - else - pdata->irqen &= ~RTC_UF; - ds1553_rtc_update_alarm(pdata); - return 0; -} - static const struct rtc_class_ops ds1553_rtc_ops = { .read_time = ds1553_rtc_read_time, .set_time = ds1553_rtc_set_time, .read_alarm = ds1553_rtc_read_alarm, .set_alarm = ds1553_rtc_set_alarm, .alarm_irq_enable = ds1553_rtc_alarm_irq_enable, - .update_irq_enable = ds1553_rtc_update_irq_enable, }; static ssize_t ds1553_nvram_read(struct file *filp, struct kobject *kobj, diff --git a/drivers/rtc/rtc-ds3232.c b/drivers/rtc/rtc-ds3232.c index 950735415a7c..27b7bf672ac6 100644 --- a/drivers/rtc/rtc-ds3232.c +++ b/drivers/rtc/rtc-ds3232.c @@ -339,23 +339,6 @@ static int ds3232_alarm_irq_enable(struct device *dev, unsigned int enabled) return 0; } -static int ds3232_update_irq_enable(struct device *dev, unsigned int enabled) -{ - struct i2c_client *client = to_i2c_client(dev); - struct ds3232 *ds3232 = i2c_get_clientdata(client); - - if (client->irq <= 0) - return -EINVAL; - - if (enabled) - ds3232->rtc->irq_data |= RTC_UF; - else - ds3232->rtc->irq_data &= ~RTC_UF; - - ds3232_update_alarm(client); - return 0; -} - static irqreturn_t ds3232_irq(int irq, void *dev_id) { struct i2c_client *client = dev_id; @@ -406,7 +389,6 @@ static const struct rtc_class_ops ds3232_rtc_ops = { .read_alarm = ds3232_read_alarm, .set_alarm = ds3232_set_alarm, .alarm_irq_enable = ds3232_alarm_irq_enable, - .update_irq_enable = ds3232_update_irq_enable, }; static int __devinit ds3232_probe(struct i2c_client *client, diff --git a/drivers/rtc/rtc-jz4740.c b/drivers/rtc/rtc-jz4740.c index 2e16f72c9056..b6473631d182 100644 --- a/drivers/rtc/rtc-jz4740.c +++ b/drivers/rtc/rtc-jz4740.c @@ -168,12 +168,6 @@ static int jz4740_rtc_set_alarm(struct device *dev, struct rtc_wkalrm *alrm) return ret; } -static int jz4740_rtc_update_irq_enable(struct device *dev, unsigned int enable) -{ - struct jz4740_rtc *rtc = dev_get_drvdata(dev); - return jz4740_rtc_ctrl_set_bits(rtc, JZ_RTC_CTRL_1HZ_IRQ, enable); -} - static int jz4740_rtc_alarm_irq_enable(struct device *dev, unsigned int enable) { struct jz4740_rtc *rtc = dev_get_drvdata(dev); @@ -185,7 +179,6 @@ static struct rtc_class_ops jz4740_rtc_ops = { .set_mmss = jz4740_rtc_set_mmss, .read_alarm = jz4740_rtc_read_alarm, .set_alarm = jz4740_rtc_set_alarm, - .update_irq_enable = jz4740_rtc_update_irq_enable, .alarm_irq_enable = jz4740_rtc_alarm_irq_enable, }; diff --git a/drivers/rtc/rtc-mc13xxx.c b/drivers/rtc/rtc-mc13xxx.c index 5314b153bfba..c42006469559 100644 --- a/drivers/rtc/rtc-mc13xxx.c +++ b/drivers/rtc/rtc-mc13xxx.c @@ -282,12 +282,6 @@ static irqreturn_t mc13xxx_rtc_update_handler(int irq, void *dev) return IRQ_HANDLED; } -static int mc13xxx_rtc_update_irq_enable(struct device *dev, - unsigned int enabled) -{ - return mc13xxx_rtc_irq_enable(dev, enabled, MC13XXX_IRQ_1HZ); -} - static int mc13xxx_rtc_alarm_irq_enable(struct device *dev, unsigned int enabled) { @@ -300,7 +294,6 @@ static const struct rtc_class_ops mc13xxx_rtc_ops = { .read_alarm = mc13xxx_rtc_read_alarm, .set_alarm = mc13xxx_rtc_set_alarm, .alarm_irq_enable = mc13xxx_rtc_alarm_irq_enable, - .update_irq_enable = mc13xxx_rtc_update_irq_enable, }; static irqreturn_t mc13xxx_rtc_reset_handler(int irq, void *dev) diff --git a/drivers/rtc/rtc-mpc5121.c b/drivers/rtc/rtc-mpc5121.c index dfcdf0901d21..b40c1ff1ebc8 100644 --- a/drivers/rtc/rtc-mpc5121.c +++ b/drivers/rtc/rtc-mpc5121.c @@ -240,32 +240,12 @@ static int mpc5121_rtc_alarm_irq_enable(struct device *dev, return 0; } -static int mpc5121_rtc_update_irq_enable(struct 
device *dev, - unsigned int enabled) -{ - struct mpc5121_rtc_data *rtc = dev_get_drvdata(dev); - struct mpc5121_rtc_regs __iomem *regs = rtc->regs; - int val; - - val = in_8(®s->int_enable); - - if (enabled) - val = (val & ~0x8) | 0x1; - else - val &= ~0x1; - - out_8(®s->int_enable, val); - - return 0; -} - static const struct rtc_class_ops mpc5121_rtc_ops = { .read_time = mpc5121_rtc_read_time, .set_time = mpc5121_rtc_set_time, .read_alarm = mpc5121_rtc_read_alarm, .set_alarm = mpc5121_rtc_set_alarm, .alarm_irq_enable = mpc5121_rtc_alarm_irq_enable, - .update_irq_enable = mpc5121_rtc_update_irq_enable, }; static int __devinit mpc5121_rtc_probe(struct platform_device *op, diff --git a/drivers/rtc/rtc-mxc.c b/drivers/rtc/rtc-mxc.c index 0b06c1e03fd5..826ab64a8fa9 100644 --- a/drivers/rtc/rtc-mxc.c +++ b/drivers/rtc/rtc-mxc.c @@ -274,12 +274,6 @@ static int mxc_rtc_alarm_irq_enable(struct device *dev, unsigned int enabled) return 0; } -static int mxc_rtc_update_irq_enable(struct device *dev, unsigned int enabled) -{ - mxc_rtc_irq_enable(dev, RTC_1HZ_BIT, enabled); - return 0; -} - /* * This function reads the current RTC time into tm in Gregorian date. */ @@ -368,7 +362,6 @@ static struct rtc_class_ops mxc_rtc_ops = { .read_alarm = mxc_rtc_read_alarm, .set_alarm = mxc_rtc_set_alarm, .alarm_irq_enable = mxc_rtc_alarm_irq_enable, - .update_irq_enable = mxc_rtc_update_irq_enable, }; static int __init mxc_rtc_probe(struct platform_device *pdev) diff --git a/drivers/rtc/rtc-nuc900.c b/drivers/rtc/rtc-nuc900.c index ddb0857e15a4..781068d62f23 100644 --- a/drivers/rtc/rtc-nuc900.c +++ b/drivers/rtc/rtc-nuc900.c @@ -134,20 +134,6 @@ static void nuc900_rtc_bin2bcd(struct device *dev, struct rtc_time *settm, gettm->bcd_hour = bin2bcd(settm->tm_hour) << 16; } -static int nuc900_update_irq_enable(struct device *dev, unsigned int enabled) -{ - struct nuc900_rtc *rtc = dev_get_drvdata(dev); - - if (enabled) - __raw_writel(__raw_readl(rtc->rtc_reg + REG_RTC_RIER)| - (TICKINTENB), rtc->rtc_reg + REG_RTC_RIER); - else - __raw_writel(__raw_readl(rtc->rtc_reg + REG_RTC_RIER)& - (~TICKINTENB), rtc->rtc_reg + REG_RTC_RIER); - - return 0; -} - static int nuc900_alarm_irq_enable(struct device *dev, unsigned int enabled) { struct nuc900_rtc *rtc = dev_get_drvdata(dev); @@ -234,7 +220,6 @@ static struct rtc_class_ops nuc900_rtc_ops = { .read_alarm = nuc900_rtc_read_alarm, .set_alarm = nuc900_rtc_set_alarm, .alarm_irq_enable = nuc900_alarm_irq_enable, - .update_irq_enable = nuc900_update_irq_enable, }; static int __devinit nuc900_rtc_probe(struct platform_device *pdev) diff --git a/drivers/rtc/rtc-pcap.c b/drivers/rtc/rtc-pcap.c index 25c0b3fd44f1..a633abc42896 100644 --- a/drivers/rtc/rtc-pcap.c +++ b/drivers/rtc/rtc-pcap.c @@ -131,18 +131,12 @@ static int pcap_rtc_alarm_irq_enable(struct device *dev, unsigned int en) return pcap_rtc_irq_enable(dev, PCAP_IRQ_TODA, en); } -static int pcap_rtc_update_irq_enable(struct device *dev, unsigned int en) -{ - return pcap_rtc_irq_enable(dev, PCAP_IRQ_1HZ, en); -} - static const struct rtc_class_ops pcap_rtc_ops = { .read_time = pcap_rtc_read_time, .read_alarm = pcap_rtc_read_alarm, .set_alarm = pcap_rtc_set_alarm, .set_mmss = pcap_rtc_set_mmss, .alarm_irq_enable = pcap_rtc_alarm_irq_enable, - .update_irq_enable = pcap_rtc_update_irq_enable, }; static int __devinit pcap_rtc_probe(struct platform_device *pdev) diff --git a/drivers/rtc/rtc-pcf50633.c b/drivers/rtc/rtc-pcf50633.c index 16edf94ab42f..f90c574f9d05 100644 --- a/drivers/rtc/rtc-pcf50633.c +++ 
b/drivers/rtc/rtc-pcf50633.c @@ -106,25 +106,6 @@ pcf50633_rtc_alarm_irq_enable(struct device *dev, unsigned int enabled) return 0; } -static int -pcf50633_rtc_update_irq_enable(struct device *dev, unsigned int enabled) -{ - struct pcf50633_rtc *rtc = dev_get_drvdata(dev); - int err; - - if (enabled) - err = pcf50633_irq_unmask(rtc->pcf, PCF50633_IRQ_SECOND); - else - err = pcf50633_irq_mask(rtc->pcf, PCF50633_IRQ_SECOND); - - if (err < 0) - return err; - - rtc->second_enabled = enabled; - - return 0; -} - static int pcf50633_rtc_read_time(struct device *dev, struct rtc_time *tm) { struct pcf50633_rtc *rtc; @@ -262,8 +243,7 @@ static struct rtc_class_ops pcf50633_rtc_ops = { .set_time = pcf50633_rtc_set_time, .read_alarm = pcf50633_rtc_read_alarm, .set_alarm = pcf50633_rtc_set_alarm, - .alarm_irq_enable = pcf50633_rtc_alarm_irq_enable, - .update_irq_enable = pcf50633_rtc_update_irq_enable, + .alarm_irq_enable = pcf50633_rtc_alarm_irq_enable, }; static void pcf50633_rtc_irq(int irq, void *data) diff --git a/drivers/rtc/rtc-pxa.c b/drivers/rtc/rtc-pxa.c index a1fdc802598a..fc9f4991574b 100644 --- a/drivers/rtc/rtc-pxa.c +++ b/drivers/rtc/rtc-pxa.c @@ -224,21 +224,6 @@ static int pxa_alarm_irq_enable(struct device *dev, unsigned int enabled) return 0; } -static int pxa_update_irq_enable(struct device *dev, unsigned int enabled) -{ - struct pxa_rtc *pxa_rtc = dev_get_drvdata(dev); - - spin_lock_irq(&pxa_rtc->lock); - - if (enabled) - rtsr_set_bits(pxa_rtc, RTSR_HZE); - else - rtsr_clear_bits(pxa_rtc, RTSR_HZE); - - spin_unlock_irq(&pxa_rtc->lock); - return 0; -} - static int pxa_rtc_read_time(struct device *dev, struct rtc_time *tm) { struct pxa_rtc *pxa_rtc = dev_get_drvdata(dev); @@ -320,7 +305,6 @@ static const struct rtc_class_ops pxa_rtc_ops = { .read_alarm = pxa_rtc_read_alarm, .set_alarm = pxa_rtc_set_alarm, .alarm_irq_enable = pxa_alarm_irq_enable, - .update_irq_enable = pxa_update_irq_enable, .proc = pxa_rtc_proc, }; diff --git a/drivers/rtc/rtc-stmp3xxx.c b/drivers/rtc/rtc-stmp3xxx.c index 7e7d0c806f2d..572e9534b591 100644 --- a/drivers/rtc/rtc-stmp3xxx.c +++ b/drivers/rtc/rtc-stmp3xxx.c @@ -115,19 +115,6 @@ static int stmp3xxx_alarm_irq_enable(struct device *dev, unsigned int enabled) return 0; } -static int stmp3xxx_update_irq_enable(struct device *dev, unsigned int enabled) -{ - struct stmp3xxx_rtc_data *rtc_data = dev_get_drvdata(dev); - - if (enabled) - stmp3xxx_setl(BM_RTC_CTRL_ONEMSEC_IRQ_EN, - rtc_data->io + HW_RTC_CTRL); - else - stmp3xxx_clearl(BM_RTC_CTRL_ONEMSEC_IRQ_EN, - rtc_data->io + HW_RTC_CTRL); - return 0; -} - static int stmp3xxx_rtc_read_alarm(struct device *dev, struct rtc_wkalrm *alm) { struct stmp3xxx_rtc_data *rtc_data = dev_get_drvdata(dev); @@ -149,8 +136,6 @@ static int stmp3xxx_rtc_set_alarm(struct device *dev, struct rtc_wkalrm *alm) static struct rtc_class_ops stmp3xxx_rtc_ops = { .alarm_irq_enable = stmp3xxx_alarm_irq_enable, - .update_irq_enable = - stmp3xxx_update_irq_enable, .read_time = stmp3xxx_rtc_gettime, .set_mmss = stmp3xxx_rtc_set_mmss, .read_alarm = stmp3xxx_rtc_read_alarm, diff --git a/drivers/rtc/rtc-twl.c b/drivers/rtc/rtc-twl.c index ed1b86828124..f9a2799c44d6 100644 --- a/drivers/rtc/rtc-twl.c +++ b/drivers/rtc/rtc-twl.c @@ -213,18 +213,6 @@ static int twl_rtc_alarm_irq_enable(struct device *dev, unsigned enabled) return ret; } -static int twl_rtc_update_irq_enable(struct device *dev, unsigned enabled) -{ - int ret; - - if (enabled) - ret = set_rtc_irq_bit(BIT_RTC_INTERRUPTS_REG_IT_TIMER_M); - else - ret = 
mask_rtc_irq_bit(BIT_RTC_INTERRUPTS_REG_IT_TIMER_M);
-
-	return ret;
-}
-
 /*
  * Gets current TWL RTC time and date parameters.
  *
@@ -433,7 +421,6 @@ static struct rtc_class_ops twl_rtc_ops = {
 	.read_alarm	= twl_rtc_read_alarm,
 	.set_alarm	= twl_rtc_set_alarm,
 	.alarm_irq_enable = twl_rtc_alarm_irq_enable,
-	.update_irq_enable = twl_rtc_update_irq_enable,
 };
 
 /*----------------------------------------------------------------------*/
diff --git a/drivers/rtc/rtc-wm831x.c b/drivers/rtc/rtc-wm831x.c
index 82931dc65c0b..bdc909bd56da 100644
--- a/drivers/rtc/rtc-wm831x.c
+++ b/drivers/rtc/rtc-wm831x.c
@@ -315,21 +315,6 @@ static int wm831x_rtc_alarm_irq_enable(struct device *dev,
 	return wm831x_rtc_stop_alarm(wm831x_rtc);
 }
 
-static int wm831x_rtc_update_irq_enable(struct device *dev,
-					unsigned int enabled)
-{
-	struct wm831x_rtc *wm831x_rtc = dev_get_drvdata(dev);
-	int val;
-
-	if (enabled)
-		val = 1 << WM831X_RTC_PINT_FREQ_SHIFT;
-	else
-		val = 0;
-
-	return wm831x_set_bits(wm831x_rtc->wm831x, WM831X_RTC_CONTROL,
-			       WM831X_RTC_PINT_FREQ_MASK, val);
-}
-
 static irqreturn_t wm831x_alm_irq(int irq, void *data)
 {
 	struct wm831x_rtc *wm831x_rtc = data;
@@ -354,7 +339,6 @@ static const struct rtc_class_ops wm831x_rtc_ops = {
 	.read_alarm = wm831x_rtc_readalarm,
 	.set_alarm = wm831x_rtc_setalarm,
 	.alarm_irq_enable = wm831x_rtc_alarm_irq_enable,
-	.update_irq_enable = wm831x_rtc_update_irq_enable,
 };
 
 #ifdef CONFIG_PM
diff --git a/drivers/rtc/rtc-wm8350.c b/drivers/rtc/rtc-wm8350.c
index 3d0dc76b38af..66421426e404 100644
--- a/drivers/rtc/rtc-wm8350.c
+++ b/drivers/rtc/rtc-wm8350.c
@@ -302,26 +302,6 @@ static int wm8350_rtc_setalarm(struct device *dev, struct rtc_wkalrm *alrm)
 	return ret;
 }
 
-static int wm8350_rtc_update_irq_enable(struct device *dev,
-					unsigned int enabled)
-{
-	struct wm8350 *wm8350 = dev_get_drvdata(dev);
-
-	/* Suppress duplicate changes since genirq nests enable and
-	 * disable calls. */
-	if (enabled == wm8350->rtc.update_enabled)
-		return 0;
-
-	if (enabled)
-		wm8350_unmask_irq(wm8350, WM8350_IRQ_RTC_SEC);
-	else
-		wm8350_mask_irq(wm8350, WM8350_IRQ_RTC_SEC);
-
-	wm8350->rtc.update_enabled = enabled;
-
-	return 0;
-}
-
 static irqreturn_t wm8350_rtc_alarm_handler(int irq, void *data)
 {
 	struct wm8350 *wm8350 = data;
@@ -357,7 +337,6 @@ static const struct rtc_class_ops wm8350_rtc_ops = {
 	.read_alarm = wm8350_rtc_readalarm,
 	.set_alarm = wm8350_rtc_setalarm,
 	.alarm_irq_enable = wm8350_rtc_alarm_irq_enable,
-	.update_irq_enable = wm8350_rtc_update_irq_enable,
 };
 
 #ifdef CONFIG_PM
diff --git a/include/linux/rtc.h b/include/linux/rtc.h
index 741a51cc4460..2ca7e8a78060 100644
--- a/include/linux/rtc.h
+++ b/include/linux/rtc.h
@@ -149,7 +149,6 @@ struct rtc_class_ops {
 	int (*set_mmss)(struct device *, unsigned long secs);
 	int (*read_callback)(struct device *, int data);
 	int (*alarm_irq_enable)(struct device *, unsigned int enabled);
-	int (*update_irq_enable)(struct device *, unsigned int enabled);
 };
 
 #define RTC_DEVICE_NAME_SIZE 20

From e428c6a2772bcf6b022baf7c8267cca3634c0c3e Mon Sep 17 00:00:00 2001
From: John Stultz
Date: Fri, 4 Feb 2011 16:16:12 -0800
Subject: [PATCH 569/706] RTC: Clean out UIE ioctl implementations

With the generic RTC rework, the UIE mode irqs are handled
in the generic layer, and only hardware specific ioctls get
passed down to the rtc driver layer.

So this patch removes the UIE mode ioctl handling in the rtc
driver layer, which never gets used.
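What remains for a driver after this cleanup is hardware-specific
command handling only; everything else falls through to the core. A
sketch of the resulting pattern, using a hypothetical foo_rtc driver
(foo_rtc_vl_read() is invented here for illustration):

static int foo_rtc_ioctl(struct device *dev, unsigned int cmd,
			 unsigned long arg)
{
	switch (cmd) {
	case RTC_VL_READ:	/* a genuinely hardware-specific command */
		return foo_rtc_vl_read(dev, (int __user *)arg);
	default:
		/* RTC_UIE_ON/RTC_UIE_OFF never reach the driver now;
		 * rtc-dev handles them in the generic layer. */
		return -ENOIOCTLCMD;
	}
}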
CC: Thomas Gleixner CC: Alessandro Zummo CC: Marcelo Roberto Jimenez CC: rtc-linux@googlegroups.com Signed-off-by: John Stultz --- drivers/rtc/rtc-at91rm9200.c | 28 ------------------- drivers/rtc/rtc-at91sam9.c | 28 ------------------- drivers/rtc/rtc-bfin.c | 27 ------------------- drivers/rtc/rtc-davinci.c | 4 --- drivers/rtc/rtc-omap.c | 39 --------------------------- drivers/rtc/rtc-pl030.c | 6 ----- drivers/rtc/rtc-rs5c372.c | 52 ------------------------------------ drivers/rtc/rtc-sa1100.c | 19 ------------- drivers/rtc/rtc-sh.c | 22 --------------- 9 files changed, 225 deletions(-) diff --git a/drivers/rtc/rtc-at91rm9200.c b/drivers/rtc/rtc-at91rm9200.c index 26d1cf5d19ae..518a76ec71ca 100644 --- a/drivers/rtc/rtc-at91rm9200.c +++ b/drivers/rtc/rtc-at91rm9200.c @@ -183,33 +183,6 @@ static int at91_rtc_setalarm(struct device *dev, struct rtc_wkalrm *alrm) return 0; } -/* - * Handle commands from user-space - */ -static int at91_rtc_ioctl(struct device *dev, unsigned int cmd, - unsigned long arg) -{ - int ret = 0; - - pr_debug("%s(): cmd=%08x, arg=%08lx.\n", __func__, cmd, arg); - - /* important: scrub old status before enabling IRQs */ - switch (cmd) { - case RTC_UIE_OFF: /* update off */ - at91_sys_write(AT91_RTC_IDR, AT91_RTC_SECEV); - break; - case RTC_UIE_ON: /* update on */ - at91_sys_write(AT91_RTC_SCCR, AT91_RTC_SECEV); - at91_sys_write(AT91_RTC_IER, AT91_RTC_SECEV); - break; - default: - ret = -ENOIOCTLCMD; - break; - } - - return ret; -} - static int at91_rtc_alarm_irq_enable(struct device *dev, unsigned int enabled) { pr_debug("%s(): cmd=%08x\n", __func__, enabled); @@ -269,7 +242,6 @@ static irqreturn_t at91_rtc_interrupt(int irq, void *dev_id) } static const struct rtc_class_ops at91_rtc_ops = { - .ioctl = at91_rtc_ioctl, .read_time = at91_rtc_readtime, .set_time = at91_rtc_settime, .read_alarm = at91_rtc_readalarm, diff --git a/drivers/rtc/rtc-at91sam9.c b/drivers/rtc/rtc-at91sam9.c index 5469c52cba3d..a3ad957507dc 100644 --- a/drivers/rtc/rtc-at91sam9.c +++ b/drivers/rtc/rtc-at91sam9.c @@ -216,33 +216,6 @@ static int at91_rtc_setalarm(struct device *dev, struct rtc_wkalrm *alrm) return 0; } -/* - * Handle commands from user-space - */ -static int at91_rtc_ioctl(struct device *dev, unsigned int cmd, - unsigned long arg) -{ - struct sam9_rtc *rtc = dev_get_drvdata(dev); - int ret = 0; - u32 mr = rtt_readl(rtc, MR); - - dev_dbg(dev, "ioctl: cmd=%08x, arg=%08lx, mr %08x\n", cmd, arg, mr); - - switch (cmd) { - case RTC_UIE_OFF: /* update off */ - rtt_writel(rtc, MR, mr & ~AT91_RTT_RTTINCIEN); - break; - case RTC_UIE_ON: /* update on */ - rtt_writel(rtc, MR, mr | AT91_RTT_RTTINCIEN); - break; - default: - ret = -ENOIOCTLCMD; - break; - } - - return ret; -} - static int at91_rtc_alarm_irq_enable(struct device *dev, unsigned int enabled) { struct sam9_rtc *rtc = dev_get_drvdata(dev); @@ -303,7 +276,6 @@ static irqreturn_t at91_rtc_interrupt(int irq, void *_rtc) } static const struct rtc_class_ops at91_rtc_ops = { - .ioctl = at91_rtc_ioctl, .read_time = at91_rtc_readtime, .set_time = at91_rtc_settime, .read_alarm = at91_rtc_readalarm, diff --git a/drivers/rtc/rtc-bfin.c b/drivers/rtc/rtc-bfin.c index 17971d93354d..ca9cff85ab8a 100644 --- a/drivers/rtc/rtc-bfin.c +++ b/drivers/rtc/rtc-bfin.c @@ -240,32 +240,6 @@ static void bfin_rtc_int_set_alarm(struct bfin_rtc *rtc) */ bfin_rtc_int_set(rtc->rtc_alarm.tm_yday == -1 ? 
RTC_ISTAT_ALARM : RTC_ISTAT_ALARM_DAY); } -static int bfin_rtc_ioctl(struct device *dev, unsigned int cmd, unsigned long arg) -{ - struct bfin_rtc *rtc = dev_get_drvdata(dev); - int ret = 0; - - dev_dbg_stamp(dev); - - bfin_rtc_sync_pending(dev); - - switch (cmd) { - case RTC_UIE_ON: - dev_dbg_stamp(dev); - bfin_rtc_int_set(RTC_ISTAT_SEC); - break; - case RTC_UIE_OFF: - dev_dbg_stamp(dev); - bfin_rtc_int_clear(~RTC_ISTAT_SEC); - break; - - default: - dev_dbg_stamp(dev); - ret = -ENOIOCTLCMD; - } - - return ret; -} static int bfin_rtc_alarm_irq_enable(struct device *dev, unsigned int enabled) { @@ -358,7 +332,6 @@ static int bfin_rtc_proc(struct device *dev, struct seq_file *seq) } static struct rtc_class_ops bfin_rtc_ops = { - .ioctl = bfin_rtc_ioctl, .read_time = bfin_rtc_read_time, .set_time = bfin_rtc_set_time, .read_alarm = bfin_rtc_read_alarm, diff --git a/drivers/rtc/rtc-davinci.c b/drivers/rtc/rtc-davinci.c index dfd98a235ad1..8d46838dff8a 100644 --- a/drivers/rtc/rtc-davinci.c +++ b/drivers/rtc/rtc-davinci.c @@ -231,10 +231,6 @@ davinci_rtc_ioctl(struct device *dev, unsigned int cmd, unsigned long arg) case RTC_WIE_OFF: rtc_ctrl &= ~PRTCSS_RTC_CTRL_WEN; break; - case RTC_UIE_OFF: - case RTC_UIE_ON: - ret = -ENOTTY; - break; default: ret = -ENOIOCTLCMD; } diff --git a/drivers/rtc/rtc-omap.c b/drivers/rtc/rtc-omap.c index b4dbf3a319b3..de0dd7b1f146 100644 --- a/drivers/rtc/rtc-omap.c +++ b/drivers/rtc/rtc-omap.c @@ -135,44 +135,6 @@ static irqreturn_t rtc_irq(int irq, void *rtc) return IRQ_HANDLED; } -#ifdef CONFIG_RTC_INTF_DEV - -static int -omap_rtc_ioctl(struct device *dev, unsigned int cmd, unsigned long arg) -{ - u8 reg; - - switch (cmd) { - case RTC_UIE_OFF: - case RTC_UIE_ON: - break; - default: - return -ENOIOCTLCMD; - } - - local_irq_disable(); - rtc_wait_not_busy(); - reg = rtc_read(OMAP_RTC_INTERRUPTS_REG); - switch (cmd) { - /* UIE = Update Interrupt Enable (1/second) */ - case RTC_UIE_OFF: - reg &= ~OMAP_RTC_INTERRUPTS_IT_TIMER; - break; - case RTC_UIE_ON: - reg |= OMAP_RTC_INTERRUPTS_IT_TIMER; - break; - } - rtc_wait_not_busy(); - rtc_write(reg, OMAP_RTC_INTERRUPTS_REG); - local_irq_enable(); - - return 0; -} - -#else -#define omap_rtc_ioctl NULL -#endif - static int omap_rtc_alarm_irq_enable(struct device *dev, unsigned int enabled) { u8 reg; @@ -313,7 +275,6 @@ static int omap_rtc_set_alarm(struct device *dev, struct rtc_wkalrm *alm) } static struct rtc_class_ops omap_rtc_ops = { - .ioctl = omap_rtc_ioctl, .read_time = omap_rtc_read_time, .set_time = omap_rtc_set_time, .read_alarm = omap_rtc_read_alarm, diff --git a/drivers/rtc/rtc-pl030.c b/drivers/rtc/rtc-pl030.c index bbdb2f02798a..d554368c9f57 100644 --- a/drivers/rtc/rtc-pl030.c +++ b/drivers/rtc/rtc-pl030.c @@ -35,11 +35,6 @@ static irqreturn_t pl030_interrupt(int irq, void *dev_id) return IRQ_HANDLED; } -static int pl030_ioctl(struct device *dev, unsigned int cmd, unsigned long arg) -{ - return -ENOIOCTLCMD; -} - static int pl030_read_alarm(struct device *dev, struct rtc_wkalrm *alrm) { struct pl030_rtc *rtc = dev_get_drvdata(dev); @@ -96,7 +91,6 @@ static int pl030_set_time(struct device *dev, struct rtc_time *tm) } static const struct rtc_class_ops pl030_ops = { - .ioctl = pl030_ioctl, .read_time = pl030_read_time, .set_time = pl030_set_time, .read_alarm = pl030_read_alarm, diff --git a/drivers/rtc/rtc-rs5c372.c b/drivers/rtc/rtc-rs5c372.c index 6aaa1550e3b1..85c1b848dd72 100644 --- a/drivers/rtc/rtc-rs5c372.c +++ b/drivers/rtc/rtc-rs5c372.c @@ -281,57 +281,6 @@ static int rs5c372_rtc_set_time(struct device 
*dev, struct rtc_time *tm) return rs5c372_set_datetime(to_i2c_client(dev), tm); } -#if defined(CONFIG_RTC_INTF_DEV) || defined(CONFIG_RTC_INTF_DEV_MODULE) - -static int -rs5c_rtc_ioctl(struct device *dev, unsigned int cmd, unsigned long arg) -{ - struct i2c_client *client = to_i2c_client(dev); - struct rs5c372 *rs5c = i2c_get_clientdata(client); - unsigned char buf; - int status, addr; - - buf = rs5c->regs[RS5C_REG_CTRL1]; - switch (cmd) { - case RTC_UIE_OFF: - case RTC_UIE_ON: - /* some 327a modes use a different IRQ pin for 1Hz irqs */ - if (rs5c->type == rtc_rs5c372a - && (buf & RS5C372A_CTRL1_SL1)) - return -ENOIOCTLCMD; - default: - return -ENOIOCTLCMD; - } - - status = rs5c_get_regs(rs5c); - if (status < 0) - return status; - - addr = RS5C_ADDR(RS5C_REG_CTRL1); - switch (cmd) { - case RTC_UIE_OFF: /* update off */ - buf &= ~RS5C_CTRL1_CT_MASK; - break; - case RTC_UIE_ON: /* update on */ - buf &= ~RS5C_CTRL1_CT_MASK; - buf |= RS5C_CTRL1_CT4; - break; - } - - if (i2c_smbus_write_byte_data(client, addr, buf) < 0) { - printk(KERN_WARNING "%s: can't update alarm\n", - rs5c->rtc->name); - status = -EIO; - } else - rs5c->regs[RS5C_REG_CTRL1] = buf; - - return status; -} - -#else -#define rs5c_rtc_ioctl NULL -#endif - static int rs5c_rtc_alarm_irq_enable(struct device *dev, unsigned int enabled) { @@ -480,7 +429,6 @@ static int rs5c372_rtc_proc(struct device *dev, struct seq_file *seq) static const struct rtc_class_ops rs5c372_rtc_ops = { .proc = rs5c372_rtc_proc, - .ioctl = rs5c_rtc_ioctl, .read_time = rs5c372_rtc_read_time, .set_time = rs5c372_rtc_set_time, .read_alarm = rs5c_read_alarm, diff --git a/drivers/rtc/rtc-sa1100.c b/drivers/rtc/rtc-sa1100.c index d1a2b0bc3b24..a9189337a2c5 100644 --- a/drivers/rtc/rtc-sa1100.c +++ b/drivers/rtc/rtc-sa1100.c @@ -293,24 +293,6 @@ static void sa1100_rtc_release(struct device *dev) } -static int sa1100_rtc_ioctl(struct device *dev, unsigned int cmd, - unsigned long arg) -{ - switch (cmd) { - case RTC_UIE_OFF: - spin_lock_irq(&sa1100_rtc_lock); - RTSR &= ~RTSR_HZE; - spin_unlock_irq(&sa1100_rtc_lock); - return 0; - case RTC_UIE_ON: - spin_lock_irq(&sa1100_rtc_lock); - RTSR |= RTSR_HZE; - spin_unlock_irq(&sa1100_rtc_lock); - return 0; - } - return -ENOIOCTLCMD; -} - static int sa1100_rtc_alarm_irq_enable(struct device *dev, unsigned int enabled) { spin_lock_irq(&sa1100_rtc_lock); @@ -386,7 +368,6 @@ static const struct rtc_class_ops sa1100_rtc_ops = { .open = sa1100_rtc_open, .read_callback = sa1100_rtc_read_callback, .release = sa1100_rtc_release, - .ioctl = sa1100_rtc_ioctl, .read_time = sa1100_rtc_read_time, .set_time = sa1100_rtc_set_time, .read_alarm = sa1100_rtc_read_alarm, diff --git a/drivers/rtc/rtc-sh.c b/drivers/rtc/rtc-sh.c index 148544979a54..e55dc1ac83ab 100644 --- a/drivers/rtc/rtc-sh.c +++ b/drivers/rtc/rtc-sh.c @@ -344,27 +344,6 @@ static inline void sh_rtc_setcie(struct device *dev, unsigned int enable) spin_unlock_irq(&rtc->lock); } -static int sh_rtc_ioctl(struct device *dev, unsigned int cmd, unsigned long arg) -{ - struct sh_rtc *rtc = dev_get_drvdata(dev); - unsigned int ret = 0; - - switch (cmd) { - case RTC_UIE_OFF: - rtc->periodic_freq &= ~PF_OXS; - sh_rtc_setcie(dev, 0); - break; - case RTC_UIE_ON: - rtc->periodic_freq |= PF_OXS; - sh_rtc_setcie(dev, 1); - break; - default: - ret = -ENOIOCTLCMD; - } - - return ret; -} - static int sh_rtc_alarm_irq_enable(struct device *dev, unsigned int enabled) { sh_rtc_setaie(dev, enabled); @@ -598,7 +577,6 @@ static int sh_rtc_set_alarm(struct device *dev, struct rtc_wkalrm *wkalrm) } 
static struct rtc_class_ops sh_rtc_ops = {
-	.ioctl		= sh_rtc_ioctl,
 	.read_time	= sh_rtc_read_time,
 	.set_time	= sh_rtc_set_time,
 	.read_alarm	= sh_rtc_read_alarm,

From bca8521c551afcd926bdc8f814ebaefcb8215c57 Mon Sep 17 00:00:00 2001
From: Marcelo Roberto Jimenez
Date: Fri, 11 Feb 2011 11:50:24 -0200
Subject: [PATCH 570/706] RTC: Include information about UIE and PIE in RTC
 driver proc.

Generic RTC code is always able to provide the necessary information
about update and periodic interrupts. This patch adds such information
to the proc interface.

CC: Thomas Gleixner
CC: Alessandro Zummo
CC: Marcelo Roberto Jimenez
CC: rtc-linux@googlegroups.com
Signed-off-by: Marcelo Roberto Jimenez
Signed-off-by: John Stultz
---
 drivers/rtc/rtc-proc.c | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/drivers/rtc/rtc-proc.c b/drivers/rtc/rtc-proc.c
index 242bbf86c74a..0a59fda5c09d 100644
--- a/drivers/rtc/rtc-proc.c
+++ b/drivers/rtc/rtc-proc.c
@@ -69,6 +69,14 @@ static int rtc_proc_show(struct seq_file *seq, void *offset)
 			alrm.enabled ? "yes" : "no");
 		seq_printf(seq, "alrm_pending\t: %s\n",
 			alrm.pending ? "yes" : "no");
+		seq_printf(seq, "update IRQ enabled\t: %s\n",
+			(rtc->uie_rtctimer.enabled) ? "yes" : "no");
+		seq_printf(seq, "periodic IRQ enabled\t: %s\n",
+			(rtc->pie_enabled) ? "yes" : "no");
+		seq_printf(seq, "periodic IRQ frequency\t: %d\n",
+			rtc->irq_freq);
+		seq_printf(seq, "max user IRQ frequency\t: %d\n",
+			rtc->max_user_freq);
 	}
 
 	seq_printf(seq, "24hr\t\t: yes\n");

From 4cebe7aadc9ee8e7b44857b7aba3a878870cef65 Mon Sep 17 00:00:00 2001
From: Marcelo Roberto Jimenez
Date: Mon, 7 Feb 2011 19:16:06 -0200
Subject: [PATCH 571/706] RTC: Remove UIE and PIE information from the sa1100
 driver proc.

This patch removes the UIE and PIE information that is now being
supplied directly in the generic RTC code.

CC: Thomas Gleixner
CC: Alessandro Zummo
CC: Marcelo Roberto Jimenez
CC: rtc-linux@googlegroups.com
Signed-off-by: Marcelo Roberto Jimenez
Signed-off-by: John Stultz
---
 drivers/rtc/rtc-sa1100.c | 11 ++---------
 1 file changed, 2 insertions(+), 9 deletions(-)

diff --git a/drivers/rtc/rtc-sa1100.c b/drivers/rtc/rtc-sa1100.c
index a9189337a2c5..41f62ca68dc4 100644
--- a/drivers/rtc/rtc-sa1100.c
+++ b/drivers/rtc/rtc-sa1100.c
@@ -351,15 +351,8 @@ static int sa1100_rtc_set_alarm(struct device *dev, struct rtc_wkalrm *alrm)
 
 static int sa1100_rtc_proc(struct device *dev, struct seq_file *seq)
 {
-	struct rtc_device *rtc = (struct rtc_device *)dev;
-
-	seq_printf(seq, "trim/divider\t: 0x%08x\n", (u32) RTTR);
-	seq_printf(seq, "update_IRQ\t: %s\n",
-			(RTSR & RTSR_HZE) ? "yes" : "no");
-	seq_printf(seq, "periodic_IRQ\t: %s\n",
-			(OIER & OIER_E1) ? "yes" : "no");
-	seq_printf(seq, "periodic_freq\t: %d\n", rtc->irq_freq);
-	seq_printf(seq, "RTSR\t\t: 0x%08x\n", (u32)RTSR);
+	seq_printf(seq, "trim/divider\t\t: 0x%08x\n", (u32) RTTR);
+	seq_printf(seq, "RTSR\t\t\t: 0x%08x\n", (u32)RTSR);
 
 	return 0;
 }

From a417493ef916b8b6d1782a589766a713c553842e Mon Sep 17 00:00:00 2001
From: Marcelo Roberto Jimenez
Date: Mon, 7 Feb 2011 19:16:07 -0200
Subject: [PATCH 572/706] RTC: Fix the cross interrupt issue on rtc-test.

The rtc-test driver is meant to provide test/debug code for the RTC
subsystem. It simulates specific interrupts by echoing to its sysfs
interface: the update, alarm and periodic interrupts.
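Injecting one of them from user space looks roughly like this
(sketch only; the sysfs path assumes the first rtc-test platform
device and may differ on a given setup):

#include <fcntl.h>
#include <unistd.h>

/* Fire a simulated periodic interrupt in the rtc-test driver. */
static void inject_tick(void)
{
	int fd = open("/sys/devices/platform/rtc-test.0/irq", O_WRONLY);

	if (fd >= 0) {
		write(fd, "tick", 4);	/* "alarm" and "update" also work */
		close(fd);
	}
}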
As a side effect of the new implementation, any interrupt generated in the rtc-test driver would trigger the same code path in the generic code, and thus the distinction among interrupts gets lost. This patch preserves the previous behaviour of the rtc-test driver, where e.g. an update interrupt would not trigger an alarm or periodic interrupt, and vice-versa. In real world RTC drivers, this is not an issue, but in the rtc-test driver it may be interesting to distinguish these interrupts for testing purposes. CC: Thomas Gleixner CC: Alessandro Zummo CC: Marcelo Roberto Jimenez CC: rtc-linux@googlegroups.com Signed-off-by: Marcelo Roberto Jimenez Signed-off-by: John Stultz --- drivers/rtc/rtc-test.c | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/drivers/rtc/rtc-test.c b/drivers/rtc/rtc-test.c index a82d6fe97076..7e96254bd365 100644 --- a/drivers/rtc/rtc-test.c +++ b/drivers/rtc/rtc-test.c @@ -78,11 +78,16 @@ static ssize_t test_irq_store(struct device *dev, struct rtc_device *rtc = platform_get_drvdata(plat_dev); retval = count; - if (strncmp(buf, "tick", 4) == 0) + if (strncmp(buf, "tick", 4) == 0 && rtc->pie_enabled) rtc_update_irq(rtc, 1, RTC_PF | RTC_IRQF); - else if (strncmp(buf, "alarm", 5) == 0) - rtc_update_irq(rtc, 1, RTC_AF | RTC_IRQF); - else if (strncmp(buf, "update", 6) == 0) + else if (strncmp(buf, "alarm", 5) == 0) { + struct rtc_wkalrm alrm; + int err = rtc_read_alarm(rtc, &alrm); + + if (!err && alrm.enabled) + rtc_update_irq(rtc, 1, RTC_AF | RTC_IRQF); + + } else if (strncmp(buf, "update", 6) == 0 && rtc->uie_rtctimer.enabled) rtc_update_irq(rtc, 1, RTC_UF | RTC_IRQF); else retval = -EINVAL; From 416f0e8056f757c119dc3d4fa434a62b65c8272b Mon Sep 17 00:00:00 2001 From: Marcelo Roberto Jimenez Date: Mon, 7 Feb 2011 19:16:08 -0200 Subject: [PATCH 573/706] RTC: sa1100: Update the sa1100 RTC driver. Since PIE interrupts are now emulated, this patch removes the previous code that used the hardware counters. The removal of read_callback() also fixes a wrong user space behaviour of this driver, which was not returning the right value to read(). [john.stultz: Merge fixups] CC: Thomas Gleixner CC: Alessandro Zummo CC: Marcelo Roberto Jimenez CC: rtc-linux@googlegroups.com Signed-off-by: Marcelo Roberto Jimenez Signed-off-by: John Stultz --- drivers/rtc/rtc-sa1100.c | 111 ++------------------------------------- 1 file changed, 3 insertions(+), 108 deletions(-) diff --git a/drivers/rtc/rtc-sa1100.c b/drivers/rtc/rtc-sa1100.c index 41f62ca68dc4..0b40bb88a884 100644 --- a/drivers/rtc/rtc-sa1100.c +++ b/drivers/rtc/rtc-sa1100.c @@ -43,7 +43,6 @@ #define RTC_DEF_TRIM 0 static const unsigned long RTC_FREQ = 1024; -static unsigned long timer_freq; static struct rtc_time rtc_alarm; static DEFINE_SPINLOCK(sa1100_rtc_lock); @@ -156,97 +155,11 @@ static irqreturn_t sa1100_rtc_interrupt(int irq, void *dev_id) return IRQ_HANDLED; } -static int sa1100_irq_set_freq(struct device *dev, int freq) -{ - if (freq < 1 || freq > timer_freq) { - return -EINVAL; - } else { - struct rtc_device *rtc = (struct rtc_device *)dev; - - rtc->irq_freq = freq; - - return 0; - } -} - -static int rtc_timer1_count; - -static inline int sa1100_timer1_retrigger(struct rtc_device *rtc) -{ - unsigned long diff; - unsigned long period = timer_freq / rtc->irq_freq; - - spin_lock_irq(&sa1100_rtc_lock); - - do { - OSMR1 += period; - diff = OSMR1 - OSCR; - /* If OSCR > OSMR1, diff is a very large number (unsigned - * math). This means we have a lost interrupt. 
*/ - } while (diff > period); - OIER |= OIER_E1; - - spin_unlock_irq(&sa1100_rtc_lock); - - return 0; -} - -static irqreturn_t timer1_interrupt(int irq, void *dev_id) -{ - struct platform_device *pdev = to_platform_device(dev_id); - struct rtc_device *rtc = platform_get_drvdata(pdev); - - /* - * If we match for the first time, rtc_timer1_count will be 1. - * Otherwise, we wrapped around (very unlikely but - * still possible) so compute the amount of missed periods. - * The match reg is updated only when the data is actually retrieved - * to avoid unnecessary interrupts. - */ - OSSR = OSSR_M1; /* clear match on timer1 */ - - rtc_update_irq(rtc, rtc_timer1_count, RTC_PF | RTC_IRQF); - - if (rtc_timer1_count == 1) - rtc_timer1_count = - (rtc->irq_freq * ((1 << 30) / (timer_freq >> 2))); - - /* retrigger. */ - sa1100_timer1_retrigger(rtc); - - return IRQ_HANDLED; -} - -static int sa1100_rtc_read_callback(struct device *dev, int data) -{ - if (data & RTC_PF) { - struct rtc_device *rtc = (struct rtc_device *)dev; - - /* interpolate missed periods and set match for the next */ - unsigned long period = timer_freq / rtc->irq_freq; - unsigned long oscr = OSCR; - unsigned long osmr1 = OSMR1; - unsigned long missed = (oscr - osmr1)/period; - data += missed << 8; - OSSR = OSSR_M1; /* clear match on timer 1 */ - OSMR1 = osmr1 + (missed + 1)*period; - /* Ensure we didn't miss another match in the mean time. - * Here we compare (match - OSCR) 8 instead of 0 -- - * see comment in pxa_timer_interrupt() for explanation. - */ - while ((signed long)((osmr1 = OSMR1) - OSCR) <= 8) { - data += 0x100; - OSSR = OSSR_M1; /* clear match on timer 1 */ - OSMR1 = osmr1 + period; - } - } - return data; -} - static int sa1100_rtc_open(struct device *dev) { int ret; - struct rtc_device *rtc = (struct rtc_device *)dev; + struct platform_device *plat_dev = to_platform_device(dev); + struct rtc_device *rtc = platform_get_drvdata(plat_dev); ret = request_irq(IRQ_RTC1Hz, sa1100_rtc_interrupt, IRQF_DISABLED, "rtc 1Hz", dev); @@ -260,19 +173,11 @@ static int sa1100_rtc_open(struct device *dev) dev_err(dev, "IRQ %d already in use.\n", IRQ_RTCAlrm); goto fail_ai; } - ret = request_irq(IRQ_OST1, timer1_interrupt, IRQF_DISABLED, - "rtc timer", dev); - if (ret) { - dev_err(dev, "IRQ %d already in use.\n", IRQ_OST1); - goto fail_pi; - } rtc->max_user_freq = RTC_FREQ; - sa1100_irq_set_freq(dev, RTC_FREQ); + rtc_irq_set_freq(rtc, NULL, RTC_FREQ); return 0; - fail_pi: - free_irq(IRQ_RTCAlrm, dev); fail_ai: free_irq(IRQ_RTC1Hz, dev); fail_ui: @@ -287,12 +192,10 @@ static void sa1100_rtc_release(struct device *dev) OSSR = OSSR_M1; spin_unlock_irq(&sa1100_rtc_lock); - free_irq(IRQ_OST1, dev); free_irq(IRQ_RTCAlrm, dev); free_irq(IRQ_RTC1Hz, dev); } - static int sa1100_rtc_alarm_irq_enable(struct device *dev, unsigned int enabled) { spin_lock_irq(&sa1100_rtc_lock); @@ -359,7 +262,6 @@ static int sa1100_rtc_proc(struct device *dev, struct seq_file *seq) static const struct rtc_class_ops sa1100_rtc_ops = { .open = sa1100_rtc_open, - .read_callback = sa1100_rtc_read_callback, .release = sa1100_rtc_release, .read_time = sa1100_rtc_read_time, .set_time = sa1100_rtc_set_time, @@ -373,8 +275,6 @@ static int sa1100_rtc_probe(struct platform_device *pdev) { struct rtc_device *rtc; - timer_freq = get_clock_tick_rate(); - /* * According to the manual we should be able to let RTTR be zero * and then a default diviser for a 32.768KHz clock is used. 
@@ -400,11 +300,6 @@ static int sa1100_rtc_probe(struct platform_device *pdev) platform_set_drvdata(pdev, rtc); - /* Set the irq_freq */ - /*TODO: Find out who is messing with this value after we initialize - * it here.*/ - rtc->irq_freq = RTC_FREQ; - /* Fix for a nasty initialization problem the in SA11xx RTSR register. * See also the comments in sa1100_rtc_interrupt(). * From ea04683f592e6200b52e191b7e2842aedcfd88b6 Mon Sep 17 00:00:00 2001 From: John Stultz Date: Thu, 10 Feb 2011 15:32:59 -0800 Subject: [PATCH 574/706] RTC: Fix up rtc.txt documentation to reflect changes to generic rtc layer Now that the generic RTC layer handles much of the RTC functionality, the rtc.txt documentation needs to be updated to remove outdated information. CC: Thomas Gleixner CC: Alessandro Zummo CC: Marcelo Roberto Jimenez CC: rtc-linux@googlegroups.com Signed-off-by: John Stultz --- Documentation/rtc.txt | 29 ++++++++++------------------- 1 file changed, 10 insertions(+), 19 deletions(-) diff --git a/Documentation/rtc.txt b/Documentation/rtc.txt index 9104c1062084..250160469d83 100644 --- a/Documentation/rtc.txt +++ b/Documentation/rtc.txt @@ -178,38 +178,29 @@ RTC class framework, but can't be supported by the older driver. setting the longer alarm time and enabling its IRQ using a single request (using the same model as EFI firmware). - * RTC_UIE_ON, RTC_UIE_OFF ... if the RTC offers IRQs, it probably - also offers update IRQs whenever the "seconds" counter changes. - If needed, the RTC framework can emulate this mechanism. + * RTC_UIE_ON, RTC_UIE_OFF ... if the RTC offers IRQs, the RTC framework + will emulate this mechanism. - * RTC_PIE_ON, RTC_PIE_OFF, RTC_IRQP_SET, RTC_IRQP_READ ... another - feature often accessible with an IRQ line is a periodic IRQ, issued - at settable frequencies (usually 2^N Hz). + * RTC_PIE_ON, RTC_PIE_OFF, RTC_IRQP_SET, RTC_IRQP_READ ... these ioctls + are emulated via a kernel hrtimer. In many cases, the RTC alarm can be a system wake event, used to force Linux out of a low power sleep state (or hibernation) back to a fully operational state. For example, a system could enter a deep power saving state until it's time to execute some scheduled tasks. -Note that many of these ioctls need not actually be implemented by your -driver. The common rtc-dev interface handles many of these nicely if your -driver returns ENOIOCTLCMD. Some common examples: +Note that many of these ioctls are handled by the common rtc-dev interface. +Some common examples: * RTC_RD_TIME, RTC_SET_TIME: the read_time/set_time functions will be called with appropriate values. - * RTC_ALM_SET, RTC_ALM_READ, RTC_WKALM_SET, RTC_WKALM_RD: the - set_alarm/read_alarm functions will be called. + * RTC_ALM_SET, RTC_ALM_READ, RTC_WKALM_SET, RTC_WKALM_RD: gets or sets + the alarm rtc_timer. May call the set_alarm driver function. - * RTC_IRQP_SET, RTC_IRQP_READ: the irq_set_freq function will be called - to set the frequency while the framework will handle the read for you - since the frequency is stored in the irq_freq member of the rtc_device - structure. Your driver needs to initialize the irq_freq member during - init. Make sure you check the requested frequency is in range of your - hardware in the irq_set_freq function. If it isn't, return -EINVAL. If - you cannot actually change the frequency, do not define irq_set_freq. + * RTC_IRQP_SET, RTC_IRQP_READ: These are emulated by the generic code. - * RTC_PIE_ON, RTC_PIE_OFF: the irq_set_state function will be called.
+ * RTC_PIE_ON, RTC_PIE_OFF: These are also emulated by the generic code. If all else fails, check out the rtc-test.c driver! From b9a46bba88001504235459c8410f17e6a7e38008 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Mon, 7 Mar 2011 21:13:40 +0100 Subject: [PATCH 575/706] perf top: Fix events overflow in top command The snprintf function returns the number of printed characters even if it crosses the size parameter. So passing enough events via the '-e' parameter will cause a segmentation fault. It's reproduced by the following command: perf top -e `perf list | grep Tracepoint | awk -F'[' '\ {gsub(/[[:space:]]+/,"",$1);array[FNR]=$1}END{outputs=array[1];\ for (i=2;i<=FNR;i++){ outputs=outputs "," array[i];};print outputs}'` The attached patch adds a SNPRINTF macro that provides the overflow check and returns the actual number of printed characters. Reported-by: Han Pingtian Cc: Han Pingtian Cc: Ingo Molnar Cc: Paul Mackerras Cc: Peter Zijlstra LKML-Reference: <1299528821-17521-2-git-send-email-jolsa@redhat.com> Signed-off-by: Jiri Olsa Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/top.c | 30 ++++++++++++++++++------------ 1 file changed, 18 insertions(+), 12 deletions(-) diff --git a/tools/perf/util/top.c b/tools/perf/util/top.c index 70a9c13f4ad5..4f869da4e9c7 100644 --- a/tools/perf/util/top.c +++ b/tools/perf/util/top.c @@ -61,6 +61,12 @@ static void rb_insert_active_sym(struct rb_root *tree, struct sym_entry *se) rb_insert_color(&se->rb_node, tree); } +#define SNPRINTF(buf, size, fmt, args...) \ +({ \ + size_t r = snprintf(buf, size, fmt, ## args); \ + r > size ? size : r; \ +}) + size_t perf_top__header_snprintf(struct perf_top *top, char *bf, size_t size) { struct perf_evsel *counter; @@ -70,7 +76,7 @@ size_t perf_top__header_snprintf(struct perf_top *top, char *bf, size_t size) size_t ret = 0; if (!perf_guest) { - ret = snprintf(bf, size, + ret = SNPRINTF(bf, size, " PerfTop:%8.0f irqs/sec kernel:%4.1f%%" " exact: %4.1f%% [", samples_per_sec, 100.0 - (100.0 * ((samples_per_sec - ksamples_per_sec) / @@ -81,7 +87,7 @@ size_t perf_top__header_snprintf(struct perf_top *top, char *bf, size_t size) float guest_kernel_samples_per_sec = top->guest_kernel_samples / top->delay_secs; float guest_us_samples_per_sec = top->guest_us_samples / top->delay_secs; - ret = snprintf(bf, size, + ret = SNPRINTF(bf, size, " PerfTop:%8.0f irqs/sec kernel:%4.1f%% us:%4.1f%%" " guest kernel:%4.1f%% guest us:%4.1f%%" " exact: %4.1f%% [", samples_per_sec, @@ -101,38 +107,38 @@ size_t perf_top__header_snprintf(struct perf_top *top, char *bf, size_t size) if (top->evlist->nr_entries == 1 || !top->display_weighted) { struct perf_evsel *first; first = list_entry(top->evlist->entries.next, struct perf_evsel, node); - ret += snprintf(bf + ret, size - ret, "%" PRIu64 "%s ", + ret += SNPRINTF(bf + ret, size - ret, "%" PRIu64 "%s ", (uint64_t)first->attr.sample_period, top->freq ? "Hz" : ""); } if (!top->display_weighted) { - ret += snprintf(bf + ret, size - ret, "%s", + ret += SNPRINTF(bf + ret, size - ret, "%s", event_name(top->sym_evsel)); } else list_for_each_entry(counter, &top->evlist->entries, node) { - ret += snprintf(bf + ret, size - ret, "%s%s", + ret += SNPRINTF(bf + ret, size - ret, "%s%s", counter->idx ?
"/" : "", event_name(counter)); } - ret += snprintf(bf + ret, size - ret, "], "); + ret += SNPRINTF(bf + ret, size - ret, "], "); if (top->target_pid != -1) - ret += snprintf(bf + ret, size - ret, " (target_pid: %d", + ret += SNPRINTF(bf + ret, size - ret, " (target_pid: %d", top->target_pid); else if (top->target_tid != -1) - ret += snprintf(bf + ret, size - ret, " (target_tid: %d", + ret += SNPRINTF(bf + ret, size - ret, " (target_tid: %d", top->target_tid); else - ret += snprintf(bf + ret, size - ret, " (all"); + ret += SNPRINTF(bf + ret, size - ret, " (all"); if (top->cpu_list) - ret += snprintf(bf + ret, size - ret, ", CPU%s: %s)", + ret += SNPRINTF(bf + ret, size - ret, ", CPU%s: %s)", top->evlist->cpus->nr > 1 ? "s" : "", top->cpu_list); else { if (top->target_tid != -1) - ret += snprintf(bf + ret, size - ret, ")"); + ret += SNPRINTF(bf + ret, size - ret, ")"); else - ret += snprintf(bf + ret, size - ret, ", %d CPU%s)", + ret += SNPRINTF(bf + ret, size - ret, ", %d CPU%s)", top->evlist->cpus->nr, top->evlist->cpus->nr > 1 ? "s" : ""); } From 6547250381eb315acff3d52b4872ad775359407c Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Mon, 7 Mar 2011 21:13:41 +0100 Subject: [PATCH 576/706] perf top: Don't let events eat up the whole header line Passing multiple events might force out information about pid/tid/cpu. The attached patch leaves 30 characters for this info at the expense of the events' names. Cc: Ingo Molnar Cc: Paul Mackerras Cc: Peter Zijlstra Cc: Han Pingtian LKML-Reference: <1299528821-17521-3-git-send-email-jolsa@redhat.com> Signed-off-by: Jiri Olsa Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/top.c | 20 +++++++++++++++++--- 1 file changed, 17 insertions(+), 3 deletions(-) diff --git a/tools/perf/util/top.c b/tools/perf/util/top.c index 4f869da4e9c7..75cfe4d45119 100644 --- a/tools/perf/util/top.c +++ b/tools/perf/util/top.c @@ -115,9 +115,23 @@ size_t perf_top__header_snprintf(struct perf_top *top, char *bf, size_t size) if (!top->display_weighted) { ret += SNPRINTF(bf + ret, size - ret, "%s", event_name(top->sym_evsel)); - } else list_for_each_entry(counter, &top->evlist->entries, node) { - ret += SNPRINTF(bf + ret, size - ret, "%s%s", - counter->idx ? "/" : "", event_name(counter)); + } else { + /* + * Don't let events eat all the space. Leaving 30 bytes + * for the rest should be enough. + */ + size_t last_pos = size - 30; + + list_for_each_entry(counter, &top->evlist->entries, node) { + ret += SNPRINTF(bf + ret, size - ret, "%s%s", + counter->idx ? "/" : "", + event_name(counter)); + if (ret > last_pos) { + sprintf(bf + last_pos - 3, ".."); + ret = last_pos - 1; + break; + } + } } ret += SNPRINTF(bf + ret, size - ret, "], "); From a91e5431d54f5359fccb5ec2512f252eb217707e Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Thu, 10 Mar 2011 11:15:54 -0300 Subject: [PATCH 577/706] perf session: Use evlist/evsel for managing perf.data attributes So that we can reuse things like the id to attr lookup routine (perf_evlist__id2evsel) that uses a hash table instead of the linear lookup done in the older perf_header_attr routines, etc. It also makes evsel/evlist a more pervasive API, simplifying use of the emerging perf lib.
Cc: Arun Sharma Cc: Frederic Weisbecker Cc: Ingo Molnar Cc: Mike Galbraith Cc: Paul Mackerras Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Tom Zanussi LKML-Reference: Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-record.c | 101 ++++++---------- tools/perf/builtin-report.c | 4 +- tools/perf/builtin-top.c | 5 +- tools/perf/util/evlist.c | 26 +++-- tools/perf/util/evlist.h | 4 +- tools/perf/util/evsel.c | 21 +++- tools/perf/util/evsel.h | 9 +- tools/perf/util/header.c | 225 ++++++++++++++---------------------- tools/perf/util/header.h | 36 ++---- tools/perf/util/session.c | 74 +----------- tools/perf/util/session.h | 2 - 11 files changed, 179 insertions(+), 328 deletions(-) diff --git a/tools/perf/builtin-record.c b/tools/perf/builtin-record.c index d40a81e8cc56..6febcc168a8c 100644 --- a/tools/perf/builtin-record.c +++ b/tools/perf/builtin-record.c @@ -31,7 +31,6 @@ #include #define FD(e, x, y) (*(int *)xyarray__entry(e->fd, x, y)) -#define SID(e, x, y) xyarray__entry(e->id, x, y) enum write_mode_t { WRITE_FORCE, @@ -40,7 +39,6 @@ enum write_mode_t { static u64 user_interval = ULLONG_MAX; static u64 default_interval = 0; -static u64 sample_type; static unsigned int page_size; static unsigned int mmap_pages = 128; @@ -160,54 +158,6 @@ static void sig_atexit(void) kill(getpid(), signr); } -static struct perf_header_attr *get_header_attr(struct perf_event_attr *a, int nr) -{ - struct perf_header_attr *h_attr; - - if (nr < session->header.attrs) { - h_attr = session->header.attr[nr]; - } else { - h_attr = perf_header_attr__new(a); - if (h_attr != NULL) - if (perf_header__add_attr(&session->header, h_attr) < 0) { - perf_header_attr__delete(h_attr); - h_attr = NULL; - } - } - - return h_attr; -} - -static void create_counter(struct perf_evsel *evsel, int cpu) -{ - struct perf_event_attr *attr = &evsel->attr; - struct perf_header_attr *h_attr; - struct perf_sample_id *sid; - int thread_index; - - for (thread_index = 0; thread_index < evsel_list->threads->nr; thread_index++) { - h_attr = get_header_attr(attr, evsel->idx); - if (h_attr == NULL) - die("nomem\n"); - - if (!file_new) { - if (memcmp(&h_attr->attr, attr, sizeof(*attr))) { - fprintf(stderr, "incompatible append\n"); - exit(-1); - } - } - - sid = SID(evsel, cpu, thread_index); - if (perf_header_attr__add_id(h_attr, sid->id) < 0) { - pr_warning("Not enough memory to add id\n"); - exit(-1); - } - } - - if (!sample_type) - sample_type = attr->sample_type; -} - static void config_attr(struct perf_evsel *evsel, struct perf_evlist *evlist) { struct perf_event_attr *attr = &evsel->attr; @@ -278,10 +228,28 @@ static void config_attr(struct perf_evsel *evsel, struct perf_evlist *evlist) } } +static bool perf_evlist__equal(struct perf_evlist *evlist, + struct perf_evlist *other) +{ + struct perf_evsel *pos, *pair; + + if (evlist->nr_entries != other->nr_entries) + return false; + + pair = list_entry(other->entries.next, struct perf_evsel, node); + + list_for_each_entry(pos, &evlist->entries, node) { + if (memcmp(&pos->attr, &pair->attr, sizeof(pos->attr)) != 0) + return false; + pair = list_entry(pair->node.next, struct perf_evsel, node); + } + + return true; +} + static void open_counters(struct perf_evlist *evlist) { struct perf_evsel *pos; - int cpu; list_for_each_entry(pos, &evlist->entries, node) { struct perf_event_attr *attr = &pos->attr; @@ -364,10 +332,16 @@ try_again: if (perf_evlist__mmap(evlist, mmap_pages, false) < 0) die("failed to mmap with %d (%s)\n", errno, strerror(errno)); - for (cpu = 0; cpu < evsel_list->cpus->nr;
++cpu) { - list_for_each_entry(pos, &evlist->entries, node) - create_counter(pos, cpu); - } + if (file_new) + session->evlist = evlist; + else { + if (!perf_evlist__equal(session->evlist, evlist)) { + fprintf(stderr, "incompatible append\n"); + exit(-1); + } + } + + perf_session__update_sample_type(session); } static int process_buildids(void) @@ -390,7 +364,7 @@ static void atexit_header(void) if (!no_buildid) process_buildids(); - perf_header__write(&session->header, evsel_list, output, true); + perf_session__write_header(session, evsel_list, output, true); perf_session__delete(session); perf_evlist__delete(evsel_list); symbol__exit(); @@ -524,7 +498,7 @@ static int __cmd_record(int argc, const char **argv) perf_header__set_feat(&session->header, HEADER_BUILD_ID); if (!file_new) { - err = perf_header__read(session, output); + err = perf_session__read_header(session, output); if (err < 0) goto out_delete_session; } @@ -588,8 +562,6 @@ static int __cmd_record(int argc, const char **argv) open_counters(evsel_list); - perf_session__set_sample_type(session, sample_type); - /* * perf_session__delete(session) will be called at atexit_header() */ @@ -600,20 +572,17 @@ static int __cmd_record(int argc, const char **argv) if (err < 0) return err; } else if (file_new) { - err = perf_header__write(&session->header, evsel_list, - output, false); + err = perf_session__write_header(session, evsel_list, + output, false); if (err < 0) return err; } post_processing_offset = lseek(output, 0, SEEK_CUR); - perf_session__set_sample_id_all(session, sample_id_all_avail); - if (pipe_output) { - err = perf_event__synthesize_attrs(&session->header, - process_synthesized_event, - session); + err = perf_session__synthesize_attrs(session, + process_synthesized_event); if (err < 0) { pr_err("Couldn't synthesize attrs.\n"); return err; diff --git a/tools/perf/builtin-report.c b/tools/perf/builtin-report.c index e9b5d513333a..b1b82009ab9b 100644 --- a/tools/perf/builtin-report.c +++ b/tools/perf/builtin-report.c @@ -70,8 +70,8 @@ static int perf_session__add_hist_entry(struct perf_session *session, * FIXME: Propagate this back, but at least we're in a builtin, * where exit() is allowed. 
;-) */ - ui__warning("Invalid %s file, contains samples with id not in " - "its header!\n", input_name); + ui__warning("Invalid %s file, contains samples with id %" PRIu64 " not in " + "its header!\n", input_name, sample->id); exit_browser(0); exit(1); } diff --git a/tools/perf/builtin-top.c b/tools/perf/builtin-top.c index 417f757e3cbe..80c9e062bd5b 100644 --- a/tools/perf/builtin-top.c +++ b/tools/perf/builtin-top.c @@ -883,7 +883,6 @@ try_again: static int __cmd_top(void) { pthread_t thread; - struct perf_evsel *first; int ret __used; /* * FIXME: perf_session__new should allow passing a O_MMAP, so that all this @@ -900,8 +899,8 @@ static int __cmd_top(void) perf_event__synthesize_threads(perf_event__process, session); start_counters(top.evlist); - first = list_entry(top.evlist->entries.next, struct perf_evsel, node); - perf_session__set_sample_type(session, first->attr.sample_type); + session->evlist = top.evlist; + perf_session__update_sample_type(session); /* Wait for a minimal set of events before starting the snapshot */ poll(top.evlist->pollfd, top.evlist->nr_fds, 100); diff --git a/tools/perf/util/evlist.c b/tools/perf/util/evlist.c index 190c64c6e266..d852cefa20de 100644 --- a/tools/perf/util/evlist.c +++ b/tools/perf/util/evlist.c @@ -19,7 +19,7 @@ #include #define FD(e, x, y) (*(int *)xyarray__entry(e->fd, x, y)) -#define SID(e, x, y) xyarray__entry(e->id, x, y) +#define SID(e, x, y) xyarray__entry(e->sample_id, x, y) void perf_evlist__init(struct perf_evlist *evlist, struct cpu_map *cpus, struct thread_map *threads) @@ -106,8 +106,9 @@ void perf_evlist__add_pollfd(struct perf_evlist *evlist, int fd) evlist->nr_fds++; } -void perf_evlist__id_hash(struct perf_evlist *evlist, struct perf_evsel *evsel, - int cpu, int thread, u64 id) +static void perf_evlist__id_hash(struct perf_evlist *evlist, + struct perf_evsel *evsel, + int cpu, int thread, u64 id) { int hash; struct perf_sample_id *sid = SID(evsel, cpu, thread); @@ -118,9 +119,16 @@ void perf_evlist__id_hash(struct perf_evlist *evlist, struct perf_evsel *evsel, hlist_add_head(&sid->node, &evlist->heads[hash]); } -static int perf_evlist__id_hash_fd(struct perf_evlist *evlist, - struct perf_evsel *evsel, - int cpu, int thread, int fd) +void perf_evlist__id_add(struct perf_evlist *evlist, struct perf_evsel *evsel, + int cpu, int thread, u64 id) +{ + perf_evlist__id_hash(evlist, evsel, cpu, thread, id); + evsel->id[evsel->ids++] = id; +} + +static int perf_evlist__id_add_fd(struct perf_evlist *evlist, + struct perf_evsel *evsel, + int cpu, int thread, int fd) { u64 read_data[4] = { 0, }; int id_idx = 1; /* The first entry is the counter value */ @@ -134,7 +142,7 @@ static int perf_evlist__id_hash_fd(struct perf_evlist *evlist, if (evsel->attr.read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) ++id_idx; - perf_evlist__id_hash(evlist, evsel, cpu, thread, read_data[id_idx]); + perf_evlist__id_add(evlist, evsel, cpu, thread, read_data[id_idx]); return 0; } @@ -292,7 +300,7 @@ int perf_evlist__mmap(struct perf_evlist *evlist, int pages, bool overwrite) list_for_each_entry(evsel, &evlist->entries, node) { if ((evsel->attr.read_format & PERF_FORMAT_ID) && - evsel->id == NULL && + evsel->sample_id == NULL && perf_evsel__alloc_id(evsel, cpus->nr, threads->nr) < 0) return -ENOMEM; @@ -308,7 +316,7 @@ int perf_evlist__mmap(struct perf_evlist *evlist, int pages, bool overwrite) goto out_unmap; if ((evsel->attr.read_format & PERF_FORMAT_ID) && - perf_evlist__id_hash_fd(evlist, evsel, cpu, thread, fd) < 0) + perf_evlist__id_add_fd(evlist, evsel, 
cpu, thread, fd) < 0) goto out_unmap; } } diff --git a/tools/perf/util/evlist.h b/tools/perf/util/evlist.h index 078d51256085..8b1cb7a4c5f1 100644 --- a/tools/perf/util/evlist.h +++ b/tools/perf/util/evlist.h @@ -38,8 +38,8 @@ void perf_evlist__delete(struct perf_evlist *evlist); void perf_evlist__add(struct perf_evlist *evlist, struct perf_evsel *entry); int perf_evlist__add_default(struct perf_evlist *evlist); -void perf_evlist__id_hash(struct perf_evlist *evlist, struct perf_evsel *evsel, - int cpu, int thread, u64 id); +void perf_evlist__id_add(struct perf_evlist *evlist, struct perf_evsel *evsel, + int cpu, int thread, u64 id); int perf_evlist__alloc_pollfd(struct perf_evlist *evlist); void perf_evlist__add_pollfd(struct perf_evlist *evlist, int fd); diff --git a/tools/perf/util/evsel.c b/tools/perf/util/evsel.c index 8083d5126fca..662596afd7f1 100644 --- a/tools/perf/util/evsel.c +++ b/tools/perf/util/evsel.c @@ -41,8 +41,18 @@ int perf_evsel__alloc_fd(struct perf_evsel *evsel, int ncpus, int nthreads) int perf_evsel__alloc_id(struct perf_evsel *evsel, int ncpus, int nthreads) { - evsel->id = xyarray__new(ncpus, nthreads, sizeof(struct perf_sample_id)); - return evsel->id != NULL ? 0 : -ENOMEM; + evsel->sample_id = xyarray__new(ncpus, nthreads, sizeof(struct perf_sample_id)); + if (evsel->sample_id == NULL) + return -ENOMEM; + + evsel->id = zalloc(ncpus * nthreads * sizeof(u64)); + if (evsel->id == NULL) { + xyarray__delete(evsel->sample_id); + evsel->sample_id = NULL; + return -ENOMEM; + } + + return 0; } int perf_evsel__alloc_counts(struct perf_evsel *evsel, int ncpus) @@ -60,7 +70,9 @@ void perf_evsel__free_fd(struct perf_evsel *evsel) void perf_evsel__free_id(struct perf_evsel *evsel) { - xyarray__delete(evsel->id); + xyarray__delete(evsel->sample_id); + evsel->sample_id = NULL; + free(evsel->id); evsel->id = NULL; } @@ -79,7 +91,8 @@ void perf_evsel__exit(struct perf_evsel *evsel) { assert(list_empty(&evsel->node)); xyarray__delete(evsel->fd); - xyarray__delete(evsel->id); + xyarray__delete(evsel->sample_id); + free(evsel->id); } void perf_evsel__delete(struct perf_evsel *evsel) diff --git a/tools/perf/util/evsel.h b/tools/perf/util/evsel.h index 281b60e5fc7b..6710ab538342 100644 --- a/tools/perf/util/evsel.h +++ b/tools/perf/util/evsel.h @@ -49,12 +49,17 @@ struct perf_evsel { struct perf_event_attr attr; char *filter; struct xyarray *fd; - struct xyarray *id; + struct xyarray *sample_id; + u64 *id; struct perf_counts *counts; int idx; + int ids; struct hists hists; char *name; - void *priv; + union { + void *priv; + off_t id_offset; + }; struct cgroup_sel *cgrp; }; diff --git a/tools/perf/util/header.c b/tools/perf/util/header.c index 108b0db7bbef..40b10e4d3927 100644 --- a/tools/perf/util/header.c +++ b/tools/perf/util/header.c @@ -9,6 +9,7 @@ #include #include "evlist.h" +#include "evsel.h" #include "util.h" #include "header.h" #include "../perf.h" @@ -19,89 +20,6 @@ static bool no_buildid_cache = false; -/* - * Create new perf.data header attribute: - */ -struct perf_header_attr *perf_header_attr__new(struct perf_event_attr *attr) -{ - struct perf_header_attr *self = malloc(sizeof(*self)); - - if (self != NULL) { - self->attr = *attr; - self->ids = 0; - self->size = 1; - self->id = malloc(sizeof(u64)); - if (self->id == NULL) { - free(self); - self = NULL; - } - } - - return self; -} - -void perf_header_attr__delete(struct perf_header_attr *self) -{ - free(self->id); - free(self); -} - -int perf_header_attr__add_id(struct perf_header_attr *self, u64 id) -{ - int pos = 
self->ids; - - self->ids++; - if (self->ids > self->size) { - int nsize = self->size * 2; - u64 *nid = realloc(self->id, nsize * sizeof(u64)); - - if (nid == NULL) - return -1; - - self->size = nsize; - self->id = nid; - } - self->id[pos] = id; - return 0; -} - -int perf_header__init(struct perf_header *self) -{ - self->size = 1; - self->attr = malloc(sizeof(void *)); - return self->attr == NULL ? -ENOMEM : 0; -} - -void perf_header__exit(struct perf_header *self) -{ - int i; - for (i = 0; i < self->attrs; ++i) - perf_header_attr__delete(self->attr[i]); - free(self->attr); -} - -int perf_header__add_attr(struct perf_header *self, - struct perf_header_attr *attr) -{ - if (self->frozen) - return -1; - - if (self->attrs == self->size) { - int nsize = self->size * 2; - struct perf_header_attr **nattr; - - nattr = realloc(self->attr, nsize * sizeof(void *)); - if (nattr == NULL) - return -1; - - self->size = nsize; - self->attr = nattr; - } - - self->attr[self->attrs++] = attr; - return 0; -} - static int event_count; static struct perf_trace_event_type *events; @@ -515,33 +433,41 @@ int perf_header__write_pipe(int fd) return 0; } -int perf_header__write(struct perf_header *self, struct perf_evlist *evlist, - int fd, bool at_exit) +int perf_session__write_header(struct perf_session *session, + struct perf_evlist *evlist, + int fd, bool at_exit) { struct perf_file_header f_header; struct perf_file_attr f_attr; - struct perf_header_attr *attr; - int i, err; + struct perf_header *self = &session->header; + struct perf_evsel *attr, *pair = NULL; + int err; lseek(fd, sizeof(f_header), SEEK_SET); - for (i = 0; i < self->attrs; i++) { - attr = self->attr[i]; + if (session->evlist != evlist) + pair = list_entry(session->evlist->entries.next, struct perf_evsel, node); + list_for_each_entry(attr, &evlist->entries, node) { attr->id_offset = lseek(fd, 0, SEEK_CUR); err = do_write(fd, attr->id, attr->ids * sizeof(u64)); if (err < 0) { +out_err_write: pr_debug("failed to write perf header\n"); return err; } + if (session->evlist != evlist) { + err = do_write(fd, pair->id, pair->ids * sizeof(u64)); + if (err < 0) + goto out_err_write; + attr->ids += pair->ids; + pair = list_entry(pair->node.next, struct perf_evsel, node); + } } - self->attr_offset = lseek(fd, 0, SEEK_CUR); - for (i = 0; i < self->attrs; i++) { - attr = self->attr[i]; - + list_for_each_entry(attr, &evlist->entries, node) { f_attr = (struct perf_file_attr){ .attr = attr->attr, .ids = { @@ -580,7 +506,7 @@ int perf_header__write(struct perf_header *self, struct perf_evlist *evlist, .attr_size = sizeof(f_attr), .attrs = { .offset = self->attr_offset, - .size = self->attrs * sizeof(f_attr), + .size = evlist->nr_entries * sizeof(f_attr), }, .data = { .offset = self->data_offset, @@ -861,7 +787,7 @@ static int perf_header__read_pipe(struct perf_session *session, int fd) return 0; } -int perf_header__read(struct perf_session *session, int fd) +int perf_session__read_header(struct perf_session *session, int fd) { struct perf_header *self = &session->header; struct perf_file_header f_header; @@ -869,6 +795,10 @@ int perf_header__read(struct perf_session *session, int fd) u64 f_id; int nr_attrs, nr_ids, i, j; + session->evlist = perf_evlist__new(NULL, NULL); + if (session->evlist == NULL) + return -ENOMEM; + if (session->fd_pipe) return perf_header__read_pipe(session, fd); @@ -881,33 +811,39 @@ int perf_header__read(struct perf_session *session, int fd) lseek(fd, f_header.attrs.offset, SEEK_SET); for (i = 0; i < nr_attrs; i++) { - struct 
perf_header_attr *attr; + struct perf_evsel *evsel; off_t tmp; if (perf_header__getbuffer64(self, fd, &f_attr, sizeof(f_attr))) goto out_errno; tmp = lseek(fd, 0, SEEK_CUR); + evsel = perf_evsel__new(&f_attr.attr, i); - attr = perf_header_attr__new(&f_attr.attr); - if (attr == NULL) - return -ENOMEM; + if (evsel == NULL) + goto out_delete_evlist; + /* + * Do it before so that if perf_evsel__alloc_id fails, this + * entry gets purged too at perf_evlist__delete(). + */ + perf_evlist__add(session->evlist, evsel); nr_ids = f_attr.ids.size / sizeof(u64); + /* + * We don't have the cpu and thread maps on the header, so + * for allocating the perf_sample_id table we fake 1 cpu and + * hattr->ids threads. + */ + if (perf_evsel__alloc_id(evsel, 1, nr_ids)) + goto out_delete_evlist; + lseek(fd, f_attr.ids.offset, SEEK_SET); for (j = 0; j < nr_ids; j++) { if (perf_header__getbuffer64(self, fd, &f_id, sizeof(f_id))) goto out_errno; - if (perf_header_attr__add_id(attr, f_id) < 0) { - perf_header_attr__delete(attr); - return -ENOMEM; - } - } - if (perf_header__add_attr(self, attr) < 0) { - perf_header_attr__delete(attr); - return -ENOMEM; + perf_evlist__id_add(session->evlist, evsel, 0, j, f_id); } lseek(fd, tmp, SEEK_SET); @@ -932,37 +868,38 @@ int perf_header__read(struct perf_session *session, int fd) return 0; out_errno: return -errno; + +out_delete_evlist: + perf_evlist__delete(session->evlist); + session->evlist = NULL; + return -ENOMEM; } -u64 perf_header__sample_type(struct perf_header *header) +u64 perf_evlist__sample_type(struct perf_evlist *evlist) { + struct perf_evsel *pos; u64 type = 0; - int i; - - for (i = 0; i < header->attrs; i++) { - struct perf_header_attr *attr = header->attr[i]; + list_for_each_entry(pos, &evlist->entries, node) { if (!type) - type = attr->attr.sample_type; - else if (type != attr->attr.sample_type) + type = pos->attr.sample_type; + else if (type != pos->attr.sample_type) die("non matching sample_type"); } return type; } -bool perf_header__sample_id_all(const struct perf_header *header) +bool perf_evlist__sample_id_all(const struct perf_evlist *evlist) { bool value = false, first = true; - int i; - - for (i = 0; i < header->attrs; i++) { - struct perf_header_attr *attr = header->attr[i]; + struct perf_evsel *pos; + list_for_each_entry(pos, &evlist->entries, node) { if (first) { - value = attr->attr.sample_id_all; + value = pos->attr.sample_id_all; first = false; - } else if (value != attr->attr.sample_id_all) + } else if (value != pos->attr.sample_id_all) die("non matching sample_id_all"); } @@ -1000,16 +937,13 @@ int perf_event__synthesize_attr(struct perf_event_attr *attr, u16 ids, u64 *id, return err; } -int perf_event__synthesize_attrs(struct perf_header *self, - perf_event__handler_t process, - struct perf_session *session) +int perf_session__synthesize_attrs(struct perf_session *session, + perf_event__handler_t process) { - struct perf_header_attr *attr; - int i, err = 0; - - for (i = 0; i < self->attrs; i++) { - attr = self->attr[i]; + struct perf_evsel *attr; + int err = 0; + list_for_each_entry(attr, &session->evlist->entries, node) { err = perf_event__synthesize_attr(&attr->attr, attr->ids, attr->id, process, session); if (err) { @@ -1024,27 +958,36 @@ int perf_event__synthesize_attrs(struct perf_header *self, int perf_event__process_attr(union perf_event *event, struct perf_session *session) { - struct perf_header_attr *attr; unsigned int i, ids, n_ids; + struct perf_evsel *evsel; - attr = perf_header_attr__new(&event->attr.attr); - if (attr == NULL) + 
if (session->evlist == NULL) { + session->evlist = perf_evlist__new(NULL, NULL); + if (session->evlist == NULL) + return -ENOMEM; + } + + evsel = perf_evsel__new(&event->attr.attr, + session->evlist->nr_entries); + if (evsel == NULL) return -ENOMEM; + perf_evlist__add(session->evlist, evsel); + ids = event->header.size; ids -= (void *)&event->attr.id - (void *)event; n_ids = ids / sizeof(u64); + /* + * We don't have the cpu and thread maps on the header, so + * for allocating the perf_sample_id table we fake 1 cpu and + * hattr->ids threads. + */ + if (perf_evsel__alloc_id(evsel, 1, n_ids)) + return -ENOMEM; for (i = 0; i < n_ids; i++) { - if (perf_header_attr__add_id(attr, event->attr.id[i]) < 0) { - perf_header_attr__delete(attr); - return -ENOMEM; - } - } - - if (perf_header__add_attr(&session->header, attr) < 0) { - perf_header_attr__delete(attr); - return -ENOMEM; + perf_evlist__id_add(session->evlist, evsel, 0, i, + event->attr.id[i]); } perf_session__update_sample_type(session); diff --git a/tools/perf/util/header.h b/tools/perf/util/header.h index 2fab13348aab..4cc267559bb6 100644 --- a/tools/perf/util/header.h +++ b/tools/perf/util/header.h @@ -9,13 +9,6 @@ #include -struct perf_header_attr { - struct perf_event_attr attr; - int ids, size; - u64 *id; - off_t id_offset; -}; - enum { HEADER_TRACE_INFO = 1, HEADER_BUILD_ID, @@ -51,9 +44,7 @@ int perf_file_header__read(struct perf_file_header *self, struct perf_header { int frozen; - int attrs, size; bool needs_swap; - struct perf_header_attr **attr; s64 attr_offset; u64 data_offset; u64 data_size; @@ -62,29 +53,19 @@ struct perf_header { DECLARE_BITMAP(adds_features, HEADER_FEAT_BITS); }; -int perf_header__init(struct perf_header *self); -void perf_header__exit(struct perf_header *self); - struct perf_evlist; -int perf_header__read(struct perf_session *session, int fd); -int perf_header__write(struct perf_header *self, struct perf_evlist *evlist, - int fd, bool at_exit); +int perf_session__read_header(struct perf_session *session, int fd); +int perf_session__write_header(struct perf_session *session, + struct perf_evlist *evlist, + int fd, bool at_exit); int perf_header__write_pipe(int fd); -int perf_header__add_attr(struct perf_header *self, - struct perf_header_attr *attr); - int perf_header__push_event(u64 id, const char *name); char *perf_header__find_event(u64 id); -struct perf_header_attr *perf_header_attr__new(struct perf_event_attr *attr); -void perf_header_attr__delete(struct perf_header_attr *self); - -int perf_header_attr__add_id(struct perf_header_attr *self, u64 id); - -u64 perf_header__sample_type(struct perf_header *header); -bool perf_header__sample_id_all(const struct perf_header *header); +u64 perf_evlist__sample_type(struct perf_evlist *evlist); +bool perf_evlist__sample_id_all(const struct perf_evlist *evlist); void perf_header__set_feat(struct perf_header *self, int feat); void perf_header__clear_feat(struct perf_header *self, int feat); bool perf_header__has_feat(const struct perf_header *self, int feat); @@ -101,9 +82,8 @@ int build_id_cache__remove_s(const char *sbuild_id, const char *debugdir); int perf_event__synthesize_attr(struct perf_event_attr *attr, u16 ids, u64 *id, perf_event__handler_t process, struct perf_session *session); -int perf_event__synthesize_attrs(struct perf_header *self, - perf_event__handler_t process, - struct perf_session *session); +int perf_session__synthesize_attrs(struct perf_session *session, + perf_event__handler_t process); int perf_event__process_attr(union perf_event *event, 
struct perf_session *session); int perf_event__synthesize_event_type(u64 event_id, char *name, diff --git a/tools/perf/util/session.c b/tools/perf/util/session.c index 0d414199889d..f26639fa0fb3 100644 --- a/tools/perf/util/session.c +++ b/tools/perf/util/session.c @@ -13,46 +13,6 @@ #include "sort.h" #include "util.h" -static int perf_session__read_evlist(struct perf_session *session) -{ - int i, j; - - session->evlist = perf_evlist__new(NULL, NULL); - if (session->evlist == NULL) - return -ENOMEM; - - for (i = 0; i < session->header.attrs; ++i) { - struct perf_header_attr *hattr = session->header.attr[i]; - struct perf_evsel *evsel = perf_evsel__new(&hattr->attr, i); - - if (evsel == NULL) - goto out_delete_evlist; - /* - * Do it before so that if perf_evsel__alloc_id fails, this - * entry gets purged too at perf_evlist__delete(). - */ - perf_evlist__add(session->evlist, evsel); - /* - * We don't have the cpu and thread maps on the header, so - * for allocating the perf_sample_id table we fake 1 cpu and - * hattr->ids threads. - */ - if (perf_evsel__alloc_id(evsel, 1, hattr->ids)) - goto out_delete_evlist; - - for (j = 0; j < hattr->ids; ++j) - perf_evlist__id_hash(session->evlist, evsel, 0, j, - hattr->id[j]); - } - - return 0; - -out_delete_evlist: - perf_evlist__delete(session->evlist); - session->evlist = NULL; - return -ENOMEM; -} - static int perf_session__open(struct perf_session *self, bool force) { struct stat input_stat; @@ -61,7 +21,7 @@ static int perf_session__open(struct perf_session *self, bool force) self->fd_pipe = true; self->fd = STDIN_FILENO; - if (perf_header__read(self, self->fd) < 0) + if (perf_session__read_header(self, self->fd) < 0) pr_err("incompatible file format"); return 0; @@ -93,16 +53,11 @@ static int perf_session__open(struct perf_session *self, bool force) goto out_close; } - if (perf_header__read(self, self->fd) < 0) { + if (perf_session__read_header(self, self->fd) < 0) { pr_err("incompatible file format"); goto out_close; } - if (perf_session__read_evlist(self) < 0) { - pr_err("Not enough memory to read the event selector list\n"); - goto out_close; - } - self->size = input_stat.st_size; return 0; @@ -139,21 +94,10 @@ out: session->id_hdr_size = size; } -void perf_session__set_sample_id_all(struct perf_session *session, bool value) -{ - session->sample_id_all = value; - perf_session__id_header_size(session); -} - -void perf_session__set_sample_type(struct perf_session *session, u64 type) -{ - session->sample_type = type; -} - void perf_session__update_sample_type(struct perf_session *self) { - self->sample_type = perf_header__sample_type(&self->header); - self->sample_id_all = perf_header__sample_id_all(&self->header); + self->sample_type = perf_evlist__sample_type(self->evlist); + self->sample_id_all = perf_evlist__sample_id_all(self->evlist); perf_session__id_header_size(self); } @@ -182,9 +126,6 @@ struct perf_session *perf_session__new(const char *filename, int mode, if (self == NULL) goto out; - if (perf_header__init(&self->header) < 0) - goto out_free; - memcpy(self->filename, filename, len); self->threads = RB_ROOT; INIT_LIST_HEAD(&self->dead_threads); @@ -208,6 +149,7 @@ struct perf_session *perf_session__new(const char *filename, int mode, if (mode == O_RDONLY) { if (perf_session__open(self, force) < 0) goto out_delete; + perf_session__update_sample_type(self); } else if (mode == O_WRONLY) { /* * In O_RDONLY mode this will be performed when reading the @@ -217,8 +159,6 @@ struct perf_session *perf_session__new(const char *filename, int mode, 
goto out_delete; } - perf_session__update_sample_type(self); - if (ops && ops->ordering_requires_timestamps && ops->ordered_samples && !self->sample_id_all) { dump_printf("WARNING: No sample_id_all support, falling back to unordered processing\n"); @@ -227,9 +167,6 @@ struct perf_session *perf_session__new(const char *filename, int mode, out: return self; -out_free: - free(self); - return NULL; out_delete: perf_session__delete(self); return NULL; @@ -260,7 +197,6 @@ static void perf_session__delete_threads(struct perf_session *self) void perf_session__delete(struct perf_session *self) { - perf_header__exit(&self->header); perf_session__destroy_kernel_maps(self); perf_session__delete_dead_threads(self); perf_session__delete_threads(self); diff --git a/tools/perf/util/session.h b/tools/perf/util/session.h index 05dd7bcb9453..b5b148b0aaca 100644 --- a/tools/perf/util/session.h +++ b/tools/perf/util/session.h @@ -112,8 +112,6 @@ void mem_bswap_64(void *src, int byte_size); int perf_session__create_kernel_maps(struct perf_session *self); void perf_session__update_sample_type(struct perf_session *self); -void perf_session__set_sample_id_all(struct perf_session *session, bool value); -void perf_session__set_sample_type(struct perf_session *session, u64 type); void perf_session__remove_thread(struct perf_session *self, struct thread *th); static inline From 1c0b04d10bbe35279c50e3b36cf5b8ec2a0050d8 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Wed, 9 Mar 2011 08:13:19 -0300 Subject: [PATCH 578/706] perf header: Stop using 'self' Stop using this python/OOP convention; it doesn't really help. Will do more from time to time till we get it cleaned up in all of tools/perf. Suggested-by: Thomas Gleixner LKML-Reference: Cc: Frederic Weisbecker Cc: Ingo Molnar Cc: Mike Galbraith Cc: Paul Mackerras Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Thomas Gleixner Cc: Tom Zanussi Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/header.c | 199 +++++++++++++++++++-------------------- tools/perf/util/header.h | 12 +-- 2 files changed, 105 insertions(+), 106 deletions(-) diff --git a/tools/perf/util/header.c b/tools/perf/util/header.c index 40b10e4d3927..5a72d421e211 100644 --- a/tools/perf/util/header.c +++ b/tools/perf/util/header.c @@ -66,19 +66,19 @@ struct perf_file_attr { struct perf_file_section ids; }; -void perf_header__set_feat(struct perf_header *self, int feat) +void perf_header__set_feat(struct perf_header *header, int feat) { - set_bit(feat, self->adds_features); + set_bit(feat, header->adds_features); } -void perf_header__clear_feat(struct perf_header *self, int feat) +void perf_header__clear_feat(struct perf_header *header, int feat) { - clear_bit(feat, self->adds_features); + clear_bit(feat, header->adds_features); } -bool perf_header__has_feat(const struct perf_header *self, int feat) +bool perf_header__has_feat(const struct perf_header *header, int feat) { - return test_bit(feat, self->adds_features); + return test_bit(feat, header->adds_features); } static int do_write(int fd, const void *buf, size_t size) @@ -147,22 +147,22 @@ static int __dsos__write_buildid_table(struct list_head *head, pid_t pid, return 0; } -static int machine__write_buildid_table(struct machine *self, int fd) +static int machine__write_buildid_table(struct machine *machine, int fd) { int err; u16 kmisc = PERF_RECORD_MISC_KERNEL, umisc = PERF_RECORD_MISC_USER; - if (!machine__is_host(self)) { + if (!machine__is_host(machine)) { kmisc = PERF_RECORD_MISC_GUEST_KERNEL; umisc = PERF_RECORD_MISC_GUEST_USER;
} - err = __dsos__write_buildid_table(&self->kernel_dsos, self->pid, + err = __dsos__write_buildid_table(&machine->kernel_dsos, machine->pid, kmisc, fd); if (err == 0) - err = __dsos__write_buildid_table(&self->user_dsos, - self->pid, umisc, fd); + err = __dsos__write_buildid_table(&machine->user_dsos, + machine->pid, umisc, fd); return err; } @@ -280,12 +280,12 @@ out_free: return err; } -static int dso__cache_build_id(struct dso *self, const char *debugdir) +static int dso__cache_build_id(struct dso *dso, const char *debugdir) { - bool is_kallsyms = self->kernel && self->long_name[0] != '/'; + bool is_kallsyms = dso->kernel && dso->long_name[0] != '/'; - return build_id_cache__add_b(self->build_id, sizeof(self->build_id), - self->long_name, debugdir, is_kallsyms); + return build_id_cache__add_b(dso->build_id, sizeof(dso->build_id), + dso->long_name, debugdir, is_kallsyms); } static int __dsos__cache_build_ids(struct list_head *head, const char *debugdir) @@ -300,14 +300,14 @@ static int __dsos__cache_build_ids(struct list_head *head, const char *debugdir) return err; } -static int machine__cache_build_ids(struct machine *self, const char *debugdir) +static int machine__cache_build_ids(struct machine *machine, const char *debugdir) { - int ret = __dsos__cache_build_ids(&self->kernel_dsos, debugdir); - ret |= __dsos__cache_build_ids(&self->user_dsos, debugdir); + int ret = __dsos__cache_build_ids(&machine->kernel_dsos, debugdir); + ret |= __dsos__cache_build_ids(&machine->user_dsos, debugdir); return ret; } -static int perf_session__cache_build_ids(struct perf_session *self) +static int perf_session__cache_build_ids(struct perf_session *session) { struct rb_node *nd; int ret; @@ -318,28 +318,28 @@ static int perf_session__cache_build_ids(struct perf_session *self) if (mkdir(debugdir, 0755) != 0 && errno != EEXIST) return -1; - ret = machine__cache_build_ids(&self->host_machine, debugdir); + ret = machine__cache_build_ids(&session->host_machine, debugdir); - for (nd = rb_first(&self->machines); nd; nd = rb_next(nd)) { + for (nd = rb_first(&session->machines); nd; nd = rb_next(nd)) { struct machine *pos = rb_entry(nd, struct machine, rb_node); ret |= machine__cache_build_ids(pos, debugdir); } return ret ? 
-1 : 0; } -static bool machine__read_build_ids(struct machine *self, bool with_hits) +static bool machine__read_build_ids(struct machine *machine, bool with_hits) { - bool ret = __dsos__read_build_ids(&self->kernel_dsos, with_hits); - ret |= __dsos__read_build_ids(&self->user_dsos, with_hits); + bool ret = __dsos__read_build_ids(&machine->kernel_dsos, with_hits); + ret |= __dsos__read_build_ids(&machine->user_dsos, with_hits); return ret; } -static bool perf_session__read_build_ids(struct perf_session *self, bool with_hits) +static bool perf_session__read_build_ids(struct perf_session *session, bool with_hits) { struct rb_node *nd; - bool ret = machine__read_build_ids(&self->host_machine, with_hits); + bool ret = machine__read_build_ids(&session->host_machine, with_hits); - for (nd = rb_first(&self->machines); nd; nd = rb_next(nd)) { + for (nd = rb_first(&session->machines); nd; nd = rb_next(nd)) { struct machine *pos = rb_entry(nd, struct machine, rb_node); ret |= machine__read_build_ids(pos, with_hits); } @@ -347,7 +347,7 @@ static bool perf_session__read_build_ids(struct perf_session *self, bool with_hi return ret; } -static int perf_header__adds_write(struct perf_header *self, +static int perf_header__adds_write(struct perf_header *header, struct perf_evlist *evlist, int fd) { int nr_sections; @@ -357,13 +357,13 @@ static int perf_header__adds_write(struct perf_header *self, u64 sec_start; int idx = 0, err; - session = container_of(self, struct perf_session, header); + session = container_of(header, struct perf_session, header); - if (perf_header__has_feat(self, HEADER_BUILD_ID && + if (perf_header__has_feat(header, HEADER_BUILD_ID && !perf_session__read_build_ids(session, true))) - perf_header__clear_feat(self, HEADER_BUILD_ID); + perf_header__clear_feat(header, HEADER_BUILD_ID); - nr_sections = bitmap_weight(self->adds_features, HEADER_FEAT_BITS); + nr_sections = bitmap_weight(header->adds_features, HEADER_FEAT_BITS); if (!nr_sections) return 0; @@ -373,10 +373,10 @@ static int perf_header__adds_write(struct perf_header *self, sec_size = sizeof(*feat_sec) * nr_sections; - sec_start = self->data_offset + self->data_size; + sec_start = header->data_offset + header->data_size; lseek(fd, sec_start + sec_size, SEEK_SET); - if (perf_header__has_feat(self, HEADER_TRACE_INFO)) { + if (perf_header__has_feat(header, HEADER_TRACE_INFO)) { struct perf_file_section *trace_sec; trace_sec = &feat_sec[idx++]; @@ -387,14 +387,14 @@ static int perf_header__adds_write(struct perf_header *self, trace_sec->size = lseek(fd, 0, SEEK_CUR) - trace_sec->offset; } - if (perf_header__has_feat(self, HEADER_BUILD_ID)) { + if (perf_header__has_feat(header, HEADER_BUILD_ID)) { struct perf_file_section *buildid_sec; buildid_sec = &feat_sec[idx++]; /* Write build-ids */ buildid_sec->offset = lseek(fd, 0, SEEK_CUR); - err = dsos__write_buildid_table(self, fd); + err = dsos__write_buildid_table(header, fd); if (err < 0) { pr_debug("failed to write buildid table\n"); goto out_free; @@ -439,7 +439,7 @@ int perf_session__write_header(struct perf_session *session, { struct perf_file_header f_header; struct perf_file_attr f_attr; - struct perf_header *self = &session->header; + struct perf_header *header = &session->header; struct perf_evsel *attr, *pair = NULL; int err; @@ -465,7 +465,7 @@ out_err_write: } } - self->attr_offset = lseek(fd, 0, SEEK_CUR); + header->attr_offset = lseek(fd, 0, SEEK_CUR); list_for_each_entry(attr, &evlist->entries, node) { f_attr = (struct perf_file_attr){ @@ -482,20 +482,20 @@ out_err_write: } 
} - self->event_offset = lseek(fd, 0, SEEK_CUR); - self->event_size = event_count * sizeof(struct perf_trace_event_type); + header->event_offset = lseek(fd, 0, SEEK_CUR); + header->event_size = event_count * sizeof(struct perf_trace_event_type); if (events) { - err = do_write(fd, events, self->event_size); + err = do_write(fd, events, header->event_size); if (err < 0) { pr_debug("failed to write perf header events\n"); return err; } } - self->data_offset = lseek(fd, 0, SEEK_CUR); + header->data_offset = lseek(fd, 0, SEEK_CUR); if (at_exit) { - err = perf_header__adds_write(self, evlist, fd); + err = perf_header__adds_write(header, evlist, fd); if (err < 0) return err; } @@ -505,20 +505,20 @@ out_err_write: .size = sizeof(f_header), .attr_size = sizeof(f_attr), .attrs = { - .offset = self->attr_offset, + .offset = header->attr_offset, .size = evlist->nr_entries * sizeof(f_attr), }, .data = { - .offset = self->data_offset, - .size = self->data_size, + .offset = header->data_offset, + .size = header->data_size, }, .event_types = { - .offset = self->event_offset, - .size = self->event_size, + .offset = header->event_offset, + .size = header->event_size, }, }; - memcpy(&f_header.adds_features, &self->adds_features, sizeof(self->adds_features)); + memcpy(&f_header.adds_features, &header->adds_features, sizeof(header->adds_features)); lseek(fd, 0, SEEK_SET); err = do_write(fd, &f_header, sizeof(f_header)); @@ -526,26 +526,26 @@ out_err_write: pr_debug("failed to write perf header\n"); return err; } - lseek(fd, self->data_offset + self->data_size, SEEK_SET); + lseek(fd, header->data_offset + header->data_size, SEEK_SET); - self->frozen = 1; + header->frozen = 1; return 0; } -static int perf_header__getbuffer64(struct perf_header *self, +static int perf_header__getbuffer64(struct perf_header *header, int fd, void *buf, size_t size) { if (readn(fd, buf, size) <= 0) return -1; - if (self->needs_swap) + if (header->needs_swap) mem_bswap_64(buf, size); return 0; } -int perf_header__process_sections(struct perf_header *self, int fd, - int (*process)(struct perf_file_section *self, +int perf_header__process_sections(struct perf_header *header, int fd, + int (*process)(struct perf_file_section *section, struct perf_header *ph, int feat, int fd)) { @@ -555,7 +555,7 @@ int perf_header__process_sections(struct perf_header *self, int fd, int idx = 0; int err = -1, feat = 1; - nr_sections = bitmap_weight(self->adds_features, HEADER_FEAT_BITS); + nr_sections = bitmap_weight(header->adds_features, HEADER_FEAT_BITS); if (!nr_sections) return 0; @@ -565,17 +565,17 @@ int perf_header__process_sections(struct perf_header *self, int fd, sec_size = sizeof(*feat_sec) * nr_sections; - lseek(fd, self->data_offset + self->data_size, SEEK_SET); + lseek(fd, header->data_offset + header->data_size, SEEK_SET); - if (perf_header__getbuffer64(self, fd, feat_sec, sec_size)) + if (perf_header__getbuffer64(header, fd, feat_sec, sec_size)) goto out_free; err = 0; while (idx < nr_sections && feat < HEADER_LAST_FEATURE) { - if (perf_header__has_feat(self, feat)) { + if (perf_header__has_feat(header, feat)) { struct perf_file_section *sec = &feat_sec[idx++]; - err = process(sec, self, feat, fd); + err = process(sec, header, feat, fd); if (err < 0) break; } @@ -586,35 +586,35 @@ out_free: return err; } -int perf_file_header__read(struct perf_file_header *self, +int perf_file_header__read(struct perf_file_header *header, struct perf_header *ph, int fd) { lseek(fd, 0, SEEK_SET); - if (readn(fd, self, sizeof(*self)) <= 0 || - 
memcmp(&self->magic, __perf_magic, sizeof(self->magic))) + if (readn(fd, header, sizeof(*header)) <= 0 || + memcmp(&header->magic, __perf_magic, sizeof(header->magic))) return -1; - if (self->attr_size != sizeof(struct perf_file_attr)) { - u64 attr_size = bswap_64(self->attr_size); + if (header->attr_size != sizeof(struct perf_file_attr)) { + u64 attr_size = bswap_64(header->attr_size); if (attr_size != sizeof(struct perf_file_attr)) return -1; - mem_bswap_64(self, offsetof(struct perf_file_header, + mem_bswap_64(header, offsetof(struct perf_file_header, adds_features)); ph->needs_swap = true; } - if (self->size != sizeof(*self)) { + if (header->size != sizeof(*header)) { /* Support the previous format */ - if (self->size == offsetof(typeof(*self), adds_features)) - bitmap_zero(self->adds_features, HEADER_FEAT_BITS); + if (header->size == offsetof(typeof(*header), adds_features)) + bitmap_zero(header->adds_features, HEADER_FEAT_BITS); else return -1; } - memcpy(&ph->adds_features, &self->adds_features, + memcpy(&ph->adds_features, &header->adds_features, sizeof(ph->adds_features)); /* * FIXME: hack that assumes that if we need swap the perf.data file @@ -628,10 +628,10 @@ int perf_file_header__read(struct perf_file_header *self, perf_header__set_feat(ph, HEADER_BUILD_ID); } - ph->event_offset = self->event_types.offset; - ph->event_size = self->event_types.size; - ph->data_offset = self->data.offset; - ph->data_size = self->data.size; + ph->event_offset = header->event_types.offset; + ph->event_size = header->event_types.size; + ph->data_offset = header->data.offset; + ph->data_size = header->data.size; return 0; } @@ -690,11 +690,10 @@ out: return err; } -static int perf_header__read_build_ids(struct perf_header *self, - int input, u64 offset, u64 size) +static int perf_header__read_build_ids(struct perf_header *header, + int input, u64 offset, u64 size) { - struct perf_session *session = container_of(self, - struct perf_session, header); + struct perf_session *session = container_of(header, struct perf_session, header); struct build_id_event bev; char filename[PATH_MAX]; u64 limit = offset + size; @@ -706,7 +705,7 @@ static int perf_header__read_build_ids(struct perf_header *self, if (read(input, &bev, sizeof(bev)) != sizeof(bev)) goto out; - if (self->needs_swap) + if (header->needs_swap) perf_event_header__bswap(&bev.header); len = bev.header.size - sizeof(bev); @@ -722,13 +721,13 @@ out: return err; } -static int perf_file_section__process(struct perf_file_section *self, +static int perf_file_section__process(struct perf_file_section *section, struct perf_header *ph, int feat, int fd) { - if (lseek(fd, self->offset, SEEK_SET) == (off_t)-1) { + if (lseek(fd, section->offset, SEEK_SET) == (off_t)-1) { pr_debug("Failed to lseek to %" PRIu64 " offset for feature " - "%d, continuing...\n", self->offset, feat); + "%d, continuing...\n", section->offset, feat); return 0; } @@ -738,7 +737,7 @@ static int perf_file_section__process(struct perf_file_section *self, break; case HEADER_BUILD_ID: - if (perf_header__read_build_ids(ph, fd, self->offset, self->size)) + if (perf_header__read_build_ids(ph, fd, section->offset, section->size)) pr_debug("Failed to read buildids, continuing...\n"); break; default: @@ -748,21 +747,21 @@ static int perf_file_section__process(struct perf_file_section *self, return 0; } -static int perf_file_header__read_pipe(struct perf_pipe_file_header *self, +static int perf_file_header__read_pipe(struct perf_pipe_file_header *header, struct perf_header *ph, int fd, bool 
repipe) { - if (readn(fd, self, sizeof(*self)) <= 0 || - memcmp(&self->magic, __perf_magic, sizeof(self->magic))) + if (readn(fd, header, sizeof(*header)) <= 0 || + memcmp(&header->magic, __perf_magic, sizeof(header->magic))) return -1; - if (repipe && do_write(STDOUT_FILENO, self, sizeof(*self)) < 0) + if (repipe && do_write(STDOUT_FILENO, header, sizeof(*header)) < 0) return -1; - if (self->size != sizeof(*self)) { - u64 size = bswap_64(self->size); + if (header->size != sizeof(*header)) { + u64 size = bswap_64(header->size); - if (size != sizeof(*self)) + if (size != sizeof(*header)) return -1; ph->needs_swap = true; @@ -773,10 +772,10 @@ static int perf_file_header__read_pipe(struct perf_pipe_file_header *self, static int perf_header__read_pipe(struct perf_session *session, int fd) { - struct perf_header *self = &session->header; + struct perf_header *header = &session->header; struct perf_pipe_file_header f_header; - if (perf_file_header__read_pipe(&f_header, self, fd, + if (perf_file_header__read_pipe(&f_header, header, fd, session->repipe) < 0) { pr_debug("incompatible file format\n"); return -EINVAL; @@ -789,7 +788,7 @@ static int perf_header__read_pipe(struct perf_session *session, int fd) int perf_session__read_header(struct perf_session *session, int fd) { - struct perf_header *self = &session->header; + struct perf_header *header = &session->header; struct perf_file_header f_header; struct perf_file_attr f_attr; u64 f_id; @@ -802,7 +801,7 @@ int perf_session__read_header(struct perf_session *session, int fd) if (session->fd_pipe) return perf_header__read_pipe(session, fd); - if (perf_file_header__read(&f_header, self, fd) < 0) { + if (perf_file_header__read(&f_header, header, fd) < 0) { pr_debug("incompatible file format\n"); return -EINVAL; } @@ -814,7 +813,7 @@ int perf_session__read_header(struct perf_session *session, int fd) struct perf_evsel *evsel; off_t tmp; - if (perf_header__getbuffer64(self, fd, &f_attr, sizeof(f_attr))) + if (perf_header__getbuffer64(header, fd, &f_attr, sizeof(f_attr))) goto out_errno; tmp = lseek(fd, 0, SEEK_CUR); @@ -840,7 +839,7 @@ int perf_session__read_header(struct perf_session *session, int fd) lseek(fd, f_attr.ids.offset, SEEK_SET); for (j = 0; j < nr_ids; j++) { - if (perf_header__getbuffer64(self, fd, &f_id, sizeof(f_id))) + if (perf_header__getbuffer64(header, fd, &f_id, sizeof(f_id))) goto out_errno; perf_evlist__id_add(session->evlist, evsel, 0, j, f_id); @@ -854,17 +853,17 @@ int perf_session__read_header(struct perf_session *session, int fd) events = malloc(f_header.event_types.size); if (events == NULL) return -ENOMEM; - if (perf_header__getbuffer64(self, fd, events, + if (perf_header__getbuffer64(header, fd, events, f_header.event_types.size)) goto out_errno; event_count = f_header.event_types.size / sizeof(struct perf_trace_event_type); } - perf_header__process_sections(self, fd, perf_file_section__process); + perf_header__process_sections(header, fd, perf_file_section__process); - lseek(fd, self->data_offset, SEEK_SET); + lseek(fd, header->data_offset, SEEK_SET); - self->frozen = 1; + header->frozen = 1; return 0; out_errno: return -errno; diff --git a/tools/perf/util/header.h b/tools/perf/util/header.h index 4cc267559bb6..456661d7f10e 100644 --- a/tools/perf/util/header.h +++ b/tools/perf/util/header.h @@ -39,7 +39,7 @@ struct perf_pipe_file_header { struct perf_header; -int perf_file_header__read(struct perf_file_header *self, +int perf_file_header__read(struct perf_file_header *header, struct perf_header *ph, int fd); struct 
perf_header { @@ -66,12 +66,12 @@ char *perf_header__find_event(u64 id); u64 perf_evlist__sample_type(struct perf_evlist *evlist); bool perf_evlist__sample_id_all(const struct perf_evlist *evlist); -void perf_header__set_feat(struct perf_header *self, int feat); -void perf_header__clear_feat(struct perf_header *self, int feat); -bool perf_header__has_feat(const struct perf_header *self, int feat); +void perf_header__set_feat(struct perf_header *header, int feat); +void perf_header__clear_feat(struct perf_header *header, int feat); +bool perf_header__has_feat(const struct perf_header *header, int feat); -int perf_header__process_sections(struct perf_header *self, int fd, - int (*process)(struct perf_file_section *self, +int perf_header__process_sections(struct perf_header *header, int fd, + int (*process)(struct perf_file_section *section, struct perf_header *ph, int feat, int fd)); From e6e1e2593592a8f6f6380496655d8c6f67431266 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 9 Mar 2011 10:41:56 -0500 Subject: [PATCH 579/706] tracing: Remove lock_depth from event entry The lock_depth field in the event headers was added as a temporary data point for help in removing the BKL. Now that the BKL has pretty much been removed, we can remove this field. This in turn changes the header from 12 bytes to 8 bytes, removing the 4 byte buffer that gcc would insert if the first field in the data load was 8 bytes in size. Signed-off-by: Steven Rostedt --- include/linux/ftrace_event.h | 1 - kernel/trace/trace.c | 8 +++----- kernel/trace/trace_events.c | 1 - kernel/trace/trace_output.c | 10 ++-------- 4 files changed, 5 insertions(+), 15 deletions(-) diff --git a/include/linux/ftrace_event.h b/include/linux/ftrace_event.h index 1a99e7939c2b..22b32af1b5ec 100644 --- a/include/linux/ftrace_event.h +++ b/include/linux/ftrace_event.h @@ -37,7 +37,6 @@ struct trace_entry { unsigned char flags; unsigned char preempt_count; int pid; - int lock_depth; }; #define FTRACE_MAX_EVENT \ diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 85e3ee1e474e..fd6e1b906b3c 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -1101,7 +1101,6 @@ tracing_generic_entry_update(struct trace_entry *entry, unsigned long flags, entry->preempt_count = pc & 0xff; entry->pid = (tsk) ? tsk->pid : 0; - entry->lock_depth = (tsk) ? tsk->lock_depth : 0; entry->flags = #ifdef CONFIG_TRACE_IRQFLAGS_SUPPORT (irqs_disabled_flags(flags) ?
TRACE_FLAG_IRQS_OFF : 0) | @@ -1748,10 +1747,9 @@ static void print_lat_help_header(struct seq_file *m) seq_puts(m, "# | / _----=> need-resched \n"); seq_puts(m, "# || / _---=> hardirq/softirq \n"); seq_puts(m, "# ||| / _--=> preempt-depth \n"); - seq_puts(m, "# |||| /_--=> lock-depth \n"); - seq_puts(m, "# |||||/ delay \n"); - seq_puts(m, "# cmd pid |||||| time | caller \n"); - seq_puts(m, "# \\ / |||||| \\ | / \n"); + seq_puts(m, "# |||| / delay \n"); + seq_puts(m, "# cmd pid ||||| time | caller \n"); + seq_puts(m, "# \\ / ||||| \\ | / \n"); } static void print_func_help_header(struct seq_file *m) diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 5f499e0438a4..e1d579b19834 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -116,7 +116,6 @@ static int trace_define_common_fields(void) __common_field(unsigned char, flags); __common_field(unsigned char, preempt_count); __common_field(int, pid); - __common_field(int, lock_depth); return ret; } diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index 02272baa2206..151f32e5f2c6 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c @@ -529,7 +529,7 @@ seq_print_ip_sym(struct trace_seq *s, unsigned long ip, unsigned long sym_flags) * @entry: The trace entry field from the ring buffer * * Prints the generic fields of irqs off, in hard or softirq, preempt - * count and lock depth. + * count. */ int trace_print_lat_fmt(struct trace_seq *s, struct trace_entry *entry) { @@ -554,13 +554,7 @@ int trace_print_lat_fmt(struct trace_seq *s, struct trace_entry *entry) else ret = trace_seq_putc(s, '.'); - if (!ret) - return 0; - - if (entry->lock_depth < 0) - return trace_seq_putc(s, '.'); - - return trace_seq_printf(s, "%d", entry->lock_depth); + return ret; } static int From 140e4f2d1cd816aed196705c036763313c0e4bd3 Mon Sep 17 00:00:00 2001 From: David Sharp Date: Fri, 3 Dec 2010 16:13:19 -0800 Subject: [PATCH 580/706] tracing: Fix event alignment: ftrace:context_switch and ftrace:wakeup Signed-off-by: David Sharp LKML-Reference: <1291421609-14665-6-git-send-email-dhsharp@google.com> Signed-off-by: Steven Rostedt --- kernel/trace/trace_entries.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/kernel/trace/trace_entries.h b/kernel/trace/trace_entries.h index 6cf223764be8..1516cb3ec549 100644 --- a/kernel/trace/trace_entries.h +++ b/kernel/trace/trace_entries.h @@ -109,12 +109,12 @@ FTRACE_ENTRY(funcgraph_exit, ftrace_graph_ret_entry, */ #define FTRACE_CTX_FIELDS \ __field( unsigned int, prev_pid ) \ + __field( unsigned int, next_pid ) \ + __field( unsigned int, next_cpu ) \ __field( unsigned char, prev_prio ) \ __field( unsigned char, prev_state ) \ - __field( unsigned int, next_pid ) \ __field( unsigned char, next_prio ) \ - __field( unsigned char, next_state ) \ - __field( unsigned int, next_cpu ) + __field( unsigned char, next_state ) FTRACE_ENTRY(context_switch, ctx_switch_entry, From b5e3008e489f5a00c6d5db914a4c4338c9ef5e8b Mon Sep 17 00:00:00 2001 From: David Sharp Date: Fri, 3 Dec 2010 16:13:20 -0800 Subject: [PATCH 581/706] tracing: Fix event alignment: module:module_request Acked-by: Li Zefan Signed-off-by: David Sharp LKML-Reference: <1291421609-14665-7-git-send-email-dhsharp@google.com> Signed-off-by: Steven Rostedt --- include/trace/events/module.h | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/include/trace/events/module.h b/include/trace/events/module.h index c6bae36547e5..21a546d27c0c 100644 --- 
a/include/trace/events/module.h +++ b/include/trace/events/module.h @@ -108,14 +108,14 @@ TRACE_EVENT(module_request, TP_ARGS(name, wait, ip), TP_STRUCT__entry( - __field( bool, wait ) __field( unsigned long, ip ) + __field( bool, wait ) __string( name, name ) ), TP_fast_assign( - __entry->wait = wait; __entry->ip = ip; + __entry->wait = wait; __assign_str(name, name); ), @@ -129,4 +129,3 @@ TRACE_EVENT(module_request, /* This part must be outside protection */ #include - From d5bf2ff07230a4a1b73ecb22363f77c02e1d85ab Mon Sep 17 00:00:00 2001 From: David Sharp Date: Fri, 3 Dec 2010 16:13:21 -0800 Subject: [PATCH 582/706] tracing: Fix event alignment: kvm:kvm_hv_hypercall Acked-by: Avi Kivity Signed-off-by: David Sharp LKML-Reference: <1291421609-14665-8-git-send-email-dhsharp@google.com> Signed-off-by: Steven Rostedt --- arch/x86/kvm/trace.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/arch/x86/kvm/trace.h b/arch/x86/kvm/trace.h index 1357d7cf4ec8..db932760ea82 100644 --- a/arch/x86/kvm/trace.h +++ b/arch/x86/kvm/trace.h @@ -62,21 +62,21 @@ TRACE_EVENT(kvm_hv_hypercall, TP_ARGS(code, fast, rep_cnt, rep_idx, ingpa, outgpa), TP_STRUCT__entry( - __field( __u16, code ) - __field( bool, fast ) __field( __u16, rep_cnt ) __field( __u16, rep_idx ) __field( __u64, ingpa ) __field( __u64, outgpa ) + __field( __u16, code ) + __field( bool, fast ) ), TP_fast_assign( - __entry->code = code; - __entry->fast = fast; __entry->rep_cnt = rep_cnt; __entry->rep_idx = rep_idx; __entry->ingpa = ingpa; __entry->outgpa = outgpa; + __entry->code = code; + __entry->fast = fast; ), TP_printk("code 0x%x %s cnt 0x%x idx 0x%x in 0x%llx out 0x%llx", From ad440ad66f1617194738bf674dfe2d38978ac54d Mon Sep 17 00:00:00 2001 From: David Sharp Date: Fri, 3 Dec 2010 16:13:22 -0800 Subject: [PATCH 583/706] tracing: Fix event alignment: mce:mce_record Signed-off-by: David Sharp LKML-Reference: <1291421609-14665-9-git-send-email-dhsharp@google.com> Signed-off-by: Steven Rostedt --- include/trace/events/mce.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/include/trace/events/mce.h b/include/trace/events/mce.h index 7eee77895cb3..4cbbcef6baa8 100644 --- a/include/trace/events/mce.h +++ b/include/trace/events/mce.h @@ -17,36 +17,36 @@ TRACE_EVENT(mce_record, TP_STRUCT__entry( __field( u64, mcgcap ) __field( u64, mcgstatus ) - __field( u8, bank ) __field( u64, status ) __field( u64, addr ) __field( u64, misc ) __field( u64, ip ) - __field( u8, cs ) __field( u64, tsc ) __field( u64, walltime ) __field( u32, cpu ) __field( u32, cpuid ) __field( u32, apicid ) __field( u32, socketid ) + __field( u8, cs ) + __field( u8, bank ) __field( u8, cpuvendor ) ), TP_fast_assign( __entry->mcgcap = m->mcgcap; __entry->mcgstatus = m->mcgstatus; - __entry->bank = m->bank; __entry->status = m->status; __entry->addr = m->addr; __entry->misc = m->misc; __entry->ip = m->ip; - __entry->cs = m->cs; __entry->tsc = m->tsc; __entry->walltime = m->time; __entry->cpu = m->extcpu; __entry->cpuid = m->cpuid; __entry->apicid = m->apicid; __entry->socketid = m->socketid; + __entry->cs = m->cs; + __entry->bank = m->bank; __entry->cpuvendor = m->cpuvendor; ), From ca9da2dd63b0b32de1b693953dff66cadeb6400b Mon Sep 17 00:00:00 2001 From: David Sharp Date: Fri, 3 Dec 2010 16:13:23 -0800 Subject: [PATCH 584/706] tracing: Fix event alignment: skb:kfree_skb Acked-by: Neil Horman Signed-off-by: David Sharp LKML-Reference: <1291421609-14665-10-git-send-email-dhsharp@google.com> Signed-off-by: Steven Rostedt --- 
include/trace/events/skb.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/trace/events/skb.h b/include/trace/events/skb.h index f10293c41b1e..0c68ae22da22 100644 --- a/include/trace/events/skb.h +++ b/include/trace/events/skb.h @@ -19,14 +19,14 @@ TRACE_EVENT(kfree_skb, TP_STRUCT__entry( __field( void *, skbaddr ) - __field( unsigned short, protocol ) __field( void *, location ) + __field( unsigned short, protocol ) ), TP_fast_assign( __entry->skbaddr = skb; - __entry->protocol = ntohs(skb->protocol); __entry->location = location; + __entry->protocol = ntohs(skb->protocol); ), TP_printk("skbaddr=%p protocol=%u location=%p", From 10da37a645b5e915d8572cc2b1f5eb11ada3ea4f Mon Sep 17 00:00:00 2001 From: David Sharp Date: Fri, 3 Dec 2010 16:13:26 -0800 Subject: [PATCH 585/706] tracing: Adjust conditional expression latency formatting. Formatting change only to improve code readability. No code changes except to introduce intermediate variables. Signed-off-by: David Sharp LKML-Reference: <1291421609-14665-13-git-send-email-dhsharp@google.com> [ Keep variable declarations and assignment separate ] Signed-off-by: Steven Rostedt --- kernel/trace/trace_output.c | 26 ++++++++++++++++++-------- 1 file changed, 18 insertions(+), 8 deletions(-) diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index 151f32e5f2c6..456be9063c2d 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c @@ -533,20 +533,30 @@ seq_print_ip_sym(struct trace_seq *s, unsigned long ip, unsigned long sym_flags) */ int trace_print_lat_fmt(struct trace_seq *s, struct trace_entry *entry) { - int hardirq, softirq; + char hardsoft_irq; + char need_resched; + char irqs_off; + int hardirq; + int softirq; int ret; hardirq = entry->flags & TRACE_FLAG_HARDIRQ; softirq = entry->flags & TRACE_FLAG_SOFTIRQ; + irqs_off = + (entry->flags & TRACE_FLAG_IRQS_OFF) ? 'd' : + (entry->flags & TRACE_FLAG_IRQS_NOSUPPORT) ? 'X' : + '.'; + need_resched = + (entry->flags & TRACE_FLAG_NEED_RESCHED) ? 'N' : '.'; + hardsoft_irq = + (hardirq && softirq) ? 'H' : + hardirq ? 'h' : + softirq ? 's' : + '.'; + if (!trace_seq_printf(s, "%c%c%c", - (entry->flags & TRACE_FLAG_IRQS_OFF) ? 'd' : - (entry->flags & TRACE_FLAG_IRQS_NOSUPPORT) ? - 'X' : '.', - (entry->flags & TRACE_FLAG_NEED_RESCHED) ? - 'N' : '.', - (hardirq && softirq) ? 'H' : - hardirq ? 'h' : softirq ? 's' : '.')) + irqs_off, need_resched, hardsoft_irq)) return 0; if (entry->preempt_count) From 1274a9c2e91652e28efa45c3e5886ec82f08bfbe Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Fri, 11 Feb 2011 16:43:33 -0500 Subject: [PATCH 586/706] ftrace: Add .ref.text as one of the safe areas to trace The section .ref.text will not go away unexpectedly and is safe to trace. Add it to the safe list of sections to allow tracing. 
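For background, kernel code normally ends up in .ref.text via the __ref annotation. A minimal sketch of how a function lands in that section (the function name is made up for illustration):

#include <linux/init.h>

/*
 * __ref places the function in .ref.text: it may reference __init
 * code or data, with the author vouching that such references are
 * only followed while init memory is still present. Because
 * .ref.text is never discarded, it is safe for mcount recording.
 */
static int __ref example_bringup(void)
{
	return 0;
}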
Signed-off-by: Steven Rostedt --- scripts/recordmcount.c | 3 ++- scripts/recordmcount.pl | 1 + 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/scripts/recordmcount.c b/scripts/recordmcount.c index 038b3d1e2981..f9f6f52db772 100644 --- a/scripts/recordmcount.c +++ b/scripts/recordmcount.c @@ -206,7 +206,8 @@ static uint32_t (*w2)(uint16_t); static int is_mcounted_section_name(char const *const txtname) { - return 0 == strcmp(".text", txtname) || + return 0 == strcmp(".text", txtname) || + 0 == strcmp(".ref.text", txtname) || 0 == strcmp(".sched.text", txtname) || 0 == strcmp(".spinlock.text", txtname) || 0 == strcmp(".irqentry.text", txtname) || diff --git a/scripts/recordmcount.pl b/scripts/recordmcount.pl index 1d7963f4ee79..4be0deea71ca 100755 --- a/scripts/recordmcount.pl +++ b/scripts/recordmcount.pl @@ -130,6 +130,7 @@ if ($inputfile =~ m,kernel/trace/ftrace\.o$,) { # Acceptable sections to record. my %text_sections = ( ".text" => 1, + ".ref.text" => 1, ".sched.text" => 1, ".spinlock.text" => 1, ".irqentry.text" => 1, From 722b3c74695377d11d18a52f3da08114d37f3f37 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Fri, 11 Feb 2011 20:36:02 -0500 Subject: [PATCH 587/706] ftrace/graph: Trace function entry before updating index Currently the index to the ret_stack is updated and the real return address is saved in the ret_stack. Then we call the trace function. The trace function could decide that it doesn't want to trace this function (e.g. set_graph_function does not match) and it will return 0, which means not to trace this call. The normal function graph tracer has this code: if (!(trace->depth || ftrace_graph_addr(trace->func)) || ftrace_graph_ignore_irqs()) return 0; What this states is: if the trace depth (which is curr_ret_stack) is zero (top of nested functions), then test whether we want to trace this function. If this function is not to be traced, then return 0 and the rest of the function graph tracer logic will not trace this function. The problem arises when an interrupt comes in after we updated the curr_ret_stack. The next function that gets called will have a trace->depth of 1, which fools this trace code into thinking that we are in a nested function and that we should trace. This causes interrupts to be traced when they should not be. The solution is to trace the function first and then update the ret_stack.
Reported-by: zhiping zhong Reported-by: wu zhangjin Signed-off-by: Steven Rostedt --- arch/x86/kernel/ftrace.c | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c index 382eb2936d4d..a93742a57468 100644 --- a/arch/x86/kernel/ftrace.c +++ b/arch/x86/kernel/ftrace.c @@ -437,18 +437,19 @@ void prepare_ftrace_return(unsigned long *parent, unsigned long self_addr, return; } + trace.func = self_addr; + trace.depth = current->curr_ret_stack + 1; + + /* Only trace if the calling function expects to */ + if (!ftrace_graph_entry(&trace)) { + *parent = old; + return; + } + if (ftrace_push_return_trace(old, self_addr, &trace.depth, frame_pointer) == -EBUSY) { *parent = old; return; } - - trace.func = self_addr; - - /* Only trace if the calling function expects to */ - if (!ftrace_graph_entry(&trace)) { - current->curr_ret_stack--; - *parent = old; - } } #endif /* CONFIG_FUNCTION_GRAPH_TRACER */ From 31274d72f01604f4b02d933b4f3cac84d2c201fd Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Fri, 18 Feb 2011 15:52:19 +0100 Subject: [PATCH 588/706] tracing: Explain about unstable clock on resume with ring buffer warning The "Delta way too big" warning might appear on a system with an unstable sched clock right after the system is resumed, if tracing was enabled at the time of suspend. Since it's not really a bug, and the unstable sched clock works fast and reliably otherwise, Steven suggested to keep using the sched clock in any case and just to make a note in the warning itself. v2 changes: - added #ifdef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK Signed-off-by: Jiri Olsa LKML-Reference: <20110218145219.GD2604@jolsa.brq.redhat.com> Signed-off-by: Steven Rostedt --- kernel/trace/ring_buffer.c | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 3237d961d905..db7b439d23ee 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -2172,11 +2172,19 @@ rb_reserve_next_event(struct ring_buffer *buffer, if (likely(ts >= cpu_buffer->write_stamp)) { delta = diff; if (unlikely(test_time_stamp(delta))) { + int local_clock_stable = 1; +#ifdef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK + local_clock_stable = sched_clock_stable; +#endif WARN_ONCE(delta > (1ULL << 59), - KERN_WARNING "Delta way too big! %llu ts=%llu write stamp = %llu\n", + KERN_WARNING "Delta way too big! %llu ts=%llu write stamp = %llu\n%s", (unsigned long long)delta, (unsigned long long)ts, - (unsigned long long)cpu_buffer->write_stamp); + (unsigned long long)cpu_buffer->write_stamp, + local_clock_stable ? "" : + "If you just came from a suspend/resume,\n" + "please switch to the trace global clock:\n" + " echo global > /sys/kernel/debug/tracing/trace_clock\n"); add_timestamp = 1; } } From 56355b83e2a24ce7e1870c8479205e2cdd332225 Mon Sep 17 00:00:00 2001 From: Yuanhan Liu Date: Mon, 8 Nov 2010 14:05:12 +0800 Subject: [PATCH 589/706] tracing: Export trace_set_clr_event() Trace events belonging to a module only exist when the module is loaded. Well, we can use the trace_set_clr_event function to enable some trace events in the module init routine, so that we will not miss anything while loading the module. So, export the trace_set_clr_event function so that modules can use it.
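For illustration, a module could then enable one of its own events from its init routine along these lines. This is only a sketch: the "sample" system, the "sample_event" event and the header providing the trace_set_clr_event() declaration are assumptions made for the example, not part of this patch.

#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/ftrace_event.h>	/* assumed location of trace_set_clr_event() */

static int __init sample_init(void)
{
	/*
	 * Roughly equivalent to:
	 *   echo sample:sample_event > /sys/kernel/debug/tracing/set_event
	 * but done early enough that no events are missed while the
	 * module loads.
	 */
	int ret = trace_set_clr_event("sample", "sample_event", 1);

	if (ret)
		pr_warn("sample: could not enable sample_event (%d)\n", ret);

	return 0;
}
module_init(sample_init);

MODULE_LICENSE("GPL");	/* required: the symbol is exported GPL-only */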
Signed-off-by: Yuanhan Liu LKML-Reference: <1289196312-25323-1-git-send-email-yuanhan.liu@linux.intel.com> Cc: Steven Rostedt Cc: Frederic Weisbecker Cc: Ingo Molnar Signed-off-by: Steven Rostedt --- kernel/trace/trace_events.c | 1 + 1 file changed, 1 insertion(+) diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index e1d579b19834..e88f74fe1d4c 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -325,6 +325,7 @@ int trace_set_clr_event(const char *system, const char *event, int set) { return __ftrace_set_clr_event(NULL, system, event, set); } +EXPORT_SYMBOL_GPL(trace_set_clr_event); /* 128 should be much more than enough */ #define EVENT_BUF_SIZE 127 From 9a24470b2826e4665b1484836c7ae6aba1ddea32 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 9 Mar 2011 14:53:38 -0500 Subject: [PATCH 590/706] tracing: Align 4 byte ints together in struct tracer Move elements in struct tracer for better alignment. Signed-off-by: Steven Rostedt --- kernel/trace/trace.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 951d0b7e7062..5e9dfc6286dd 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -272,8 +272,8 @@ struct tracer { /* If you handled the flag setting, return 0 */ int (*set_flag)(u32 old_flags, u32 bit, int set); struct tracer *next; - int print_max; struct tracer_flags *flags; + int print_max; int use_max_tr; }; From 4a0b1665db09cf2da9ad7d0f12da386373c10bfa Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 9 Mar 2011 20:09:26 -0500 Subject: [PATCH 591/706] tracing: Fix irqoff selftest expanding max buffer If the kernel command line declares a tracer "ftrace=sometracer" and that tracer is either not defined or is enabled after irqsoff, then the irqs off selftest will fail with the following error: Testing tracer irqsoff: ------------[ cut here ]------------ WARNING: at /home/rostedt/work/autotest/nobackup/linux-test.git/kernel/trace/tra ce.c:713 update_max_tr_single+0xfa/0x11b() Hardware name: Modules linked in: Pid: 1, comm: swapper Not tainted 2.6.38-rc8-test #1 Call Trace: [] ? warn_slowpath_common+0x65/0x7a [] ? update_max_tr_single+0xfa/0x11b [] ? warn_slowpath_null+0xf/0x13 [] ? update_max_tr_single+0xfa/0x11b [] ? stop_critical_timing+0x154/0x204 [] ? trace_selftest_startup_irqsoff+0x5b/0xc1 [] ? trace_selftest_startup_irqsoff+0x5b/0xc1 [] ? trace_selftest_startup_irqsoff+0x5b/0xc1 [] ? time_hardirqs_on+0x25/0x28 [] ? trace_hardirqs_on_caller+0x18/0x12f [] ? trace_hardirqs_on+0xb/0xd [] ? trace_selftest_startup_irqsoff+0x5b/0xc1 [] ? register_tracer+0xf8/0x1a3 [] ? init_irqsoff_tracer+0xd/0x11 [] ? do_one_initcall+0x71/0x121 [] ? init_irqsoff_tracer+0x0/0x11 [] ? kernel_init+0x13a/0x1b6 [] ? kernel_init+0x0/0x1b6 [] ? kernel_thread_helper+0x6/0x10 ---[ end trace e93713a9d40cd06c ]--- .. no entries found ..FAILED! What happens is the "ftrace=..." will expand the ring buffer to its default size (from its minimum size) but it will not expand the max ring buffer (the ring buffer to store maximum latencies). When the irqsoff test runs, it will call the ring buffer swap routine that checks if the max ring buffer is the same size as the normal ring buffer, and will fail if it is not. This causes the test to fail. The solution is to expand the max ring buffer before running the self test if the max ring buffer is used by that tracer and the normal ring buffer is expanded. The max ring buffer should be shrunk again after the test is done to save space. 
Signed-off-by: Steven Rostedt --- kernel/trace/trace.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index fd6e1b906b3c..9541c27c1cf2 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -779,6 +779,11 @@ __acquires(kernel_lock) tracing_reset_online_cpus(tr); current_trace = type; + + /* If we expanded the buffers, make sure the max is expanded too */ + if (ring_buffer_expanded && type->use_max_tr) + ring_buffer_resize(max_tr.buffer, trace_buf_size); + /* the test is responsible for initializing and enabling */ pr_info("Testing tracer %s: ", type->name); ret = type->selftest(type, tr); @@ -791,6 +796,10 @@ __acquires(kernel_lock) /* Only reset on passing, to avoid touching corrupted buffers */ tracing_reset_online_cpus(tr); + /* Shrink the max buffer again */ + if (ring_buffer_expanded && type->use_max_tr) + ring_buffer_resize(max_tr.buffer, 1); + printk(KERN_CONT "PASSED\n"); } #endif From 53370d2e8c0382e3e2aa76def93365ed674e7fc7 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 10 Mar 2011 18:26:33 +0100 Subject: [PATCH 592/706] hrtimer: Update hrtimer->state documentation We changed some of the state bits and combinations thereof over time, but never updated the documentation. Signed-off-by: Thomas Gleixner --- include/linux/hrtimer.h | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h index 6bc1804bfbfa..62f500c724f9 100644 --- a/include/linux/hrtimer.h +++ b/include/linux/hrtimer.h @@ -54,11 +54,13 @@ enum hrtimer_restart { * 0x00 inactive * 0x01 enqueued into rbtree * 0x02 callback function running + * 0x04 timer is migrated to another cpu * * Special cases: * 0x03 callback function running and enqueued * (was requeued on another CPU) - * 0x09 timer was migrated on CPU hotunplug + * 0x05 timer was migrated on CPU hotunplug + * * The "callback function running and enqueued" status is only possible on * SMP. It happens for example when a posix timer expired and the callback * queued a signal. Between dropping the lock which protects the posix timer @@ -67,8 +69,11 @@ enum hrtimer_restart { * as otherwise the timer could be removed before the softirq code finishes the * the handling of the timer. * - * The HRTIMER_STATE_ENQUEUED bit is always or'ed to the current state to - * preserve the HRTIMER_STATE_CALLBACK bit in the above scenario. + * The HRTIMER_STATE_ENQUEUED bit is always or'ed to the current state + * to preserve the HRTIMER_STATE_CALLBACK in the above scenario. This + * also affects HRTIMER_STATE_MIGRATE where the preservation is not + * necessary. HRTIMER_STATE_MIGRATE is cleared after the timer is + * enqueued on the new cpu. * * All state transitions are protected by cpu_base->lock. */ @@ -376,8 +381,9 @@ extern int hrtimer_get_res(const clockid_t which_clock, struct timespec *tp); extern ktime_t hrtimer_get_next_event(void); /* - * A timer is active, when it is enqueued into the rbtree or the callback - * function is running. + * A timer is active, when it is enqueued into the rbtree or the + * callback function is running or it's in the state of being migrated + * to another cpu. */ static inline int hrtimer_active(const struct hrtimer *timer) { From a9e7acfff0a279792918b7b0de74106e576e9988 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 10 Mar 2011 19:12:24 +0100 Subject: [PATCH 593/706] hrtimer: Remove empty hrtimer_init_hres_timer() Leftover from earlier implementation. All empty, remove it. 
Signed-off-by: Thomas Gleixner --- kernel/hrtimer.c | 10 ---------- 1 file changed, 10 deletions(-) diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c index f679a2d08723..63fb74972b75 100644 --- a/kernel/hrtimer.c +++ b/kernel/hrtimer.c @@ -680,14 +680,6 @@ static inline void hrtimer_init_hres(struct hrtimer_cpu_base *base) base->hres_active = 0; } -/* - * Initialize the high resolution related parts of a hrtimer - */ -static inline void hrtimer_init_timer_hres(struct hrtimer *timer) -{ -} - - /* * When High resolution timers are active, try to reprogram. Note, that in case * the state has HRTIMER_STATE_CALLBACK set, no reprogramming and no expiry @@ -759,7 +751,6 @@ static inline int hrtimer_enqueue_reprogram(struct hrtimer *timer, return 0; } static inline void hrtimer_init_hres(struct hrtimer_cpu_base *base) { } -static inline void hrtimer_init_timer_hres(struct hrtimer *timer) { } #endif /* CONFIG_HIGH_RES_TIMERS */ @@ -1141,7 +1132,6 @@ static void __hrtimer_init(struct hrtimer *timer, clockid_t clock_id, base = hrtimer_clockid_to_base(clock_id); timer->base = &cpu_base->clock_base[base]; - hrtimer_init_timer_hres(timer); timerqueue_init(&timer->node); #ifdef CONFIG_TIMER_STATS From 8fe8f545c6d753ead15e1f4919d39e8f9bb49629 Mon Sep 17 00:00:00 2001 From: Michel Lespinasse Date: Sun, 6 Mar 2011 18:07:50 -0800 Subject: [PATCH 594/706] futex: Update futex_wait_setup comments about locking Reviving a cleanup I had done about a year ago as part of a larger futex_set_wait proposal. Over the years, the locking of the hashed futex queue got improved, so that some of the "rare but normal" race conditions described in comments can't actually happen anymore. Signed-off-by: Michel Lespinasse Cc: Linus Torvalds Cc: Darren Hart Cc: Peter Zijlstra LKML-Reference: <20110307020750.GA31188@google.com> Signed-off-by: Thomas Gleixner --- kernel/futex.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/kernel/futex.c b/kernel/futex.c index b766d28accd6..3184d3b9cadf 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -1781,13 +1781,14 @@ static int futex_wait_setup(u32 __user *uaddr, u32 val, unsigned int flags, * * The basic logical guarantee of a futex is that it blocks ONLY * if cond(var) is known to be true at the time of blocking, for - * any cond. If we queued after testing *uaddr, that would open - * a race condition where we could block indefinitely with + * any cond. If we locked the hash-bucket after testing *uaddr, that + * would open a race condition where we could block indefinitely with * cond(var) false, which would violate the guarantee. * - * A consequence is that futex_wait() can return zero and absorb - * a wakeup when *uaddr != val on entry to the syscall. This is - * rare, but normal. + * On the other hand, we insert q and release the hash-bucket only + * after testing *uaddr. This guarantees that futex_wait() will NOT + * absorb a wakeup if *uaddr does not match the desired values + * while the syscall executes. 
*/ retry: ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &q->key); From 260a7d4cfd26d8bad8ac3a7fce11de47491d7e00 Mon Sep 17 00:00:00 2001 From: Ian Campbell Date: Fri, 18 Feb 2011 16:43:26 +0000 Subject: [PATCH 595/706] xen: pci: only define xen_initdom_setup_msi_irqs if CONFIG_XEN_DOM0 Fixes: CC arch/x86/pci/xen.o arch/x86/pci/xen.c:183: warning: 'xen_initdom_setup_msi_irqs' defined but not used Signed-off-by: Ian Campbell Signed-off-by: Konrad Rzeszutek Wilk --- arch/x86/pci/xen.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/x86/pci/xen.c b/arch/x86/pci/xen.c index 8634e1b49c03..47c4688dcd48 100644 --- a/arch/x86/pci/xen.c +++ b/arch/x86/pci/xen.c @@ -203,6 +203,7 @@ static void xen_teardown_msi_irq(unsigned int irq) xen_destroy_irq(irq); } +#ifdef CONFIG_XEN_DOM0 static int xen_initdom_setup_msi_irqs(struct pci_dev *dev, int nvec, int type) { int irq, ret; @@ -224,6 +225,7 @@ error: return ret; } #endif +#endif static int xen_pcifront_enable_irq(struct pci_dev *dev) { From ae1635b05fae30804061406010914d85d12431ac Mon Sep 17 00:00:00 2001 From: Ian Campbell Date: Fri, 18 Feb 2011 16:43:27 +0000 Subject: [PATCH 596/706] xen: events: do not leak IRQ from xen_allocate_pirq_msi when no pirq available. Cc: Jeremy Fitzhardinge Cc: xen-devel@lists.xensource.com Signed-off-by: Ian Campbell Signed-off-by: Konrad Rzeszutek Wilk --- drivers/xen/events.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/drivers/xen/events.c b/drivers/xen/events.c index 89987a7bf26f..bce303590075 100644 --- a/drivers/xen/events.c +++ b/drivers/xen/events.c @@ -676,8 +676,11 @@ void xen_allocate_pirq_msi(char *name, int *irq, int *pirq, int alloc) if (alloc & XEN_ALLOC_PIRQ) { *pirq = find_unbound_pirq(MAP_PIRQ_TYPE_MSI); - if (*pirq == -1) + if (*pirq == -1) { + xen_free_irq(*irq); + *irq = -1; goto out; + } } set_irq_chip_and_handler_name(*irq, &xen_pirq_chip, From bb5d079aefa828c292c267ed34ed2282947fa233 Mon Sep 17 00:00:00 2001 From: Ian Campbell Date: Fri, 18 Feb 2011 16:43:28 +0000 Subject: [PATCH 597/706] xen: events: drop XEN_ALLOC_IRQ flag to xen_allocate_pirq_msi All callers pass this flag so it is pointless. Signed-off-by: Ian Campbell Signed-off-by: Konrad Rzeszutek Wilk --- arch/x86/pci/xen.c | 6 +++--- drivers/xen/events.c | 12 +++++------- include/xen/events.h | 5 +---- 3 files changed, 9 insertions(+), 14 deletions(-) diff --git a/arch/x86/pci/xen.c b/arch/x86/pci/xen.c index 47c4688dcd48..ca5fa09ca56d 100644 --- a/arch/x86/pci/xen.c +++ b/arch/x86/pci/xen.c @@ -101,7 +101,7 @@ static int xen_hvm_setup_msi_irqs(struct pci_dev *dev, int nvec, int type) ((msg.address_lo >> MSI_ADDR_DEST_ID_SHIFT) & 0xff); if (xen_irq_from_pirq(pirq) >= 0 && msg.data == XEN_PIRQ_MSI_DATA) { xen_allocate_pirq_msi((type == PCI_CAP_ID_MSIX) ? - "msi-x" : "msi", &irq, &pirq, XEN_ALLOC_IRQ); + "msi-x" : "msi", &irq, &pirq, 0); if (irq < 0) goto error; ret = set_irq_msi(irq, msidesc); @@ -112,7 +112,7 @@ static int xen_hvm_setup_msi_irqs(struct pci_dev *dev, int nvec, int type) return 0; } xen_allocate_pirq_msi((type == PCI_CAP_ID_MSIX) ? - "msi-x" : "msi", &irq, &pirq, (XEN_ALLOC_IRQ | XEN_ALLOC_PIRQ)); + "msi-x" : "msi", &irq, &pirq, 1); if (irq < 0 || pirq < 0) goto error; printk(KERN_DEBUG "xen: msi --> irq=%d, pirq=%d\n", irq, pirq); @@ -160,7 +160,7 @@ static int xen_setup_msi_irqs(struct pci_dev *dev, int nvec, int type) xen_allocate_pirq_msi( (type == PCI_CAP_ID_MSIX) ? 
"pcifront-msi-x" : "pcifront-msi", - &irq, &v[i], XEN_ALLOC_IRQ); + &irq, &v[i], 0); if (irq < 0) { ret = -1; goto free; diff --git a/drivers/xen/events.c b/drivers/xen/events.c index bce303590075..36e9adcdebe9 100644 --- a/drivers/xen/events.c +++ b/drivers/xen/events.c @@ -664,17 +664,15 @@ static int find_unbound_pirq(int type) return -1; } -void xen_allocate_pirq_msi(char *name, int *irq, int *pirq, int alloc) +void xen_allocate_pirq_msi(char *name, int *irq, int *pirq, int alloc_pirq) { spin_lock(&irq_mapping_update_lock); - if (alloc & XEN_ALLOC_IRQ) { - *irq = xen_allocate_irq_dynamic(); - if (*irq == -1) - goto out; - } + *irq = xen_allocate_irq_dynamic(); + if (*irq == -1) + goto out; - if (alloc & XEN_ALLOC_PIRQ) { + if (alloc_pirq) { *pirq = find_unbound_pirq(MAP_PIRQ_TYPE_MSI); if (*pirq == -1) { xen_free_irq(*irq); diff --git a/include/xen/events.h b/include/xen/events.h index 00f53ddcc062..8d98861e4d92 100644 --- a/include/xen/events.h +++ b/include/xen/events.h @@ -75,10 +75,7 @@ int xen_allocate_pirq(unsigned gsi, int shareable, char *name); int xen_map_pirq_gsi(unsigned pirq, unsigned gsi, int shareable, char *name); #ifdef CONFIG_PCI_MSI -/* Allocate an irq and a pirq to be used with MSIs. */ -#define XEN_ALLOC_PIRQ (1 << 0) -#define XEN_ALLOC_IRQ (1 << 1) -void xen_allocate_pirq_msi(char *name, int *irq, int *pirq, int alloc_mask); +void xen_allocate_pirq_msi(char *name, int *irq, int *pirq, int alloc_pirq); int xen_create_msi_irq(struct pci_dev *dev, struct msi_desc *msidesc, int type); #endif From 4b41df7f6e0b5684378d9155773c42a4577e8582 Mon Sep 17 00:00:00 2001 From: Ian Campbell Date: Fri, 18 Feb 2011 16:43:29 +0000 Subject: [PATCH 598/706] xen: events: return irq from xen_allocate_pirq_msi consistent with other similar functions. Signed-off-by: Ian Campbell Signed-off-by: Konrad Rzeszutek Wilk --- arch/x86/pci/xen.c | 12 ++++++------ drivers/xen/events.c | 19 +++++++++++-------- include/xen/events.h | 2 +- 3 files changed, 18 insertions(+), 15 deletions(-) diff --git a/arch/x86/pci/xen.c b/arch/x86/pci/xen.c index ca5fa09ca56d..6fd695b06faa 100644 --- a/arch/x86/pci/xen.c +++ b/arch/x86/pci/xen.c @@ -100,8 +100,8 @@ static int xen_hvm_setup_msi_irqs(struct pci_dev *dev, int nvec, int type) pirq = MSI_ADDR_EXT_DEST_ID(msg.address_hi) | ((msg.address_lo >> MSI_ADDR_DEST_ID_SHIFT) & 0xff); if (xen_irq_from_pirq(pirq) >= 0 && msg.data == XEN_PIRQ_MSI_DATA) { - xen_allocate_pirq_msi((type == PCI_CAP_ID_MSIX) ? - "msi-x" : "msi", &irq, &pirq, 0); + irq = xen_allocate_pirq_msi((type == PCI_CAP_ID_MSIX) ? + "msi-x" : "msi", &pirq, 0); if (irq < 0) goto error; ret = set_irq_msi(irq, msidesc); @@ -111,8 +111,8 @@ static int xen_hvm_setup_msi_irqs(struct pci_dev *dev, int nvec, int type) " pirq=%d\n", irq, pirq); return 0; } - xen_allocate_pirq_msi((type == PCI_CAP_ID_MSIX) ? - "msi-x" : "msi", &irq, &pirq, 1); + irq = xen_allocate_pirq_msi((type == PCI_CAP_ID_MSIX) ? + "msi-x" : "msi", &pirq, 1); if (irq < 0 || pirq < 0) goto error; printk(KERN_DEBUG "xen: msi --> irq=%d, pirq=%d\n", irq, pirq); @@ -157,10 +157,10 @@ static int xen_setup_msi_irqs(struct pci_dev *dev, int nvec, int type) goto error; i = 0; list_for_each_entry(msidesc, &dev->msi_list, list) { - xen_allocate_pirq_msi( + irq = xen_allocate_pirq_msi( (type == PCI_CAP_ID_MSIX) ? 
"pcifront-msi-x" : "pcifront-msi", - &irq, &v[i], 0); + &v[i], 0); if (irq < 0) { ret = -1; goto free; diff --git a/drivers/xen/events.c b/drivers/xen/events.c index 36e9adcdebe9..ed3420df0937 100644 --- a/drivers/xen/events.c +++ b/drivers/xen/events.c @@ -664,31 +664,34 @@ static int find_unbound_pirq(int type) return -1; } -void xen_allocate_pirq_msi(char *name, int *irq, int *pirq, int alloc_pirq) +int xen_allocate_pirq_msi(char *name, int *pirq, int alloc_pirq) { + int irq; + spin_lock(&irq_mapping_update_lock); - *irq = xen_allocate_irq_dynamic(); - if (*irq == -1) + irq = xen_allocate_irq_dynamic(); + if (irq == -1) goto out; if (alloc_pirq) { *pirq = find_unbound_pirq(MAP_PIRQ_TYPE_MSI); if (*pirq == -1) { - xen_free_irq(*irq); - *irq = -1; + xen_free_irq(irq); + irq = -1; goto out; } } - set_irq_chip_and_handler_name(*irq, &xen_pirq_chip, + set_irq_chip_and_handler_name(irq, &xen_pirq_chip, handle_level_irq, name); - irq_info[*irq] = mk_pirq_info(0, *pirq, 0, 0); - pirq_to_irq[*pirq] = *irq; + irq_info[irq] = mk_pirq_info(0, *pirq, 0, 0); + pirq_to_irq[*pirq] = irq; out: spin_unlock(&irq_mapping_update_lock); + return irq; } int xen_create_msi_irq(struct pci_dev *dev, struct msi_desc *msidesc, int type) diff --git a/include/xen/events.h b/include/xen/events.h index 8d98861e4d92..f70536af921c 100644 --- a/include/xen/events.h +++ b/include/xen/events.h @@ -75,7 +75,7 @@ int xen_allocate_pirq(unsigned gsi, int shareable, char *name); int xen_map_pirq_gsi(unsigned pirq, unsigned gsi, int shareable, char *name); #ifdef CONFIG_PCI_MSI -void xen_allocate_pirq_msi(char *name, int *irq, int *pirq, int alloc_pirq); +int xen_allocate_pirq_msi(char *name, int *pirq, int alloc_pirq); int xen_create_msi_irq(struct pci_dev *dev, struct msi_desc *msidesc, int type); #endif From 9a626612c2010699d9909a4c3141d3a38660f3b3 Mon Sep 17 00:00:00 2001 From: Ian Campbell Date: Fri, 18 Feb 2011 16:43:30 +0000 Subject: [PATCH 599/706] xen: pci: collapse apic_register_gsi_xen_hvm and xen_hvm_register_pirq apic_register_gsi_xen_hvm is a tiny wrapper around xen_hvm_register_pirq. Signed-off-by: Ian Campbell Signed-off-by: Konrad Rzeszutek Wilk --- arch/x86/pci/xen.c | 11 +++-------- 1 file changed, 3 insertions(+), 8 deletions(-) diff --git a/arch/x86/pci/xen.c b/arch/x86/pci/xen.c index 6fd695b06faa..0d5087eeced8 100644 --- a/arch/x86/pci/xen.c +++ b/arch/x86/pci/xen.c @@ -20,7 +20,8 @@ #include #ifdef CONFIG_ACPI -static int xen_hvm_register_pirq(u32 gsi, int triggering) +static int acpi_register_gsi_xen_hvm(struct device *dev, u32 gsi, + int trigger, int polarity) { int rc, irq; struct physdev_map_pirq map_irq; @@ -41,7 +42,7 @@ static int xen_hvm_register_pirq(u32 gsi, int triggering) return -1; } - if (triggering == ACPI_EDGE_SENSITIVE) { + if (trigger == ACPI_EDGE_SENSITIVE) { shareable = 0; name = "ioapic-edge"; } else { @@ -55,12 +56,6 @@ static int xen_hvm_register_pirq(u32 gsi, int triggering) return irq; } - -static int acpi_register_gsi_xen_hvm(struct device *dev, u32 gsi, - int trigger, int polarity) -{ - return xen_hvm_register_pirq(gsi, trigger); -} #endif #if defined(CONFIG_PCI_MSI) From 5cad61a6ba6f4956a218ffbb64cafcc1daefaca0 Mon Sep 17 00:00:00 2001 From: Ian Campbell Date: Fri, 18 Feb 2011 16:43:31 +0000 Subject: [PATCH 600/706] xen: events: assume PHYSDEVOP_get_free_pirq exists The find_unbound_pirq is called only from xen_allocate_pirq_msi and only if alloc_pirq is true. The only caller which does this is xen_hvm_setup_msi_irqs. 
The use of this function is gated, in pci_xen_hvm_init, on XENFEAT_hvm_pirqs. The PHYSDEVOP_get_free_pirq interface was added to the hypervisor in 22410:be96f6058c05 while XENFEAT_hvm_pirqs was added a couple of minutes prior in 22409:6663214f06ac. Therefore we do not need to concern ourselves with hypervisors which support XENFEAT_hvm_pirqs but not PHYSDEVOP_get_free_pirq. This eliminates the fallback path in find_unbound_pirq which walks the pirq_to_irq array looking for a free pirq. Unlike the PHYSDEVOP_get_free_pirq interface, this fallback only looks up a free pirq but does not reserve it. Removing this fallback will simplify locking in the future. Signed-off-by: Ian Campbell Signed-off-by: Konrad Rzeszutek Wilk --- drivers/xen/events.c | 15 ++++++--------- 1 file changed, 6 insertions(+), 9 deletions(-) diff --git a/drivers/xen/events.c b/drivers/xen/events.c index ed3420df0937..c21066fc30be 100644 --- a/drivers/xen/events.c +++ b/drivers/xen/events.c @@ -649,19 +649,16 @@ out: static int find_unbound_pirq(int type) { - int rc, i; + int rc; struct physdev_get_free_pirq op_get_free_pirq; + op_get_free_pirq.type = type; rc = HYPERVISOR_physdev_op(PHYSDEVOP_get_free_pirq, &op_get_free_pirq); - if (!rc) - return op_get_free_pirq.pirq; - for (i = 0; i < nr_irqs; i++) { - if (pirq_to_irq[i] < 0) - return i; - } - return -1; + WARN_ONCE(rc == -ENOSYS, + "hypervisor does not support the PHYSDEVOP_get_free_pirq interface\n"); + + return rc ? -1 : op_get_free_pirq.pirq; } int xen_allocate_pirq_msi(char *name, int *pirq, int alloc_pirq) From bf480d952bcf25e8ff7e95d2a23964107513ac51 Mon Sep 17 00:00:00 2001 From: Ian Campbell Date: Fri, 18 Feb 2011 16:43:32 +0000 Subject: [PATCH 601/706] xen: events: separate MSI PIRQ allocation from PIRQ binding to IRQ Split the binding aspect of xen_allocate_pirq_msi out into a new xen_bind_pirq_msi_to_irq function. In xen_hvm_setup_msi_irqs, when allocating a pirq, write the MSI message to signal the PIRQ as soon as the pirq is obtained. There is no way to free the pirq back, so if the subsequent binding to an IRQ fails we want to ensure that we will reuse the PIRQ next time rather than leak it. Signed-off-by: Ian Campbell Signed-off-by: Konrad Rzeszutek Wilk --- arch/x86/pci/xen.c | 68 +++++++++++++++++--------------------------- drivers/xen/events.c | 30 +++++++++---------- include/xen/events.h | 4 ++- 3 files changed, 43 insertions(+), 59 deletions(-) diff --git a/arch/x86/pci/xen.c b/arch/x86/pci/xen.c index 0d5087eeced8..93e42152d8d0 100644 --- a/arch/x86/pci/xen.c +++ b/arch/x86/pci/xen.c @@ -86,7 +86,7 @@ static void xen_msi_compose_msg(struct pci_dev *pdev, unsigned int pirq, static int xen_hvm_setup_msi_irqs(struct pci_dev *dev, int nvec, int type) { - int irq, pirq, ret = 0; + int irq, pirq; struct msi_desc *msidesc; struct msi_msg msg; @@ -94,39 +94,32 @@ static int xen_hvm_setup_msi_irqs(struct pci_dev *dev, int nvec, int type) __read_msi_msg(msidesc, &msg); pirq = MSI_ADDR_EXT_DEST_ID(msg.address_hi) | ((msg.address_lo >> MSI_ADDR_DEST_ID_SHIFT) & 0xff); - if (xen_irq_from_pirq(pirq) >= 0 && msg.data == XEN_PIRQ_MSI_DATA) { - irq = xen_allocate_pirq_msi((type == PCI_CAP_ID_MSIX) ?
- "msi-x" : "msi", &pirq, 0); - if (irq < 0) + if (msg.data != XEN_PIRQ_MSI_DATA || + xen_irq_from_pirq(pirq) < 0) { + pirq = xen_allocate_pirq_msi(dev, msidesc); + if (pirq < 0) goto error; - ret = set_irq_msi(irq, msidesc); - if (ret < 0) - goto error_while; - printk(KERN_DEBUG "xen: msi already setup: msi --> irq=%d" - " pirq=%d\n", irq, pirq); - return 0; + xen_msi_compose_msg(dev, pirq, &msg); + __write_msi_msg(msidesc, &msg); + dev_dbg(&dev->dev, "xen: msi bound to pirq=%d\n", pirq); + } else { + dev_dbg(&dev->dev, + "xen: msi already bound to pirq=%d\n", pirq); } - irq = xen_allocate_pirq_msi((type == PCI_CAP_ID_MSIX) ? - "msi-x" : "msi", &pirq, 1); - if (irq < 0 || pirq < 0) + irq = xen_bind_pirq_msi_to_irq(dev, msidesc, pirq, + (type == PCI_CAP_ID_MSIX) ? + "msi-x" : "msi"); + if (irq < 0) goto error; - printk(KERN_DEBUG "xen: msi --> irq=%d, pirq=%d\n", irq, pirq); - xen_msi_compose_msg(dev, pirq, &msg); - ret = set_irq_msi(irq, msidesc); - if (ret < 0) - goto error_while; - write_msi_msg(irq, &msg); + dev_dbg(&dev->dev, + "xen: msi --> pirq=%d --> irq=%d\n", pirq, irq); } return 0; -error_while: - unbind_from_irqhandler(irq, NULL); error: - if (ret == -ENODEV) - dev_err(&dev->dev, "Xen PCI frontend has not registered" \ - " MSI/MSI-X support!\n"); - - return ret; + dev_err(&dev->dev, + "Xen PCI frontend has not registered MSI/MSI-X support!\n"); + return -ENODEV; } /* @@ -152,28 +145,19 @@ static int xen_setup_msi_irqs(struct pci_dev *dev, int nvec, int type) goto error; i = 0; list_for_each_entry(msidesc, &dev->msi_list, list) { - irq = xen_allocate_pirq_msi( - (type == PCI_CAP_ID_MSIX) ? - "pcifront-msi-x" : "pcifront-msi", - &v[i], 0); - if (irq < 0) { - ret = -1; + irq = xen_bind_pirq_msi_to_irq(dev, msidesc, v[i], + (type == PCI_CAP_ID_MSIX) ? + "pcifront-msi-x" : + "pcifront-msi"); + if (irq < 0) goto free; - } - ret = set_irq_msi(irq, msidesc); - if (ret) - goto error_while; i++; } kfree(v); return 0; -error_while: - unbind_from_irqhandler(irq, NULL); error: - if (ret == -ENODEV) - dev_err(&dev->dev, "Xen PCI frontend has not registered" \ - " MSI/MSI-X support!\n"); + dev_err(&dev->dev, "Xen PCI frontend has not registered MSI/MSI-X support!\n"); free: kfree(v); return ret; diff --git a/drivers/xen/events.c b/drivers/xen/events.c index c21066fc30be..1033f6284f31 100644 --- a/drivers/xen/events.c +++ b/drivers/xen/events.c @@ -647,12 +647,12 @@ out: #include #include "../pci/msi.h" -static int find_unbound_pirq(int type) +int xen_allocate_pirq_msi(struct pci_dev *dev, struct msi_desc *msidesc) { int rc; struct physdev_get_free_pirq op_get_free_pirq; - op_get_free_pirq.type = type; + op_get_free_pirq.type = MAP_PIRQ_TYPE_MSI; rc = HYPERVISOR_physdev_op(PHYSDEVOP_get_free_pirq, &op_get_free_pirq); WARN_ONCE(rc == -ENOSYS, @@ -661,9 +661,10 @@ static int find_unbound_pirq(int type) return rc ? 
-1 : op_get_free_pirq.pirq; } -int xen_allocate_pirq_msi(char *name, int *pirq, int alloc_pirq) +int xen_bind_pirq_msi_to_irq(struct pci_dev *dev, struct msi_desc *msidesc, + int pirq, const char *name) { - int irq; + int irq, ret; spin_lock(&irq_mapping_update_lock); @@ -671,24 +672,21 @@ int xen_allocate_pirq_msi(char *name, int *pirq, int alloc_pirq) if (irq == -1) goto out; - if (alloc_pirq) { - *pirq = find_unbound_pirq(MAP_PIRQ_TYPE_MSI); - if (*pirq == -1) { - xen_free_irq(irq); - irq = -1; - goto out; - } - } - set_irq_chip_and_handler_name(irq, &xen_pirq_chip, handle_level_irq, name); - irq_info[irq] = mk_pirq_info(0, *pirq, 0, 0); - pirq_to_irq[*pirq] = irq; - + irq_info[irq] = mk_pirq_info(0, pirq, 0, 0); + pirq_to_irq[pirq] = irq; + ret = set_irq_msi(irq, msidesc); + if (ret < 0) + goto error_irq; out: spin_unlock(&irq_mapping_update_lock); return irq; +error_irq: + spin_unlock(&irq_mapping_update_lock); + xen_free_irq(irq); + return -1; } int xen_create_msi_irq(struct pci_dev *dev, struct msi_desc *msidesc, int type) diff --git a/include/xen/events.h b/include/xen/events.h index 8d98861e4d92..18bf825bac66 100644 --- a/include/xen/events.h +++ b/include/xen/events.h @@ -75,7 +75,9 @@ int xen_allocate_pirq(unsigned gsi, int shareable, char *name); int xen_map_pirq_gsi(unsigned pirq, unsigned gsi, int shareable, char *name); #ifdef CONFIG_PCI_MSI -int xen_allocate_pirq_msi(char *name, int *pirq, int alloc_pirq); +int xen_allocate_pirq_msi(struct pci_dev *dev, struct msi_desc *msidesc); +int xen_bind_pirq_msi_to_irq(struct pci_dev *dev, struct msi_desc *msidesc, + int pirq, const char *name); int xen_create_msi_irq(struct pci_dev *dev, struct msi_desc *msidesc, int type); #endif From 8135591e90c81462a6902f6ffa1f1ca021db077a Mon Sep 17 00:00:00 2001 From: Ian Campbell Date: Fri, 18 Feb 2011 16:43:33 +0000 Subject: [PATCH 602/706] xen: events: refactor xen_create_msi_irq slightly Calling PHYSDEVOP_map_pirq earlier simplifies error handling and starts to make the tail end of this function look like xen_bind_pirq_msi_to_irq. Signed-off-by: Ian Campbell Signed-off-by: Konrad Rzeszutek Wilk --- drivers/xen/events.c | 15 ++++++--------- 1 file changed, 6 insertions(+), 9 deletions(-) diff --git a/drivers/xen/events.c b/drivers/xen/events.c index 1033f6284f31..b54285e27b3b 100644 --- a/drivers/xen/events.c +++ b/drivers/xen/events.c @@ -716,6 +716,12 @@ int xen_create_msi_irq(struct pci_dev *dev, struct msi_desc *msidesc, int type) map_irq.entry_nr = msidesc->msi_attrib.entry_nr; } + rc = HYPERVISOR_physdev_op(PHYSDEVOP_map_pirq, &map_irq); + if (rc) { + dev_warn(&dev->dev, "xen map irq failed %d\n", rc); + goto out; + } + spin_lock(&irq_mapping_update_lock); irq = xen_allocate_irq_dynamic(); @@ -723,15 +729,6 @@ int xen_create_msi_irq(struct pci_dev *dev, struct msi_desc *msidesc, int type) if (irq == -1) goto out; - rc = HYPERVISOR_physdev_op(PHYSDEVOP_map_pirq, &map_irq); - if (rc) { - printk(KERN_WARNING "xen map irq failed %d\n", rc); - - xen_free_irq(irq); - - irq = -1; - goto out; - } irq_info[irq] = mk_pirq_info(0, map_irq.pirq, 0, map_irq.index); set_irq_chip_and_handler_name(irq, &xen_pirq_chip, From 2e55288f63343f0810f4f0a3004f78037cfb93d3 Mon Sep 17 00:00:00 2001 From: Ian Campbell Date: Fri, 18 Feb 2011 16:43:34 +0000 Subject: [PATCH 603/706] xen: events: update pirq_to_irq in xen_create_msi_irq I don't think this was a deliberate omission. Makes the tail end of this function look even more like xen_bind_pirq_msi_to_irq.
Signed-off-by: Ian Campbell Signed-off-by: Konrad Rzeszutek Wilk --- drivers/xen/events.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/xen/events.c b/drivers/xen/events.c index b54285e27b3b..721b393fd8aa 100644 --- a/drivers/xen/events.c +++ b/drivers/xen/events.c @@ -730,6 +730,7 @@ int xen_create_msi_irq(struct pci_dev *dev, struct msi_desc *msidesc, int type) goto out; irq_info[irq] = mk_pirq_info(0, map_irq.pirq, 0, map_irq.index); + pirq_to_irq[map_irq.pirq] = irq; set_irq_chip_and_handler_name(irq, &xen_pirq_chip, handle_level_irq, From f420e010edd84eb2c237fc87b7451e69740fed46 Mon Sep 17 00:00:00 2001 From: Ian Campbell Date: Fri, 18 Feb 2011 16:43:35 +0000 Subject: [PATCH 604/706] xen: events: push set_irq_msi down into xen_create_msi_irq Makes the tail end of this function look even more like xen_bind_pirq_msi_to_irq. Signed-off-by: Ian Campbell Signed-off-by: Konrad Rzeszutek Wilk --- arch/x86/pci/xen.c | 10 +--------- drivers/xen/events.c | 10 +++++++++- 2 files changed, 10 insertions(+), 10 deletions(-) diff --git a/arch/x86/pci/xen.c b/arch/x86/pci/xen.c index 93e42152d8d0..15fd981d35f1 100644 --- a/arch/x86/pci/xen.c +++ b/arch/x86/pci/xen.c @@ -185,23 +185,15 @@ static void xen_teardown_msi_irq(unsigned int irq) #ifdef CONFIG_XEN_DOM0 static int xen_initdom_setup_msi_irqs(struct pci_dev *dev, int nvec, int type) { - int irq, ret; + int irq; struct msi_desc *msidesc; list_for_each_entry(msidesc, &dev->msi_list, list) { irq = xen_create_msi_irq(dev, msidesc, type); if (irq < 0) return -1; - - ret = set_irq_msi(irq, msidesc); - if (ret) - goto error; } return 0; - -error: - xen_destroy_irq(irq); - return ret; } #endif #endif diff --git a/drivers/xen/events.c b/drivers/xen/events.c index 721b393fd8aa..77ede77a9ee9 100644 --- a/drivers/xen/events.c +++ b/drivers/xen/events.c @@ -691,7 +691,7 @@ error_irq: int xen_create_msi_irq(struct pci_dev *dev, struct msi_desc *msidesc, int type) { - int irq = -1; + int ret, irq = -1; struct physdev_map_pirq map_irq; int rc; int pos; @@ -736,9 +736,17 @@ int xen_create_msi_irq(struct pci_dev *dev, struct msi_desc *msidesc, int type) handle_level_irq, (type == PCI_CAP_ID_MSIX) ? "msi-x":"msi"); + ret = set_irq_msi(irq, msidesc); + if (ret) + goto out_irq; + out: spin_unlock(&irq_mapping_update_lock); return irq; +out_irq: + spin_unlock(&irq_mapping_update_lock); + xen_free_irq(irq); + return -1; } #endif From ca1d8fe9521fb67c95cfa736c08f4bbbc282b5bd Mon Sep 17 00:00:00 2001 From: Ian Campbell Date: Fri, 18 Feb 2011 16:43:36 +0000 Subject: [PATCH 605/706] xen: events: use xen_bind_pirq_msi_to_irq from xen_create_msi_irq Signed-off-by: Ian Campbell Signed-off-by: Konrad Rzeszutek Wilk --- arch/x86/pci/xen.c | 4 ++-- drivers/xen/events.c | 36 +++++++----------------------------- include/xen/events.h | 2 +- 3 files changed, 10 insertions(+), 32 deletions(-) diff --git a/arch/x86/pci/xen.c b/arch/x86/pci/xen.c index 15fd981d35f1..ffd8c7a2cdbb 100644 --- a/arch/x86/pci/xen.c +++ b/arch/x86/pci/xen.c @@ -106,7 +106,7 @@ static int xen_hvm_setup_msi_irqs(struct pci_dev *dev, int nvec, int type) dev_dbg(&dev->dev, "xen: msi already bound to pirq=%d\n", pirq); } - irq = xen_bind_pirq_msi_to_irq(dev, msidesc, pirq, + irq = xen_bind_pirq_msi_to_irq(dev, msidesc, pirq, 0, (type == PCI_CAP_ID_MSIX) ? 
"msi-x" : "msi"); if (irq < 0) @@ -145,7 +145,7 @@ static int xen_setup_msi_irqs(struct pci_dev *dev, int nvec, int type) goto error; i = 0; list_for_each_entry(msidesc, &dev->msi_list, list) { - irq = xen_bind_pirq_msi_to_irq(dev, msidesc, v[i], + irq = xen_bind_pirq_msi_to_irq(dev, msidesc, v[i], 0, (type == PCI_CAP_ID_MSIX) ? "pcifront-msi-x" : "pcifront-msi"); diff --git a/drivers/xen/events.c b/drivers/xen/events.c index 77ede77a9ee9..34469489087b 100644 --- a/drivers/xen/events.c +++ b/drivers/xen/events.c @@ -662,7 +662,7 @@ int xen_allocate_pirq_msi(struct pci_dev *dev, struct msi_desc *msidesc) } int xen_bind_pirq_msi_to_irq(struct pci_dev *dev, struct msi_desc *msidesc, - int pirq, const char *name) + int pirq, int vector, const char *name) { int irq, ret; @@ -675,7 +675,7 @@ int xen_bind_pirq_msi_to_irq(struct pci_dev *dev, struct msi_desc *msidesc, set_irq_chip_and_handler_name(irq, &xen_pirq_chip, handle_level_irq, name); - irq_info[irq] = mk_pirq_info(0, pirq, 0, 0); + irq_info[irq] = mk_pirq_info(0, pirq, 0, vector); pirq_to_irq[pirq] = irq; ret = set_irq_msi(irq, msidesc); if (ret < 0) @@ -691,7 +691,6 @@ error_irq: int xen_create_msi_irq(struct pci_dev *dev, struct msi_desc *msidesc, int type) { - int ret, irq = -1; struct physdev_map_pirq map_irq; int rc; int pos; @@ -719,34 +718,13 @@ int xen_create_msi_irq(struct pci_dev *dev, struct msi_desc *msidesc, int type) rc = HYPERVISOR_physdev_op(PHYSDEVOP_map_pirq, &map_irq); if (rc) { dev_warn(&dev->dev, "xen map irq failed %d\n", rc); - goto out; + return -1; } - spin_lock(&irq_mapping_update_lock); - - irq = xen_allocate_irq_dynamic(); - - if (irq == -1) - goto out; - - irq_info[irq] = mk_pirq_info(0, map_irq.pirq, 0, map_irq.index); - pirq_to_irq[map_irq.pirq] = irq; - - set_irq_chip_and_handler_name(irq, &xen_pirq_chip, - handle_level_irq, - (type == PCI_CAP_ID_MSIX) ? "msi-x":"msi"); - - ret = set_irq_msi(irq, msidesc); - if (ret) - goto out_irq; - -out: - spin_unlock(&irq_mapping_update_lock); - return irq; -out_irq: - spin_unlock(&irq_mapping_update_lock); - xen_free_irq(irq); - return -1; + return xen_bind_pirq_msi_to_irq(dev, msidesc, + map_irq.pirq, map_irq.index, + (type == PCI_CAP_ID_MSIX) ? + "msi-x" : "msi"); } #endif diff --git a/include/xen/events.h b/include/xen/events.h index 18bf825bac66..45c08a0d580a 100644 --- a/include/xen/events.h +++ b/include/xen/events.h @@ -77,7 +77,7 @@ int xen_map_pirq_gsi(unsigned pirq, unsigned gsi, int shareable, char *name); #ifdef CONFIG_PCI_MSI int xen_allocate_pirq_msi(struct pci_dev *dev, struct msi_desc *msidesc); int xen_bind_pirq_msi_to_irq(struct pci_dev *dev, struct msi_desc *msidesc, - int pirq, const char *name); + int pirq, int vector, const char *name); int xen_create_msi_irq(struct pci_dev *dev, struct msi_desc *msidesc, int type); #endif From 71eef7d1e3d9df760897fdd2cad6949a8bcf1620 Mon Sep 17 00:00:00 2001 From: Ian Campbell Date: Fri, 18 Feb 2011 17:06:55 +0000 Subject: [PATCH 606/706] xen: events: remove dom0 specific xen_create_msi_irq The function name does not distinguish it from xen_allocate_pirq_msi (which operates on domU and pvhvm domains rather than dom0). Hoist domain 0 specific functionality up into the only caller leaving functionality common to all guest types in xen_bind_pirq_msi_to_irq. 
Signed-off-by: Ian Campbell Signed-off-by: Konrad Rzeszutek Wilk --- arch/x86/pci/xen.c | 45 +++++++++++++++++++++++++++++++++++++++----- drivers/xen/events.c | 41 ---------------------------------------- include/xen/events.h | 1 - 3 files changed, 40 insertions(+), 47 deletions(-) diff --git a/arch/x86/pci/xen.c b/arch/x86/pci/xen.c index ffd8c7a2cdbb..8c4085a95ef1 100644 --- a/arch/x86/pci/xen.c +++ b/arch/x86/pci/xen.c @@ -185,15 +185,50 @@ static void xen_teardown_msi_irq(unsigned int irq) #ifdef CONFIG_XEN_DOM0 static int xen_initdom_setup_msi_irqs(struct pci_dev *dev, int nvec, int type) { - int irq; + int ret = 0; struct msi_desc *msidesc; list_for_each_entry(msidesc, &dev->msi_list, list) { - irq = xen_create_msi_irq(dev, msidesc, type); - if (irq < 0) - return -1; + struct physdev_map_pirq map_irq; + + memset(&map_irq, 0, sizeof(map_irq)); + map_irq.domid = DOMID_SELF; + map_irq.type = MAP_PIRQ_TYPE_MSI; + map_irq.index = -1; + map_irq.pirq = -1; + map_irq.bus = dev->bus->number; + map_irq.devfn = dev->devfn; + + if (type == PCI_CAP_ID_MSIX) { + int pos; + u32 table_offset, bir; + + pos = pci_find_capability(dev, PCI_CAP_ID_MSIX); + + pci_read_config_dword(dev, pos + PCI_MSIX_TABLE, + &table_offset); + bir = (u8)(table_offset & PCI_MSIX_FLAGS_BIRMASK); + + map_irq.table_base = pci_resource_start(dev, bir); + map_irq.entry_nr = msidesc->msi_attrib.entry_nr; + } + + ret = HYPERVISOR_physdev_op(PHYSDEVOP_map_pirq, &map_irq); + if (ret) { + dev_warn(&dev->dev, "xen map irq failed %d\n", ret); + goto out; + } + + ret = xen_bind_pirq_msi_to_irq(dev, msidesc, + map_irq.pirq, map_irq.index, + (type == PCI_CAP_ID_MSIX) ? + "msi-x" : "msi"); + if (ret < 0) + goto out; } - return 0; + ret = 0; +out: + return ret; } #endif #endif diff --git a/drivers/xen/events.c b/drivers/xen/events.c index 34469489087b..6befe6227159 100644 --- a/drivers/xen/events.c +++ b/drivers/xen/events.c @@ -644,9 +644,6 @@ out: } #ifdef CONFIG_PCI_MSI -#include -#include "../pci/msi.h" - int xen_allocate_pirq_msi(struct pci_dev *dev, struct msi_desc *msidesc) { int rc; @@ -688,44 +685,6 @@ error_irq: xen_free_irq(irq); return -1; } - -int xen_create_msi_irq(struct pci_dev *dev, struct msi_desc *msidesc, int type) -{ - struct physdev_map_pirq map_irq; - int rc; - int pos; - u32 table_offset, bir; - - memset(&map_irq, 0, sizeof(map_irq)); - map_irq.domid = DOMID_SELF; - map_irq.type = MAP_PIRQ_TYPE_MSI; - map_irq.index = -1; - map_irq.pirq = -1; - map_irq.bus = dev->bus->number; - map_irq.devfn = dev->devfn; - - if (type == PCI_CAP_ID_MSIX) { - pos = pci_find_capability(dev, PCI_CAP_ID_MSIX); - - pci_read_config_dword(dev, msix_table_offset_reg(pos), - &table_offset); - bir = (u8)(table_offset & PCI_MSIX_FLAGS_BIRMASK); - - map_irq.table_base = pci_resource_start(dev, bir); - map_irq.entry_nr = msidesc->msi_attrib.entry_nr; - } - - rc = HYPERVISOR_physdev_op(PHYSDEVOP_map_pirq, &map_irq); - if (rc) { - dev_warn(&dev->dev, "xen map irq failed %d\n", rc); - return -1; - } - - return xen_bind_pirq_msi_to_irq(dev, msidesc, - map_irq.pirq, map_irq.index, - (type == PCI_CAP_ID_MSIX) ? 
- "msi-x" : "msi"); -} #endif int xen_destroy_irq(int irq) diff --git a/include/xen/events.h b/include/xen/events.h index 45c08a0d580a..962da2ced5b4 100644 --- a/include/xen/events.h +++ b/include/xen/events.h @@ -78,7 +78,6 @@ int xen_map_pirq_gsi(unsigned pirq, unsigned gsi, int shareable, char *name); int xen_allocate_pirq_msi(struct pci_dev *dev, struct msi_desc *msidesc); int xen_bind_pirq_msi_to_irq(struct pci_dev *dev, struct msi_desc *msidesc, int pirq, int vector, const char *name); -int xen_create_msi_irq(struct pci_dev *dev, struct msi_desc *msidesc, int type); #endif /* De-allocates the above mentioned physical interrupt. */ From ec8df88f6bd808db47ac7a06c96dcc90d7ed6ecc Mon Sep 17 00:00:00 2001 From: Henrik Kretzschmar Date: Fri, 11 Mar 2011 08:02:35 +0100 Subject: [PATCH 607/706] x86: Remove superfluous goal definition of tsc_sync The extra tsc_sync.o goal definition is superfluous: CONFIG_X86_64_SMP depends on CONFIG_SMP, and tsc_sync.o is already listed under CONFIG_SMP. Signed-off-by: Henrik Kretzschmar Acked-by: Cyrill Gorcunov LKML-Reference: <1299826956-8607-1-git-send-email-henne@nachtwindheim.de> Signed-off-by: Ingo Molnar --- arch/x86/kernel/Makefile | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index 6ac5036adf42..62445ba2f8a8 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -66,9 +66,9 @@ obj-$(CONFIG_PCI) += early-quirks.o apm-y := apm_32.o obj-$(CONFIG_APM) += apm.o obj-$(CONFIG_SMP) += smp.o -obj-$(CONFIG_SMP) += smpboot.o tsc_sync.o +obj-$(CONFIG_SMP) += smpboot.o +obj-$(CONFIG_SMP) += tsc_sync.o obj-$(CONFIG_SMP) += setup_percpu.o -obj-$(CONFIG_X86_64_SMP) += tsc_sync.o obj-$(CONFIG_X86_TRAMPOLINE) += trampoline_$(BITS).o obj-$(CONFIG_X86_MPPARSE) += mpparse.o obj-y += apic/ From 25874a299ef8037df03ce4ada570bc4e42f9748f Mon Sep 17 00:00:00 2001 From: Henrik Kretzschmar Date: Fri, 11 Mar 2011 08:02:36 +0100 Subject: [PATCH 608/706] x86: Clean up apic.c and apic.h This patch moves some functions and variables into init sections, makes a function static, and removes some lines of cruft.
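[Editor's illustration, not part of the patch] The cleanup below relies on the kernel's init-section annotations: __init code and __initdata variables are placed in sections that are freed once boot finishes, so they may only be referenced during initialization. A minimal sketch of the pattern, with hypothetical names (example_flag, example_setup):

	#include <linux/init.h>

	/* Placed in .init.data; the memory is freed after boot. */
	static int example_flag __initdata;

	/* Placed in .init.text; must not be called once boot is done. */
	static int __init example_setup(char *arg)
	{
		example_flag = 1;
		return 0;
	}
	early_param("example_flag", example_setup);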
Signed-off-by: Henrik Kretzschmar Acked-by: Cyrill Gorcunov LKML-Reference: <1299826956-8607-2-git-send-email-henne@nachtwindheim.de> Signed-off-by: Ingo Molnar --- arch/x86/include/asm/apic.h | 4 --- arch/x86/kernel/apic/apic.c | 65 +++++++++++++------------------------ 2 files changed, 22 insertions(+), 47 deletions(-) diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h index 4afe512120a9..5b7d5137e167 100644 --- a/arch/x86/include/asm/apic.h +++ b/arch/x86/include/asm/apic.h @@ -220,7 +220,6 @@ extern void enable_IR_x2apic(void); extern int get_physical_broadcast(void); -extern void apic_disable(void); extern int lapic_get_maxlvt(void); extern void clear_local_APIC(void); extern void connect_bsp_APIC(void); @@ -228,7 +227,6 @@ extern void disconnect_bsp_APIC(int virt_wire_setup); extern void disable_local_APIC(void); extern void lapic_shutdown(void); extern int verify_local_APIC(void); -extern void cache_APIC_registers(void); extern void sync_Arb_IDs(void); extern void init_bsp_APIC(void); extern void setup_local_APIC(void); @@ -239,7 +237,6 @@ void register_lapic_address(unsigned long address); extern void setup_boot_APIC_clock(void); extern void setup_secondary_APIC_clock(void); extern int APIC_init_uniprocessor(void); -extern void enable_NMI_through_LVT0(void); extern int apic_force_enable(unsigned long addr); /* @@ -261,7 +258,6 @@ static inline void lapic_shutdown(void) { } #define local_apic_timer_c2_ok 1 static inline void init_apic_mappings(void) { } static inline void disable_local_APIC(void) { } -static inline void apic_disable(void) { } # define setup_boot_APIC_clock x86_init_noop # define setup_secondary_APIC_clock x86_init_noop #endif /* !CONFIG_X86_LOCAL_APIC */ diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index 4f43312cfbf8..ffbf7c21bbc6 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -84,7 +84,7 @@ EXPORT_EARLY_PER_CPU_SYMBOL(x86_bios_cpu_apicid); * * +1=force-enable */ -static int force_enable_local_apic; +static int force_enable_local_apic __initdata; /* * APIC command line parameters */ @@ -154,7 +154,7 @@ early_param("nox2apic", setup_nox2apic); unsigned long mp_lapic_addr; int disable_apic; /* Disable local APIC timer from the kernel commandline or via dmi quirk */ -static int disable_apic_timer __cpuinitdata; +static int disable_apic_timer __initdata; /* Local APIC timer works in C2 */ int local_apic_timer_c2_ok; EXPORT_SYMBOL_GPL(local_apic_timer_c2_ok); @@ -178,29 +178,8 @@ static struct resource lapic_resource = { static unsigned int calibration_result; -static int lapic_next_event(unsigned long delta, - struct clock_event_device *evt); -static void lapic_timer_setup(enum clock_event_mode mode, - struct clock_event_device *evt); -static void lapic_timer_broadcast(const struct cpumask *mask); static void apic_pm_activate(void); -/* - * The local apic timer can be used for any function which is CPU local. 
- */ -static struct clock_event_device lapic_clockevent = { - .name = "lapic", - .features = CLOCK_EVT_FEAT_PERIODIC | CLOCK_EVT_FEAT_ONESHOT - | CLOCK_EVT_FEAT_C3STOP | CLOCK_EVT_FEAT_DUMMY, - .shift = 32, - .set_mode = lapic_timer_setup, - .set_next_event = lapic_next_event, - .broadcast = lapic_timer_broadcast, - .rating = 100, - .irq = -1, -}; -static DEFINE_PER_CPU(struct clock_event_device, lapic_events); - static unsigned long apic_phys; /* @@ -239,7 +218,7 @@ static int modern_apic(void) * right after this call apic become NOOP driven * so apic->write/read doesn't do anything */ -void apic_disable(void) +static void __init apic_disable(void) { pr_info("APIC: switched to apic NOOP\n"); apic = &apic_noop; @@ -283,23 +262,6 @@ u64 native_apic_icr_read(void) return icr1 | ((u64)icr2 << 32); } -/** - * enable_NMI_through_LVT0 - enable NMI through local vector table 0 - */ -void __cpuinit enable_NMI_through_LVT0(void) -{ - unsigned int v; - - /* unmask and set to NMI */ - v = APIC_DM_NMI; - - /* Level triggered for 82489DX (32bit mode) */ - if (!lapic_is_integrated()) - v |= APIC_LVT_LEVEL_TRIGGER; - - apic_write(APIC_LVT0, v); -} - #ifdef CONFIG_X86_32 /** * get_physical_broadcast - Get number of physical broadcast IDs @@ -509,6 +471,23 @@ static void lapic_timer_broadcast(const struct cpumask *mask) #endif } + +/* + * The local apic timer can be used for any function which is CPU local. + */ +static struct clock_event_device lapic_clockevent = { + .name = "lapic", + .features = CLOCK_EVT_FEAT_PERIODIC | CLOCK_EVT_FEAT_ONESHOT + | CLOCK_EVT_FEAT_C3STOP | CLOCK_EVT_FEAT_DUMMY, + .shift = 32, + .set_mode = lapic_timer_setup, + .set_next_event = lapic_next_event, + .broadcast = lapic_timer_broadcast, + .rating = 100, + .irq = -1, +}; +static DEFINE_PER_CPU(struct clock_event_device, lapic_events); + /* * Setup the local APIC timer for this CPU. Copy the initialized values * of the boot CPU and register the clock event in the framework. @@ -1538,7 +1517,7 @@ static int __init detect_init_APIC(void) } #else -static int apic_verify(void) +static int __init apic_verify(void) { u32 features, h, l; @@ -1563,7 +1542,7 @@ static int apic_verify(void) return 0; } -int apic_force_enable(unsigned long addr) +int __init apic_force_enable(unsigned long addr) { u32 h, l; From c0c9ed15042ceac7c485813012a0a97316101b57 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 11 Mar 2011 11:51:22 +0100 Subject: [PATCH 609/706] futex: Avoid redundant evaluation of task_pid_vnr() The result is not going to change under us, so there is no need to reevaluate it over and over. It seems to be a leftover from the mechanical mass conversion of task->pid to task_pid_vnr(tsk). Signed-off-by: Thomas Gleixner --- kernel/futex.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/kernel/futex.c b/kernel/futex.c index 3184d3b9cadf..773815465bac 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -674,7 +674,7 @@ static int futex_lock_pi_atomic(u32 __user *uaddr, struct futex_hash_bucket *hb, struct task_struct *task, int set_waiters) { int lock_taken, ret, ownerdied = 0; - u32 uval, newval, curval; + u32 uval, newval, curval, vpid = task_pid_vnr(task); retry: ret = lock_taken = 0; @@ -684,7 +684,7 @@ retry: * (by doing a 0 -> TID atomic cmpxchg), while holding all * the locks. It will most likely not succeed. */ - newval = task_pid_vnr(task); + newval = vpid; if (set_waiters) newval |= FUTEX_WAITERS; @@ -696,7 +696,7 @@ retry: /* * Detect deadlocks.
*/ - if ((unlikely((curval & FUTEX_TID_MASK) == task_pid_vnr(task)))) + if ((unlikely((curval & FUTEX_TID_MASK) == vpid))) return -EDEADLK; /* @@ -723,7 +723,7 @@ retry: */ if (unlikely(ownerdied || !(curval & FUTEX_TID_MASK))) { /* Keep the OWNER_DIED bit */ - newval = (curval & ~FUTEX_TID_MASK) | task_pid_vnr(task); + newval = (curval & ~FUTEX_TID_MASK) | vpid; ownerdied = 0; lock_taken = 1; } @@ -2047,9 +2047,9 @@ static int futex_unlock_pi(u32 __user *uaddr, unsigned int flags) { struct futex_hash_bucket *hb; struct futex_q *this, *next; - u32 uval; struct plist_head *head; union futex_key key = FUTEX_KEY_INIT; + u32 uval, vpid = task_pid_vnr(current); int ret; retry: @@ -2058,7 +2058,7 @@ retry: /* * We release only a lock we actually own: */ - if ((uval & FUTEX_TID_MASK) != task_pid_vnr(current)) + if ((uval & FUTEX_TID_MASK) != vpid) return -EPERM; ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &key); @@ -2074,7 +2074,7 @@ retry: * anyone else up: */ if (!(uval & FUTEX_OWNER_DIED)) - uval = cmpxchg_futex_value_locked(uaddr, task_pid_vnr(current), 0); + uval = cmpxchg_futex_value_locked(uaddr, vpid, 0); if (unlikely(uval == -EFAULT)) @@ -2083,7 +2083,7 @@ retry: * Rare case: we managed to release the lock atomically, * no need to wake anyone else up: */ - if (unlikely(uval == task_pid_vnr(current))) + if (unlikely(uval == vpid)) goto out_unlock; /* From 522d7decc0370070448a8c28982c8dfd8970489e Mon Sep 17 00:00:00 2001 From: Michel Lespinasse Date: Thu, 10 Mar 2011 18:47:31 -0800 Subject: [PATCH 610/706] futex: Remove redundant pagefault_disable in futex_atomic_cmpxchg_inatomic() kernel/futex.c disables page faults before calling futex_atomic_cmpxchg_inatomic(), so there is no need to do it again within that function. Signed-off-by: Michel Lespinasse Cc: Darren Hart Cc: Peter Zijlstra Cc: Matt Turner Cc: Russell King Cc: David Howells Cc: Tony Luck Cc: Michal Simek Cc: Ralf Baechle Cc: "James E.J. Bottomley" Cc: Benjamin Herrenschmidt Cc: Martin Schwidefsky Cc: Paul Mundt Cc: "David S. Miller" Cc: Chris Metcalf Cc: Linus Torvalds LKML-Reference: <20110311024731.GB26122@google.com> Signed-off-by: Thomas Gleixner --- arch/arm/include/asm/futex.h | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/arch/arm/include/asm/futex.h b/arch/arm/include/asm/futex.h index b33fe7065b38..7133a8620830 100644 --- a/arch/arm/include/asm/futex.h +++ b/arch/arm/include/asm/futex.h @@ -95,7 +95,8 @@ futex_atomic_cmpxchg_inatomic(int __user *uaddr, int oldval, int newval) if (!access_ok(VERIFY_WRITE, uaddr, sizeof(int))) return -EFAULT; - pagefault_disable(); /* implies preempt_disable() */ + /* Note that preemption is disabled by futex_atomic_cmpxchg_inatomic + * call sites. */ __asm__ __volatile__("@futex_atomic_cmpxchg_inatomic\n" "1: " T(ldr) " %0, [%3]\n" @@ -115,8 +116,6 @@ futex_atomic_cmpxchg_inatomic(int __user *uaddr, int oldval, int newval) : "r" (oldval), "r" (newval), "r" (uaddr), "Ir" (-EFAULT) : "cc", "memory"); - pagefault_enable(); /* subsumes preempt_enable() */ - return val; } From 37a9d912b24f96a0591773e6e6c3642991ae5a70 Mon Sep 17 00:00:00 2001 From: Michel Lespinasse Date: Thu, 10 Mar 2011 18:48:51 -0800 Subject: [PATCH 611/706] futex: Sanitize cmpxchg_futex_value_locked API The cmpxchg_futex_value_locked API was funny in that it returned either the original, user-exposed futex value OR an error code such as -EFAULT. 
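[Editor's illustration, not part of the patch] A sketch of the ambiguity in the old convention, following the call shape removed in the kernel/futex.c hunks below; the variables stand for a futex word and its expected/new values as in futex.c, and the constant in the comment is spelled out only for illustration:

	u32 curval;

	curval = cmpxchg_futex_value_locked(uaddr, uval, newval);
	if (curval == -EFAULT)
		/*
		 * A fault? Or did *uaddr legitimately contain the bit
		 * pattern (u32)-EFAULT, i.e. 0xfffffff2? The caller
		 * cannot tell the two cases apart.
		 */
		return -EFAULT;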
This was confusing at best, and could be a source of livelocks in places that retry the cmpxchg_futex_value_locked after trying to fix the issue by running fault_in_user_writeable(). This change makes the cmpxchg_futex_value_locked API more similar to the get_futex_value_locked one, returning an error code and updating the original value through a reference argument. Signed-off-by: Michel Lespinasse Acked-by: Chris Metcalf [tile] Acked-by: Tony Luck [ia64] Acked-by: Thomas Gleixner Tested-by: Michal Simek [microblaze] Acked-by: David Howells [frv] Cc: Darren Hart Cc: Peter Zijlstra Cc: Matt Turner Cc: Russell King Cc: Ralf Baechle Cc: "James E.J. Bottomley" Cc: Benjamin Herrenschmidt Cc: Martin Schwidefsky Cc: Paul Mundt Cc: "David S. Miller" Cc: Linus Torvalds LKML-Reference: <20110311024851.GC26122@google.com> Signed-off-by: Thomas Gleixner --- arch/alpha/include/asm/futex.h | 22 +++++++------- arch/arm/include/asm/futex.h | 18 +++++++----- arch/frv/include/asm/futex.h | 3 +- arch/ia64/include/asm/futex.h | 9 ++++-- arch/microblaze/include/asm/futex.h | 24 ++++++++------- arch/mips/include/asm/futex.h | 32 ++++++++++---------- arch/parisc/include/asm/futex.h | 18 ++++++------ arch/powerpc/include/asm/futex.h | 20 +++++++------ arch/s390/include/asm/futex.h | 4 +-- arch/s390/include/asm/uaccess.h | 2 +- arch/s390/lib/uaccess.h | 4 +-- arch/s390/lib/uaccess_pt.c | 13 +++++---- arch/s390/lib/uaccess_std.c | 6 ++-- arch/sh/include/asm/futex-irq.h | 9 +++--- arch/sh/include/asm/futex.h | 5 ++-- arch/sparc/include/asm/futex_64.h | 16 ++++++---- arch/tile/include/asm/futex.h | 7 +++-- arch/x86/include/asm/futex.h | 16 +++++----- include/asm-generic/futex.h | 3 +- kernel/futex.c | 45 ++++++++++------------------- 20 files changed, 144 insertions(+), 132 deletions(-) diff --git a/arch/alpha/include/asm/futex.h b/arch/alpha/include/asm/futex.h index 945de222ab91..c4e5c2850cce 100644 --- a/arch/alpha/include/asm/futex.h +++ b/arch/alpha/include/asm/futex.h @@ -81,21 +81,22 @@ static inline int futex_atomic_op_inuser (int encoded_op, int __user *uaddr) } static inline int -futex_atomic_cmpxchg_inatomic(int __user *uaddr, int oldval, int newval) +futex_atomic_cmpxchg_inatomic(int *uval, int __user *uaddr, + int oldval, int newval) { - int prev, cmp; + int ret = 0, prev, cmp; if (!access_ok(VERIFY_WRITE, uaddr, sizeof(int))) return -EFAULT; __asm__ __volatile__ ( __ASM_SMP_MB - "1: ldl_l %0,0(%2)\n" - " cmpeq %0,%3,%1\n" - " beq %1,3f\n" - " mov %4,%1\n" - "2: stl_c %1,0(%2)\n" - " beq %1,4f\n" + "1: ldl_l %1,0(%3)\n" + " cmpeq %1,%4,%2\n" + " beq %2,3f\n" + " mov %5,%2\n" + "2: stl_c %2,0(%3)\n" + " beq %2,4f\n" "3: .subsection 2\n" "4: br 1b\n" " .previous\n" @@ -105,11 +106,12 @@ futex_atomic_cmpxchg_inatomic(int __user *uaddr, int oldval, int newval) " .long 2b-.\n" " lda $31,3b-2b(%0)\n" " .previous\n" - : "=&r"(prev), "=&r"(cmp) + : "+r"(ret), "=&r"(prev), "=&r"(cmp) : "r"(uaddr), "r"((long)oldval), "r"(newval) : "memory"); - return prev; + *uval = prev; + return ret; } #endif /* __KERNEL__ */ diff --git a/arch/arm/include/asm/futex.h b/arch/arm/include/asm/futex.h index 7133a8620830..d20b78fce758 100644 --- a/arch/arm/include/asm/futex.h +++ b/arch/arm/include/asm/futex.h @@ -88,9 +88,10 @@ futex_atomic_op_inuser (int encoded_op, int __user *uaddr) } static inline int -futex_atomic_cmpxchg_inatomic(int __user *uaddr, int oldval, int newval) +futex_atomic_cmpxchg_inatomic(int *uval, int __user *uaddr, + int oldval, int newval) { - int val; + int ret = 0, val; if (!access_ok(VERIFY_WRITE, uaddr, 
sizeof(int))) return -EFAULT; @@ -99,24 +100,25 @@ futex_atomic_cmpxchg_inatomic(int __user *uaddr, int oldval, int newval) * call sites. */ __asm__ __volatile__("@futex_atomic_cmpxchg_inatomic\n" - "1: " T(ldr) " %0, [%3]\n" - " teq %0, %1\n" + "1: " T(ldr) " %1, [%4]\n" + " teq %1, %2\n" " it eq @ explicit IT needed for the 2b label\n" - "2: " T(streq) " %2, [%3]\n" + "2: " T(streq) " %3, [%4]\n" "3:\n" " .pushsection __ex_table,\"a\"\n" " .align 3\n" " .long 1b, 4f, 2b, 4f\n" " .popsection\n" " .pushsection .fixup,\"ax\"\n" - "4: mov %0, %4\n" + "4: mov %0, %5\n" " b 3b\n" " .popsection" - : "=&r" (val) + : "+r" (ret), "=&r" (val) : "r" (oldval), "r" (newval), "r" (uaddr), "Ir" (-EFAULT) : "cc", "memory"); - return val; + *uval = val; + return ret; } #endif /* !SMP */ diff --git a/arch/frv/include/asm/futex.h b/arch/frv/include/asm/futex.h index 08b3d1da3583..0548f8e4d11e 100644 --- a/arch/frv/include/asm/futex.h +++ b/arch/frv/include/asm/futex.h @@ -10,7 +10,8 @@ extern int futex_atomic_op_inuser(int encoded_op, int __user *uaddr); static inline int -futex_atomic_cmpxchg_inatomic(int __user *uaddr, int oldval, int newval) +futex_atomic_cmpxchg_inatomic(int *uval, int __user *uaddr, + int oldval, int newval) { return -ENOSYS; } diff --git a/arch/ia64/include/asm/futex.h b/arch/ia64/include/asm/futex.h index c7f0f062239c..b0728404dad0 100644 --- a/arch/ia64/include/asm/futex.h +++ b/arch/ia64/include/asm/futex.h @@ -100,23 +100,26 @@ futex_atomic_op_inuser (int encoded_op, int __user *uaddr) } static inline int -futex_atomic_cmpxchg_inatomic(int __user *uaddr, int oldval, int newval) +futex_atomic_cmpxchg_inatomic(int *uval, int __user *uaddr, + int oldval, int newval) { if (!access_ok(VERIFY_WRITE, uaddr, sizeof(int))) return -EFAULT; { - register unsigned long r8 __asm ("r8"); + register unsigned long r8 __asm ("r8") = 0; + unsigned long prev; __asm__ __volatile__( " mf;; \n" " mov ar.ccv=%3;; \n" "[1:] cmpxchg4.acq %0=[%1],%2,ar.ccv \n" " .xdata4 \"__ex_table\", 1b-., 2f-. 
\n" "[2:]" - : "=r" (r8) + : "=r" (prev) : "r" (uaddr), "r" (newval), "rO" ((long) (unsigned) oldval) : "memory"); + *uval = prev; return r8; } } diff --git a/arch/microblaze/include/asm/futex.h b/arch/microblaze/include/asm/futex.h index ad3fd61b2fe7..fa019ed65dfb 100644 --- a/arch/microblaze/include/asm/futex.h +++ b/arch/microblaze/include/asm/futex.h @@ -94,31 +94,33 @@ futex_atomic_op_inuser(int encoded_op, int __user *uaddr) } static inline int -futex_atomic_cmpxchg_inatomic(int __user *uaddr, int oldval, int newval) +futex_atomic_cmpxchg_inatomic(int *uval, int __user *uaddr, + int oldval, int newval) { - int prev, cmp; + int ret = 0, prev, cmp; if (!access_ok(VERIFY_WRITE, uaddr, sizeof(int))) return -EFAULT; - __asm__ __volatile__ ("1: lwx %0, %2, r0; \ - cmp %1, %0, %3; \ - beqi %1, 3f; \ - 2: swx %4, %2, r0; \ - addic %1, r0, 0; \ - bnei %1, 1b; \ + __asm__ __volatile__ ("1: lwx %1, %3, r0; \ + cmp %2, %1, %4; \ + beqi %2, 3f; \ + 2: swx %5, %3, r0; \ + addic %2, r0, 0; \ + bnei %2, 1b; \ 3: \ .section .fixup,\"ax\"; \ 4: brid 3b; \ - addik %0, r0, %5; \ + addik %0, r0, %6; \ .previous; \ .section __ex_table,\"a\"; \ .word 1b,4b,2b,4b; \ .previous;" \ - : "=&r" (prev), "=&r"(cmp) \ + : "+r" (ret), "=&r" (prev), "=&r"(cmp) \ : "r" (uaddr), "r" (oldval), "r" (newval), "i" (-EFAULT)); - return prev; + *uval = prev; + return ret; } #endif /* __KERNEL__ */ diff --git a/arch/mips/include/asm/futex.h b/arch/mips/include/asm/futex.h index b9cce90346cf..692a24bd83b7 100644 --- a/arch/mips/include/asm/futex.h +++ b/arch/mips/include/asm/futex.h @@ -132,9 +132,10 @@ futex_atomic_op_inuser(int encoded_op, int __user *uaddr) } static inline int -futex_atomic_cmpxchg_inatomic(int __user *uaddr, int oldval, int newval) +futex_atomic_cmpxchg_inatomic(int *uval, int __user *uaddr, + int oldval, int newval) { - int retval; + int ret = 0, val; if (!access_ok(VERIFY_WRITE, uaddr, sizeof(int))) return -EFAULT; @@ -145,25 +146,25 @@ futex_atomic_cmpxchg_inatomic(int __user *uaddr, int oldval, int newval) " .set push \n" " .set noat \n" " .set mips3 \n" - "1: ll %0, %2 \n" - " bne %0, %z3, 3f \n" + "1: ll %1, %3 \n" + " bne %1, %z4, 3f \n" " .set mips0 \n" - " move $1, %z4 \n" + " move $1, %z5 \n" " .set mips3 \n" - "2: sc $1, %1 \n" + "2: sc $1, %2 \n" " beqzl $1, 1b \n" __WEAK_LLSC_MB "3: \n" " .set pop \n" " .section .fixup,\"ax\" \n" - "4: li %0, %5 \n" + "4: li %0, %6 \n" " j 3b \n" " .previous \n" " .section __ex_table,\"a\" \n" " "__UA_ADDR "\t1b, 4b \n" " "__UA_ADDR "\t2b, 4b \n" " .previous \n" - : "=&r" (retval), "=R" (*uaddr) + : "+r" (ret), "=&r" (val), "=R" (*uaddr) : "R" (*uaddr), "Jr" (oldval), "Jr" (newval), "i" (-EFAULT) : "memory"); } else if (cpu_has_llsc) { @@ -172,31 +173,32 @@ futex_atomic_cmpxchg_inatomic(int __user *uaddr, int oldval, int newval) " .set push \n" " .set noat \n" " .set mips3 \n" - "1: ll %0, %2 \n" - " bne %0, %z3, 3f \n" + "1: ll %1, %3 \n" + " bne %1, %z4, 3f \n" " .set mips0 \n" - " move $1, %z4 \n" + " move $1, %z5 \n" " .set mips3 \n" - "2: sc $1, %1 \n" + "2: sc $1, %2 \n" " beqz $1, 1b \n" __WEAK_LLSC_MB "3: \n" " .set pop \n" " .section .fixup,\"ax\" \n" - "4: li %0, %5 \n" + "4: li %0, %6 \n" " j 3b \n" " .previous \n" " .section __ex_table,\"a\" \n" " "__UA_ADDR "\t1b, 4b \n" " "__UA_ADDR "\t2b, 4b \n" " .previous \n" - : "=&r" (retval), "=R" (*uaddr) + : "+r" (ret), "=&r" (val), "=R" (*uaddr) : "R" (*uaddr), "Jr" (oldval), "Jr" (newval), "i" (-EFAULT) : "memory"); } else return -ENOSYS; - return retval; + *uval = val; + return ret; } #endif diff --git 
a/arch/parisc/include/asm/futex.h b/arch/parisc/include/asm/futex.h index 0c705c3a55ef..4c6d8672325b 100644 --- a/arch/parisc/include/asm/futex.h +++ b/arch/parisc/include/asm/futex.h @@ -51,10 +51,10 @@ futex_atomic_op_inuser (int encoded_op, int __user *uaddr) /* Non-atomic version */ static inline int -futex_atomic_cmpxchg_inatomic(int __user *uaddr, int oldval, int newval) +futex_atomic_cmpxchg_inatomic(int *uval, int __user *uaddr, + int oldval, int newval) { - int err = 0; - int uval; + int val; /* futex.c wants to do a cmpxchg_inatomic on kernel NULL, which is * our gateway page, and causes no end of trouble... @@ -65,12 +65,12 @@ futex_atomic_cmpxchg_inatomic(int __user *uaddr, int oldval, int newval) if (!access_ok(VERIFY_WRITE, uaddr, sizeof(int))) return -EFAULT; - err = get_user(uval, uaddr); - if (err) return -EFAULT; - if (uval == oldval) - err = put_user(newval, uaddr); - if (err) return -EFAULT; - return uval; + if (get_user(val, uaddr)) + return -EFAULT; + if (val == oldval && put_user(newval, uaddr)) + return -EFAULT; + *uval = val; + return 0; } #endif /*__KERNEL__*/ diff --git a/arch/powerpc/include/asm/futex.h b/arch/powerpc/include/asm/futex.h index 7c589ef81fb0..631e8da60064 100644 --- a/arch/powerpc/include/asm/futex.h +++ b/arch/powerpc/include/asm/futex.h @@ -82,35 +82,37 @@ static inline int futex_atomic_op_inuser (int encoded_op, int __user *uaddr) } static inline int -futex_atomic_cmpxchg_inatomic(int __user *uaddr, int oldval, int newval) +futex_atomic_cmpxchg_inatomic(int *uval, int __user *uaddr, + int oldval, int newval) { - int prev; + int ret = 0, prev; if (!access_ok(VERIFY_WRITE, uaddr, sizeof(int))) return -EFAULT; __asm__ __volatile__ ( PPC_RELEASE_BARRIER -"1: lwarx %0,0,%2 # futex_atomic_cmpxchg_inatomic\n\ - cmpw 0,%0,%3\n\ +"1: lwarx %1,0,%3 # futex_atomic_cmpxchg_inatomic\n\ + cmpw 0,%1,%4\n\ bne- 3f\n" - PPC405_ERR77(0,%2) -"2: stwcx. %4,0,%2\n\ + PPC405_ERR77(0,%3) +"2: stwcx. %5,0,%3\n\ bne- 1b\n" PPC_ACQUIRE_BARRIER "3: .section .fixup,\"ax\"\n\ -4: li %0,%5\n\ +4: li %0,%6\n\ b 3b\n\ .previous\n\ .section __ex_table,\"a\"\n\ .align 3\n\ " PPC_LONG "1b,4b,2b,4b\n\ .previous" \ - : "=&r" (prev), "+m" (*uaddr) + : "+r" (ret), "=&r" (prev), "+m" (*uaddr) : "r" (uaddr), "r" (oldval), "r" (newval), "i" (-EFAULT) : "cc", "memory"); - return prev; + *uval = prev; + return ret; } #endif /* __KERNEL__ */ diff --git a/arch/s390/include/asm/futex.h b/arch/s390/include/asm/futex.h index 5c5d02de49e9..27ac515ef59c 100644 --- a/arch/s390/include/asm/futex.h +++ b/arch/s390/include/asm/futex.h @@ -39,13 +39,13 @@ static inline int futex_atomic_op_inuser (int encoded_op, int __user *uaddr) return ret; } -static inline int futex_atomic_cmpxchg_inatomic(int __user *uaddr, +static inline int futex_atomic_cmpxchg_inatomic(int *uval, int __user *uaddr, int oldval, int newval) { if (! 
access_ok (VERIFY_WRITE, uaddr, sizeof(int))) return -EFAULT; - return uaccess.futex_atomic_cmpxchg(uaddr, oldval, newval); + return uaccess.futex_atomic_cmpxchg(uval, uaddr, oldval, newval); } #endif /* __KERNEL__ */ diff --git a/arch/s390/include/asm/uaccess.h b/arch/s390/include/asm/uaccess.h index d6b1ed0ec52b..549adf6a9b8b 100644 --- a/arch/s390/include/asm/uaccess.h +++ b/arch/s390/include/asm/uaccess.h @@ -84,7 +84,7 @@ struct uaccess_ops { size_t (*strnlen_user)(size_t, const char __user *); size_t (*strncpy_from_user)(size_t, const char __user *, char *); int (*futex_atomic_op)(int op, int __user *, int oparg, int *old); - int (*futex_atomic_cmpxchg)(int __user *, int old, int new); + int (*futex_atomic_cmpxchg)(int *, int __user *, int old, int new); }; extern struct uaccess_ops uaccess; diff --git a/arch/s390/lib/uaccess.h b/arch/s390/lib/uaccess.h index 126011df14f1..89a80674e44b 100644 --- a/arch/s390/lib/uaccess.h +++ b/arch/s390/lib/uaccess.h @@ -12,12 +12,12 @@ extern size_t copy_from_user_std(size_t, const void __user *, void *); extern size_t copy_to_user_std(size_t, void __user *, const void *); extern size_t strnlen_user_std(size_t, const char __user *); extern size_t strncpy_from_user_std(size_t, const char __user *, char *); -extern int futex_atomic_cmpxchg_std(int __user *, int, int); +extern int futex_atomic_cmpxchg_std(int *, int __user *, int, int); extern int futex_atomic_op_std(int, int __user *, int, int *); extern size_t copy_from_user_pt(size_t, const void __user *, void *); extern size_t copy_to_user_pt(size_t, void __user *, const void *); extern int futex_atomic_op_pt(int, int __user *, int, int *); -extern int futex_atomic_cmpxchg_pt(int __user *, int, int); +extern int futex_atomic_cmpxchg_pt(int *, int __user *, int, int); #endif /* __ARCH_S390_LIB_UACCESS_H */ diff --git a/arch/s390/lib/uaccess_pt.c b/arch/s390/lib/uaccess_pt.c index 404f2de296dc..b3cebcd52f5c 100644 --- a/arch/s390/lib/uaccess_pt.c +++ b/arch/s390/lib/uaccess_pt.c @@ -354,26 +354,29 @@ int futex_atomic_op_pt(int op, int __user *uaddr, int oparg, int *old) return ret; } -static int __futex_atomic_cmpxchg_pt(int __user *uaddr, int oldval, int newval) +static int __futex_atomic_cmpxchg_pt(int *uval, int __user *uaddr, + int oldval, int newval) { int ret; asm volatile("0: cs %1,%4,0(%5)\n" - "1: lr %0,%1\n" + "1: la %0,0\n" "2:\n" EX_TABLE(0b,2b) EX_TABLE(1b,2b) : "=d" (ret), "+d" (oldval), "=m" (*uaddr) : "0" (-EFAULT), "d" (newval), "a" (uaddr), "m" (*uaddr) : "cc", "memory" ); + *uval = oldval; return ret; } -int futex_atomic_cmpxchg_pt(int __user *uaddr, int oldval, int newval) +int futex_atomic_cmpxchg_pt(int *uval, int __user *uaddr, + int oldval, int newval) { int ret; if (segment_eq(get_fs(), KERNEL_DS)) - return __futex_atomic_cmpxchg_pt(uaddr, oldval, newval); + return __futex_atomic_cmpxchg_pt(uval, uaddr, oldval, newval); spin_lock(¤t->mm->page_table_lock); uaddr = (int __user *) __dat_user_addr((unsigned long) uaddr); if (!uaddr) { @@ -382,7 +385,7 @@ int futex_atomic_cmpxchg_pt(int __user *uaddr, int oldval, int newval) } get_page(virt_to_page(uaddr)); spin_unlock(¤t->mm->page_table_lock); - ret = __futex_atomic_cmpxchg_pt(uaddr, oldval, newval); + ret = __futex_atomic_cmpxchg_pt(uval, uaddr, oldval, newval); put_page(virt_to_page(uaddr)); return ret; } diff --git a/arch/s390/lib/uaccess_std.c b/arch/s390/lib/uaccess_std.c index a6c4f7ed24a4..1d6643c0b95f 100644 --- a/arch/s390/lib/uaccess_std.c +++ b/arch/s390/lib/uaccess_std.c @@ -287,19 +287,21 @@ int 
futex_atomic_op_std(int op, int __user *uaddr, int oparg, int *old) return ret; } -int futex_atomic_cmpxchg_std(int __user *uaddr, int oldval, int newval) +int futex_atomic_cmpxchg_std(int *uval, int __user *uaddr, + int oldval, int newval) { int ret; asm volatile( " sacf 256\n" "0: cs %1,%4,0(%5)\n" - "1: lr %0,%1\n" + "1: la %0,0\n" "2: sacf 0\n" EX_TABLE(0b,2b) EX_TABLE(1b,2b) : "=d" (ret), "+d" (oldval), "=m" (*uaddr) : "0" (-EFAULT), "d" (newval), "a" (uaddr), "m" (*uaddr) : "cc", "memory" ); + *uval = oldval; return ret; } diff --git a/arch/sh/include/asm/futex-irq.h b/arch/sh/include/asm/futex-irq.h index a9f16a7f9aea..7b701cbd1e84 100644 --- a/arch/sh/include/asm/futex-irq.h +++ b/arch/sh/include/asm/futex-irq.h @@ -88,7 +88,8 @@ static inline int atomic_futex_op_xchg_xor(int oparg, int __user *uaddr, return ret; } -static inline int atomic_futex_op_cmpxchg_inatomic(int __user *uaddr, +static inline int atomic_futex_op_cmpxchg_inatomic(int *uval, + int __user *uaddr, int oldval, int newval) { unsigned long flags; @@ -102,10 +103,8 @@ static inline int atomic_futex_op_cmpxchg_inatomic(int __user *uaddr, local_irq_restore(flags); - if (ret) - return ret; - - return prev; + *uval = prev; + return ret; } #endif /* __ASM_SH_FUTEX_IRQ_H */ diff --git a/arch/sh/include/asm/futex.h b/arch/sh/include/asm/futex.h index 68256ec5fa35..a8a5125dc9b4 100644 --- a/arch/sh/include/asm/futex.h +++ b/arch/sh/include/asm/futex.h @@ -65,12 +65,13 @@ static inline int futex_atomic_op_inuser(int encoded_op, int __user *uaddr) } static inline int -futex_atomic_cmpxchg_inatomic(int __user *uaddr, int oldval, int newval) +futex_atomic_cmpxchg_inatomic(int *uval, int __user *uaddr, + int oldval, int newval) { if (!access_ok(VERIFY_WRITE, uaddr, sizeof(int))) return -EFAULT; - return atomic_futex_op_cmpxchg_inatomic(uaddr, oldval, newval); + return atomic_futex_op_cmpxchg_inatomic(uval, uaddr, oldval, newval); } #endif /* __KERNEL__ */ diff --git a/arch/sparc/include/asm/futex_64.h b/arch/sparc/include/asm/futex_64.h index 47f95839dc69..e0862200d6a1 100644 --- a/arch/sparc/include/asm/futex_64.h +++ b/arch/sparc/include/asm/futex_64.h @@ -85,26 +85,30 @@ static inline int futex_atomic_op_inuser(int encoded_op, int __user *uaddr) } static inline int -futex_atomic_cmpxchg_inatomic(int __user *uaddr, int oldval, int newval) +futex_atomic_cmpxchg_inatomic(int *uval, int __user *uaddr, + int oldval, int newval) { + int ret = 0; + __asm__ __volatile__( - "\n1: casa [%3] %%asi, %2, %0\n" + "\n1: casa [%4] %%asi, %3, %1\n" "2:\n" " .section .fixup,#alloc,#execinstr\n" " .align 4\n" "3: sethi %%hi(2b), %0\n" " jmpl %0 + %%lo(2b), %%g0\n" - " mov %4, %0\n" + " mov %5, %0\n" " .previous\n" " .section __ex_table,\"a\"\n" " .align 4\n" " .word 1b, 3b\n" " .previous\n" - : "=r" (newval) - : "0" (newval), "r" (oldval), "r" (uaddr), "i" (-EFAULT) + : "+r" (ret), "=r" (newval) + : "1" (newval), "r" (oldval), "r" (uaddr), "i" (-EFAULT) : "memory"); - return newval; + *uval = newval; + return ret; } #endif /* !(_SPARC64_FUTEX_H) */ diff --git a/arch/tile/include/asm/futex.h b/arch/tile/include/asm/futex.h index fe0d10dcae57..664b20aa2584 100644 --- a/arch/tile/include/asm/futex.h +++ b/arch/tile/include/asm/futex.h @@ -119,8 +119,8 @@ static inline int futex_atomic_op_inuser(int encoded_op, int __user *uaddr) return ret; } -static inline int futex_atomic_cmpxchg_inatomic(int __user *uaddr, int oldval, - int newval) +static inline int futex_atomic_cmpxchg_inatomic(int *uval, int __user *uaddr, + int oldval, int newval) { struct 
__get_user asm_ret; @@ -128,7 +128,8 @@ static inline int futex_atomic_cmpxchg_inatomic(int __user *uaddr, int oldval, return -EFAULT; asm_ret = futex_cmpxchg(uaddr, oldval, newval); - return asm_ret.err ? asm_ret.err : asm_ret.val; + *uval = asm_ret.val; + return asm_ret.err; } #ifndef __tilegx__ diff --git a/arch/x86/include/asm/futex.h b/arch/x86/include/asm/futex.h index 1f11ce44e956..884c0b5676f4 100644 --- a/arch/x86/include/asm/futex.h +++ b/arch/x86/include/asm/futex.h @@ -109,9 +109,10 @@ static inline int futex_atomic_op_inuser(int encoded_op, int __user *uaddr) return ret; } -static inline int futex_atomic_cmpxchg_inatomic(int __user *uaddr, int oldval, - int newval) +static inline int futex_atomic_cmpxchg_inatomic(int *uval, int __user *uaddr, + int oldval, int newval) { + int ret = 0; #if defined(CONFIG_X86_32) && !defined(CONFIG_X86_BSWAP) /* Real i386 machines have no cmpxchg instruction */ @@ -122,18 +123,19 @@ static inline int futex_atomic_cmpxchg_inatomic(int __user *uaddr, int oldval, if (!access_ok(VERIFY_WRITE, uaddr, sizeof(int))) return -EFAULT; - asm volatile("1:\t" LOCK_PREFIX "cmpxchgl %3, %1\n" + asm volatile("1:\t" LOCK_PREFIX "cmpxchgl %4, %2\n" "2:\t.section .fixup, \"ax\"\n" - "3:\tmov %2, %0\n" + "3:\tmov %3, %0\n" "\tjmp 2b\n" "\t.previous\n" _ASM_EXTABLE(1b, 3b) - : "=a" (oldval), "+m" (*uaddr) - : "i" (-EFAULT), "r" (newval), "0" (oldval) + : "+r" (ret), "=a" (oldval), "+m" (*uaddr) + : "i" (-EFAULT), "r" (newval), "1" (oldval) : "memory" ); - return oldval; + *uval = oldval; + return ret; } #endif diff --git a/include/asm-generic/futex.h b/include/asm-generic/futex.h index 3c2344f48136..132bf5227b44 100644 --- a/include/asm-generic/futex.h +++ b/include/asm-generic/futex.h @@ -48,7 +48,8 @@ futex_atomic_op_inuser (int encoded_op, int __user *uaddr) } static inline int -futex_atomic_cmpxchg_inatomic(int __user *uaddr, int oldval, int newval) +futex_atomic_cmpxchg_inatomic(int *uval, int __user *uaddr, + int oldval, int newval) { return -ENOSYS; } diff --git a/kernel/futex.c b/kernel/futex.c index 773815465bac..237f14bfc022 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -381,15 +381,16 @@ static struct futex_q *futex_top_waiter(struct futex_hash_bucket *hb, return NULL; } -static u32 cmpxchg_futex_value_locked(u32 __user *uaddr, u32 uval, u32 newval) +static int cmpxchg_futex_value_locked(u32 *curval, u32 __user *uaddr, + u32 uval, u32 newval) { - u32 curval; + int ret; pagefault_disable(); - curval = futex_atomic_cmpxchg_inatomic(uaddr, uval, newval); + ret = futex_atomic_cmpxchg_inatomic(curval, uaddr, uval, newval); pagefault_enable(); - return curval; + return ret; } static int get_futex_value_locked(u32 *dest, u32 __user *from) @@ -688,9 +689,7 @@ retry: if (set_waiters) newval |= FUTEX_WAITERS; - curval = cmpxchg_futex_value_locked(uaddr, 0, newval); - - if (unlikely(curval == -EFAULT)) + if (unlikely(cmpxchg_futex_value_locked(&curval, uaddr, 0, newval))) return -EFAULT; /* @@ -728,9 +727,7 @@ retry: lock_taken = 1; } - curval = cmpxchg_futex_value_locked(uaddr, uval, newval); - - if (unlikely(curval == -EFAULT)) + if (unlikely(cmpxchg_futex_value_locked(&curval, uaddr, uval, newval))) return -EFAULT; if (unlikely(curval != uval)) goto retry; @@ -843,9 +840,7 @@ static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this) newval = FUTEX_WAITERS | task_pid_vnr(new_owner); - curval = cmpxchg_futex_value_locked(uaddr, uval, newval); - - if (curval == -EFAULT) + if (cmpxchg_futex_value_locked(&curval, uaddr, uval, newval)) ret = 
-EFAULT; else if (curval != uval) ret = -EINVAL; @@ -880,10 +875,8 @@ static int unlock_futex_pi(u32 __user *uaddr, u32 uval) * There is no waiter, so we unlock the futex. The owner died * bit has not to be preserved here. We are the owner: */ - oldval = cmpxchg_futex_value_locked(uaddr, uval, 0); - - if (oldval == -EFAULT) - return oldval; + if (cmpxchg_futex_value_locked(&oldval, uaddr, uval, 0)) + return -EFAULT; if (oldval != uval) return -EAGAIN; @@ -1578,9 +1571,7 @@ retry: while (1) { newval = (uval & FUTEX_OWNER_DIED) | newtid; - curval = cmpxchg_futex_value_locked(uaddr, uval, newval); - - if (curval == -EFAULT) + if (cmpxchg_futex_value_locked(&curval, uaddr, uval, newval)) goto handle_fault; if (curval == uval) break; @@ -2073,11 +2064,8 @@ retry: * again. If it succeeds then we can return without waking * anyone else up: */ - if (!(uval & FUTEX_OWNER_DIED)) - uval = cmpxchg_futex_value_locked(uaddr, vpid, 0); - - - if (unlikely(uval == -EFAULT)) + if (!(uval & FUTEX_OWNER_DIED) && + cmpxchg_futex_value_locked(&uval, uaddr, vpid, 0)) goto pi_faulted; /* * Rare case: we managed to release the lock atomically, @@ -2464,9 +2452,7 @@ retry: * userspace. */ mval = (uval & FUTEX_WAITERS) | FUTEX_OWNER_DIED; - nval = futex_atomic_cmpxchg_inatomic(uaddr, uval, mval); - - if (nval == -EFAULT) + if (futex_atomic_cmpxchg_inatomic(&nval, uaddr, uval, mval)) return -1; if (nval != uval) @@ -2679,8 +2665,7 @@ static int __init futex_init(void) * implementation, the non-functional ones will return * -ENOSYS. */ - curval = cmpxchg_futex_value_locked(NULL, 0, 0); - if (curval == -EFAULT) + if (cmpxchg_futex_value_locked(&curval, NULL, 0, 0) == -EFAULT) futex_cmpxchg_enabled = 1; for (i = 0; i < ARRAY_SIZE(futex_queues); i++) { From 8d7718aa082aaf30a0b4989e1f04858952f941bc Mon Sep 17 00:00:00 2001 From: Michel Lespinasse Date: Thu, 10 Mar 2011 18:50:58 -0800 Subject: [PATCH 612/706] futex: Sanitize futex ops argument types Change futex_atomic_op_inuser and futex_atomic_cmpxchg_inatomic prototypes to use u32 types for the futex as this is the data type the futex core code uses all over the place. Signed-off-by: Michel Lespinasse Cc: Darren Hart Cc: Peter Zijlstra Cc: Matt Turner Cc: Russell King Cc: David Howells Cc: Tony Luck Cc: Michal Simek Cc: Ralf Baechle Cc: "James E.J. Bottomley" Cc: Benjamin Herrenschmidt Cc: Martin Schwidefsky Cc: Paul Mundt Cc: "David S. 
Miller" Cc: Chris Metcalf Cc: Linus Torvalds LKML-Reference: <20110311025058.GD26122@google.com> Signed-off-by: Thomas Gleixner --- arch/alpha/include/asm/futex.h | 13 +++++++------ arch/arm/include/asm/futex.h | 13 +++++++------ arch/frv/include/asm/futex.h | 6 +++--- arch/frv/kernel/futex.c | 14 +++++++------- arch/ia64/include/asm/futex.h | 10 +++++----- arch/microblaze/include/asm/futex.h | 13 +++++++------ arch/mips/include/asm/futex.h | 13 +++++++------ arch/parisc/include/asm/futex.h | 12 ++++++------ arch/powerpc/include/asm/futex.h | 13 +++++++------ arch/s390/include/asm/futex.h | 10 +++++----- arch/s390/include/asm/uaccess.h | 4 ++-- arch/s390/lib/uaccess.h | 8 ++++---- arch/s390/lib/uaccess_pt.c | 12 ++++++------ arch/s390/lib/uaccess_std.c | 6 +++--- arch/sh/include/asm/futex-irq.h | 19 ++++++++++--------- arch/sh/include/asm/futex.h | 10 +++++----- arch/sparc/include/asm/futex_64.h | 8 ++++---- arch/tile/include/asm/futex.h | 24 ++++++++++++------------ arch/x86/include/asm/futex.h | 10 +++++----- include/asm-generic/futex.h | 8 ++++---- 20 files changed, 116 insertions(+), 110 deletions(-) diff --git a/arch/alpha/include/asm/futex.h b/arch/alpha/include/asm/futex.h index c4e5c2850cce..e8a761aee088 100644 --- a/arch/alpha/include/asm/futex.h +++ b/arch/alpha/include/asm/futex.h @@ -29,7 +29,7 @@ : "r" (uaddr), "r"(oparg) \ : "memory") -static inline int futex_atomic_op_inuser (int encoded_op, int __user *uaddr) +static inline int futex_atomic_op_inuser (int encoded_op, u32 __user *uaddr) { int op = (encoded_op >> 28) & 7; int cmp = (encoded_op >> 24) & 15; @@ -39,7 +39,7 @@ static inline int futex_atomic_op_inuser (int encoded_op, int __user *uaddr) if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28)) oparg = 1 << oparg; - if (!access_ok(VERIFY_WRITE, uaddr, sizeof(int))) + if (!access_ok(VERIFY_WRITE, uaddr, sizeof(u32))) return -EFAULT; pagefault_disable(); @@ -81,12 +81,13 @@ static inline int futex_atomic_op_inuser (int encoded_op, int __user *uaddr) } static inline int -futex_atomic_cmpxchg_inatomic(int *uval, int __user *uaddr, - int oldval, int newval) +futex_atomic_cmpxchg_inatomic(u32 *uval, u32 __user *uaddr, + u32 oldval, u32 newval) { - int ret = 0, prev, cmp; + int ret = 0, cmp; + u32 prev; - if (!access_ok(VERIFY_WRITE, uaddr, sizeof(int))) + if (!access_ok(VERIFY_WRITE, uaddr, sizeof(u32))) return -EFAULT; __asm__ __volatile__ ( diff --git a/arch/arm/include/asm/futex.h b/arch/arm/include/asm/futex.h index d20b78fce758..0e29d8e6a5c2 100644 --- a/arch/arm/include/asm/futex.h +++ b/arch/arm/include/asm/futex.h @@ -35,7 +35,7 @@ : "cc", "memory") static inline int -futex_atomic_op_inuser (int encoded_op, int __user *uaddr) +futex_atomic_op_inuser (int encoded_op, u32 __user *uaddr) { int op = (encoded_op >> 28) & 7; int cmp = (encoded_op >> 24) & 15; @@ -46,7 +46,7 @@ futex_atomic_op_inuser (int encoded_op, int __user *uaddr) if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28)) oparg = 1 << oparg; - if (!access_ok(VERIFY_WRITE, uaddr, sizeof(int))) + if (!access_ok(VERIFY_WRITE, uaddr, sizeof(u32))) return -EFAULT; pagefault_disable(); /* implies preempt_disable() */ @@ -88,12 +88,13 @@ futex_atomic_op_inuser (int encoded_op, int __user *uaddr) } static inline int -futex_atomic_cmpxchg_inatomic(int *uval, int __user *uaddr, - int oldval, int newval) +futex_atomic_cmpxchg_inatomic(u32 *uval, u32 __user *uaddr, + u32 oldval, u32 newval) { - int ret = 0, val; + int ret = 0; + u32 val; - if (!access_ok(VERIFY_WRITE, uaddr, sizeof(int))) + if (!access_ok(VERIFY_WRITE, uaddr, 
sizeof(u32))) return -EFAULT; /* Note that preemption is disabled by futex_atomic_cmpxchg_inatomic diff --git a/arch/frv/include/asm/futex.h b/arch/frv/include/asm/futex.h index 0548f8e4d11e..4bea27f50a7a 100644 --- a/arch/frv/include/asm/futex.h +++ b/arch/frv/include/asm/futex.h @@ -7,11 +7,11 @@ #include #include -extern int futex_atomic_op_inuser(int encoded_op, int __user *uaddr); +extern int futex_atomic_op_inuser(int encoded_op, u32 __user *uaddr); static inline int -futex_atomic_cmpxchg_inatomic(int *uval, int __user *uaddr, - int oldval, int newval) +futex_atomic_cmpxchg_inatomic(u32 *uval, u32 __user *uaddr, + u32 oldval, u32 newval) { return -ENOSYS; } diff --git a/arch/frv/kernel/futex.c b/arch/frv/kernel/futex.c index 14f64b054c7e..d155ca9e5098 100644 --- a/arch/frv/kernel/futex.c +++ b/arch/frv/kernel/futex.c @@ -18,7 +18,7 @@ * the various futex operations; MMU fault checking is ignored under no-MMU * conditions */ -static inline int atomic_futex_op_xchg_set(int oparg, int __user *uaddr, int *_oldval) +static inline int atomic_futex_op_xchg_set(int oparg, u32 __user *uaddr, int *_oldval) { int oldval, ret; @@ -50,7 +50,7 @@ static inline int atomic_futex_op_xchg_set(int oparg, int __user *uaddr, int *_o return ret; } -static inline int atomic_futex_op_xchg_add(int oparg, int __user *uaddr, int *_oldval) +static inline int atomic_futex_op_xchg_add(int oparg, u32 __user *uaddr, int *_oldval) { int oldval, ret; @@ -83,7 +83,7 @@ static inline int atomic_futex_op_xchg_add(int oparg, int __user *uaddr, int *_o return ret; } -static inline int atomic_futex_op_xchg_or(int oparg, int __user *uaddr, int *_oldval) +static inline int atomic_futex_op_xchg_or(int oparg, u32 __user *uaddr, int *_oldval) { int oldval, ret; @@ -116,7 +116,7 @@ static inline int atomic_futex_op_xchg_or(int oparg, int __user *uaddr, int *_ol return ret; } -static inline int atomic_futex_op_xchg_and(int oparg, int __user *uaddr, int *_oldval) +static inline int atomic_futex_op_xchg_and(int oparg, u32 __user *uaddr, int *_oldval) { int oldval, ret; @@ -149,7 +149,7 @@ static inline int atomic_futex_op_xchg_and(int oparg, int __user *uaddr, int *_o return ret; } -static inline int atomic_futex_op_xchg_xor(int oparg, int __user *uaddr, int *_oldval) +static inline int atomic_futex_op_xchg_xor(int oparg, u32 __user *uaddr, int *_oldval) { int oldval, ret; @@ -186,7 +186,7 @@ static inline int atomic_futex_op_xchg_xor(int oparg, int __user *uaddr, int *_o /* * do the futex operations */ -int futex_atomic_op_inuser(int encoded_op, int __user *uaddr) +int futex_atomic_op_inuser(int encoded_op, u32 __user *uaddr) { int op = (encoded_op >> 28) & 7; int cmp = (encoded_op >> 24) & 15; @@ -197,7 +197,7 @@ int futex_atomic_op_inuser(int encoded_op, int __user *uaddr) if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28)) oparg = 1 << oparg; - if (!access_ok(VERIFY_WRITE, uaddr, sizeof(int))) + if (!access_ok(VERIFY_WRITE, uaddr, sizeof(u32))) return -EFAULT; pagefault_disable(); diff --git a/arch/ia64/include/asm/futex.h b/arch/ia64/include/asm/futex.h index b0728404dad0..8428525ddb22 100644 --- a/arch/ia64/include/asm/futex.h +++ b/arch/ia64/include/asm/futex.h @@ -46,7 +46,7 @@ do { \ } while (0) static inline int -futex_atomic_op_inuser (int encoded_op, int __user *uaddr) +futex_atomic_op_inuser (int encoded_op, u32 __user *uaddr) { int op = (encoded_op >> 28) & 7; int cmp = (encoded_op >> 24) & 15; @@ -56,7 +56,7 @@ futex_atomic_op_inuser (int encoded_op, int __user *uaddr) if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28)) oparg 
= 1 << oparg; - if (! access_ok (VERIFY_WRITE, uaddr, sizeof(int))) + if (! access_ok (VERIFY_WRITE, uaddr, sizeof(u32))) return -EFAULT; pagefault_disable(); @@ -100,10 +100,10 @@ futex_atomic_op_inuser (int encoded_op, int __user *uaddr) } static inline int -futex_atomic_cmpxchg_inatomic(int *uval, int __user *uaddr, - int oldval, int newval) +futex_atomic_cmpxchg_inatomic(u32 *uval, u32 __user *uaddr, + u32 oldval, u32 newval) { - if (!access_ok(VERIFY_WRITE, uaddr, sizeof(int))) + if (!access_ok(VERIFY_WRITE, uaddr, sizeof(u32))) return -EFAULT; { diff --git a/arch/microblaze/include/asm/futex.h b/arch/microblaze/include/asm/futex.h index fa019ed65dfb..b0526d2716fa 100644 --- a/arch/microblaze/include/asm/futex.h +++ b/arch/microblaze/include/asm/futex.h @@ -29,7 +29,7 @@ }) static inline int -futex_atomic_op_inuser(int encoded_op, int __user *uaddr) +futex_atomic_op_inuser(int encoded_op, u32 __user *uaddr) { int op = (encoded_op >> 28) & 7; int cmp = (encoded_op >> 24) & 15; @@ -39,7 +39,7 @@ futex_atomic_op_inuser(int encoded_op, int __user *uaddr) if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28)) oparg = 1 << oparg; - if (!access_ok(VERIFY_WRITE, uaddr, sizeof(int))) + if (!access_ok(VERIFY_WRITE, uaddr, sizeof(u32))) return -EFAULT; pagefault_disable(); @@ -94,12 +94,13 @@ futex_atomic_op_inuser(int encoded_op, int __user *uaddr) } static inline int -futex_atomic_cmpxchg_inatomic(int *uval, int __user *uaddr, - int oldval, int newval) +futex_atomic_cmpxchg_inatomic(u32 *uval, u32 __user *uaddr, + u32 oldval, u32 newval) { - int ret = 0, prev, cmp; + int ret = 0, cmp; + u32 prev; - if (!access_ok(VERIFY_WRITE, uaddr, sizeof(int))) + if (!access_ok(VERIFY_WRITE, uaddr, sizeof(u32))) return -EFAULT; __asm__ __volatile__ ("1: lwx %1, %3, r0; \ diff --git a/arch/mips/include/asm/futex.h b/arch/mips/include/asm/futex.h index 692a24bd83b7..6ebf1734b411 100644 --- a/arch/mips/include/asm/futex.h +++ b/arch/mips/include/asm/futex.h @@ -75,7 +75,7 @@ } static inline int -futex_atomic_op_inuser(int encoded_op, int __user *uaddr) +futex_atomic_op_inuser(int encoded_op, u32 __user *uaddr) { int op = (encoded_op >> 28) & 7; int cmp = (encoded_op >> 24) & 15; @@ -85,7 +85,7 @@ futex_atomic_op_inuser(int encoded_op, int __user *uaddr) if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28)) oparg = 1 << oparg; - if (! access_ok (VERIFY_WRITE, uaddr, sizeof(int))) + if (! access_ok (VERIFY_WRITE, uaddr, sizeof(u32))) return -EFAULT; pagefault_disable(); @@ -132,12 +132,13 @@ futex_atomic_op_inuser(int encoded_op, int __user *uaddr) } static inline int -futex_atomic_cmpxchg_inatomic(int *uval, int __user *uaddr, - int oldval, int newval) +futex_atomic_cmpxchg_inatomic(u32 *uval, u32 __user *uaddr, + u32 oldval, u32 newval) { - int ret = 0, val; + int ret = 0; + u32 val; - if (!access_ok(VERIFY_WRITE, uaddr, sizeof(int))) + if (!access_ok(VERIFY_WRITE, uaddr, sizeof(u32))) return -EFAULT; if (cpu_has_llsc && R10000_LLSC_WAR) { diff --git a/arch/parisc/include/asm/futex.h b/arch/parisc/include/asm/futex.h index 4c6d8672325b..67a33cc27ef2 100644 --- a/arch/parisc/include/asm/futex.h +++ b/arch/parisc/include/asm/futex.h @@ -8,7 +8,7 @@ #include static inline int -futex_atomic_op_inuser (int encoded_op, int __user *uaddr) +futex_atomic_op_inuser (int encoded_op, u32 __user *uaddr) { int op = (encoded_op >> 28) & 7; int cmp = (encoded_op >> 24) & 15; @@ -18,7 +18,7 @@ futex_atomic_op_inuser (int encoded_op, int __user *uaddr) if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28)) oparg = 1 << oparg; - if (! 
access_ok (VERIFY_WRITE, uaddr, sizeof(int))) + if (! access_ok (VERIFY_WRITE, uaddr, sizeof(u32))) return -EFAULT; pagefault_disable(); @@ -51,10 +51,10 @@ futex_atomic_op_inuser (int encoded_op, int __user *uaddr) /* Non-atomic version */ static inline int -futex_atomic_cmpxchg_inatomic(int *uval, int __user *uaddr, - int oldval, int newval) +futex_atomic_cmpxchg_inatomic(u32 *uval, u32 __user *uaddr, + u32 oldval, u32 newval) { - int val; + u32 val; /* futex.c wants to do a cmpxchg_inatomic on kernel NULL, which is * our gateway page, and causes no end of trouble... @@ -62,7 +62,7 @@ futex_atomic_cmpxchg_inatomic(int *uval, int __user *uaddr, if (segment_eq(KERNEL_DS, get_fs()) && !uaddr) return -EFAULT; - if (!access_ok(VERIFY_WRITE, uaddr, sizeof(int))) + if (!access_ok(VERIFY_WRITE, uaddr, sizeof(u32))) return -EFAULT; if (get_user(val, uaddr)) diff --git a/arch/powerpc/include/asm/futex.h b/arch/powerpc/include/asm/futex.h index 631e8da60064..c94e4a3fe2ef 100644 --- a/arch/powerpc/include/asm/futex.h +++ b/arch/powerpc/include/asm/futex.h @@ -30,7 +30,7 @@ : "b" (uaddr), "i" (-EFAULT), "r" (oparg) \ : "cr0", "memory") -static inline int futex_atomic_op_inuser (int encoded_op, int __user *uaddr) +static inline int futex_atomic_op_inuser (int encoded_op, u32 __user *uaddr) { int op = (encoded_op >> 28) & 7; int cmp = (encoded_op >> 24) & 15; @@ -40,7 +40,7 @@ static inline int futex_atomic_op_inuser (int encoded_op, int __user *uaddr) if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28)) oparg = 1 << oparg; - if (! access_ok (VERIFY_WRITE, uaddr, sizeof(int))) + if (! access_ok (VERIFY_WRITE, uaddr, sizeof(u32))) return -EFAULT; pagefault_disable(); @@ -82,12 +82,13 @@ static inline int futex_atomic_op_inuser (int encoded_op, int __user *uaddr) } static inline int -futex_atomic_cmpxchg_inatomic(int *uval, int __user *uaddr, - int oldval, int newval) +futex_atomic_cmpxchg_inatomic(u32 *uval, u32 __user *uaddr, + u32 oldval, u32 newval) { - int ret = 0, prev; + int ret = 0; + u32 prev; - if (!access_ok(VERIFY_WRITE, uaddr, sizeof(int))) + if (!access_ok(VERIFY_WRITE, uaddr, sizeof(u32))) return -EFAULT; __asm__ __volatile__ ( diff --git a/arch/s390/include/asm/futex.h b/arch/s390/include/asm/futex.h index 27ac515ef59c..81cf36b691f1 100644 --- a/arch/s390/include/asm/futex.h +++ b/arch/s390/include/asm/futex.h @@ -7,7 +7,7 @@ #include #include -static inline int futex_atomic_op_inuser (int encoded_op, int __user *uaddr) +static inline int futex_atomic_op_inuser (int encoded_op, u32 __user *uaddr) { int op = (encoded_op >> 28) & 7; int cmp = (encoded_op >> 24) & 15; @@ -18,7 +18,7 @@ static inline int futex_atomic_op_inuser (int encoded_op, int __user *uaddr) if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28)) oparg = 1 << oparg; - if (! access_ok (VERIFY_WRITE, uaddr, sizeof(int))) + if (! access_ok (VERIFY_WRITE, uaddr, sizeof(u32))) return -EFAULT; pagefault_disable(); @@ -39,10 +39,10 @@ static inline int futex_atomic_op_inuser (int encoded_op, int __user *uaddr) return ret; } -static inline int futex_atomic_cmpxchg_inatomic(int *uval, int __user *uaddr, - int oldval, int newval) +static inline int futex_atomic_cmpxchg_inatomic(u32 *uval, u32 __user *uaddr, + u32 oldval, u32 newval) { - if (! access_ok (VERIFY_WRITE, uaddr, sizeof(int))) + if (! 
access_ok (VERIFY_WRITE, uaddr, sizeof(u32))) return -EFAULT; return uaccess.futex_atomic_cmpxchg(uval, uaddr, oldval, newval); diff --git a/arch/s390/include/asm/uaccess.h b/arch/s390/include/asm/uaccess.h index 549adf6a9b8b..2d9ea11f919a 100644 --- a/arch/s390/include/asm/uaccess.h +++ b/arch/s390/include/asm/uaccess.h @@ -83,8 +83,8 @@ struct uaccess_ops { size_t (*clear_user)(size_t, void __user *); size_t (*strnlen_user)(size_t, const char __user *); size_t (*strncpy_from_user)(size_t, const char __user *, char *); - int (*futex_atomic_op)(int op, int __user *, int oparg, int *old); - int (*futex_atomic_cmpxchg)(int *, int __user *, int old, int new); + int (*futex_atomic_op)(int op, u32 __user *, int oparg, int *old); + int (*futex_atomic_cmpxchg)(u32 *, u32 __user *, u32 old, u32 new); }; extern struct uaccess_ops uaccess; diff --git a/arch/s390/lib/uaccess.h b/arch/s390/lib/uaccess.h index 89a80674e44b..1d2536cb630b 100644 --- a/arch/s390/lib/uaccess.h +++ b/arch/s390/lib/uaccess.h @@ -12,12 +12,12 @@ extern size_t copy_from_user_std(size_t, const void __user *, void *); extern size_t copy_to_user_std(size_t, void __user *, const void *); extern size_t strnlen_user_std(size_t, const char __user *); extern size_t strncpy_from_user_std(size_t, const char __user *, char *); -extern int futex_atomic_cmpxchg_std(int *, int __user *, int, int); -extern int futex_atomic_op_std(int, int __user *, int, int *); +extern int futex_atomic_cmpxchg_std(u32 *, u32 __user *, u32, u32); +extern int futex_atomic_op_std(int, u32 __user *, int, int *); extern size_t copy_from_user_pt(size_t, const void __user *, void *); extern size_t copy_to_user_pt(size_t, void __user *, const void *); -extern int futex_atomic_op_pt(int, int __user *, int, int *); -extern int futex_atomic_cmpxchg_pt(int *, int __user *, int, int); +extern int futex_atomic_op_pt(int, u32 __user *, int, int *); +extern int futex_atomic_cmpxchg_pt(u32 *, u32 __user *, u32, u32); #endif /* __ARCH_S390_LIB_UACCESS_H */ diff --git a/arch/s390/lib/uaccess_pt.c b/arch/s390/lib/uaccess_pt.c index b3cebcd52f5c..74833831417f 100644 --- a/arch/s390/lib/uaccess_pt.c +++ b/arch/s390/lib/uaccess_pt.c @@ -302,7 +302,7 @@ fault: : "0" (-EFAULT), "d" (oparg), "a" (uaddr), \ "m" (*uaddr) : "cc" ); -static int __futex_atomic_op_pt(int op, int __user *uaddr, int oparg, int *old) +static int __futex_atomic_op_pt(int op, u32 __user *uaddr, int oparg, int *old) { int oldval = 0, newval, ret; @@ -335,7 +335,7 @@ static int __futex_atomic_op_pt(int op, int __user *uaddr, int oparg, int *old) return ret; } -int futex_atomic_op_pt(int op, int __user *uaddr, int oparg, int *old) +int futex_atomic_op_pt(int op, u32 __user *uaddr, int oparg, int *old) { int ret; @@ -354,8 +354,8 @@ int futex_atomic_op_pt(int op, int __user *uaddr, int oparg, int *old) return ret; } -static int __futex_atomic_cmpxchg_pt(int *uval, int __user *uaddr, - int oldval, int newval) +static int __futex_atomic_cmpxchg_pt(u32 *uval, u32 __user *uaddr, + u32 oldval, u32 newval) { int ret; @@ -370,8 +370,8 @@ static int __futex_atomic_cmpxchg_pt(int *uval, int __user *uaddr, return ret; } -int futex_atomic_cmpxchg_pt(int *uval, int __user *uaddr, - int oldval, int newval) +int futex_atomic_cmpxchg_pt(u32 *uval, u32 __user *uaddr, + u32 oldval, u32 newval) { int ret; diff --git a/arch/s390/lib/uaccess_std.c b/arch/s390/lib/uaccess_std.c index 1d6643c0b95f..bb1a7eed42ce 100644 --- a/arch/s390/lib/uaccess_std.c +++ b/arch/s390/lib/uaccess_std.c @@ -255,7 +255,7 @@ size_t 
strncpy_from_user_std(size_t size, const char __user *src, char *dst) : "0" (-EFAULT), "d" (oparg), "a" (uaddr), \ "m" (*uaddr) : "cc"); -int futex_atomic_op_std(int op, int __user *uaddr, int oparg, int *old) +int futex_atomic_op_std(int op, u32 __user *uaddr, int oparg, int *old) { int oldval = 0, newval, ret; @@ -287,8 +287,8 @@ int futex_atomic_op_std(int op, int __user *uaddr, int oparg, int *old) return ret; } -int futex_atomic_cmpxchg_std(int *uval, int __user *uaddr, - int oldval, int newval) +int futex_atomic_cmpxchg_std(u32 *uval, u32 __user *uaddr, + u32 oldval, u32 newval) { int ret; diff --git a/arch/sh/include/asm/futex-irq.h b/arch/sh/include/asm/futex-irq.h index 7b701cbd1e84..6cb9f193a95e 100644 --- a/arch/sh/include/asm/futex-irq.h +++ b/arch/sh/include/asm/futex-irq.h @@ -3,7 +3,7 @@ #include -static inline int atomic_futex_op_xchg_set(int oparg, int __user *uaddr, +static inline int atomic_futex_op_xchg_set(int oparg, u32 __user *uaddr, int *oldval) { unsigned long flags; @@ -20,7 +20,7 @@ static inline int atomic_futex_op_xchg_set(int oparg, int __user *uaddr, return ret; } -static inline int atomic_futex_op_xchg_add(int oparg, int __user *uaddr, +static inline int atomic_futex_op_xchg_add(int oparg, u32 __user *uaddr, int *oldval) { unsigned long flags; @@ -37,7 +37,7 @@ static inline int atomic_futex_op_xchg_add(int oparg, int __user *uaddr, return ret; } -static inline int atomic_futex_op_xchg_or(int oparg, int __user *uaddr, +static inline int atomic_futex_op_xchg_or(int oparg, u32 __user *uaddr, int *oldval) { unsigned long flags; @@ -54,7 +54,7 @@ static inline int atomic_futex_op_xchg_or(int oparg, int __user *uaddr, return ret; } -static inline int atomic_futex_op_xchg_and(int oparg, int __user *uaddr, +static inline int atomic_futex_op_xchg_and(int oparg, u32 __user *uaddr, int *oldval) { unsigned long flags; @@ -71,7 +71,7 @@ static inline int atomic_futex_op_xchg_and(int oparg, int __user *uaddr, return ret; } -static inline int atomic_futex_op_xchg_xor(int oparg, int __user *uaddr, +static inline int atomic_futex_op_xchg_xor(int oparg, u32 __user *uaddr, int *oldval) { unsigned long flags; @@ -88,12 +88,13 @@ static inline int atomic_futex_op_xchg_xor(int oparg, int __user *uaddr, return ret; } -static inline int atomic_futex_op_cmpxchg_inatomic(int *uval, - int __user *uaddr, - int oldval, int newval) +static inline int atomic_futex_op_cmpxchg_inatomic(u32 *uval, + u32 __user *uaddr, + u32 oldval, u32 newval) { unsigned long flags; - int ret, prev = 0; + int ret; + u32 prev = 0; local_irq_save(flags); diff --git a/arch/sh/include/asm/futex.h b/arch/sh/include/asm/futex.h index a8a5125dc9b4..7be39a646fbd 100644 --- a/arch/sh/include/asm/futex.h +++ b/arch/sh/include/asm/futex.h @@ -10,7 +10,7 @@ /* XXX: UP variants, fix for SH-4A and SMP.. 
*/ #include -static inline int futex_atomic_op_inuser(int encoded_op, int __user *uaddr) +static inline int futex_atomic_op_inuser(int encoded_op, u32 __user *uaddr) { int op = (encoded_op >> 28) & 7; int cmp = (encoded_op >> 24) & 15; @@ -21,7 +21,7 @@ static inline int futex_atomic_op_inuser(int encoded_op, int __user *uaddr) if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28)) oparg = 1 << oparg; - if (!access_ok(VERIFY_WRITE, uaddr, sizeof(int))) + if (!access_ok(VERIFY_WRITE, uaddr, sizeof(u32))) return -EFAULT; pagefault_disable(); @@ -65,10 +65,10 @@ static inline int futex_atomic_op_inuser(int encoded_op, int __user *uaddr) } static inline int -futex_atomic_cmpxchg_inatomic(int *uval, int __user *uaddr, - int oldval, int newval) +futex_atomic_cmpxchg_inatomic(u32 *uval, u32 __user *uaddr, + u32 oldval, u32 newval) { - if (!access_ok(VERIFY_WRITE, uaddr, sizeof(int))) + if (!access_ok(VERIFY_WRITE, uaddr, sizeof(u32))) return -EFAULT; return atomic_futex_op_cmpxchg_inatomic(uval, uaddr, oldval, newval); diff --git a/arch/sparc/include/asm/futex_64.h b/arch/sparc/include/asm/futex_64.h index e0862200d6a1..444e7bea23bc 100644 --- a/arch/sparc/include/asm/futex_64.h +++ b/arch/sparc/include/asm/futex_64.h @@ -30,7 +30,7 @@ : "r" (uaddr), "r" (oparg), "i" (-EFAULT) \ : "memory") -static inline int futex_atomic_op_inuser(int encoded_op, int __user *uaddr) +static inline int futex_atomic_op_inuser(int encoded_op, u32 __user *uaddr) { int op = (encoded_op >> 28) & 7; int cmp = (encoded_op >> 24) & 15; @@ -38,7 +38,7 @@ static inline int futex_atomic_op_inuser(int encoded_op, int __user *uaddr) int cmparg = (encoded_op << 20) >> 20; int oldval = 0, ret, tem; - if (unlikely(!access_ok(VERIFY_WRITE, uaddr, sizeof(int)))) + if (unlikely(!access_ok(VERIFY_WRITE, uaddr, sizeof(u32)))) return -EFAULT; if (unlikely((((unsigned long) uaddr) & 0x3UL))) return -EINVAL; @@ -85,8 +85,8 @@ static inline int futex_atomic_op_inuser(int encoded_op, int __user *uaddr) } static inline int -futex_atomic_cmpxchg_inatomic(int *uval, int __user *uaddr, - int oldval, int newval) +futex_atomic_cmpxchg_inatomic(u32 *uval, u32 __user *uaddr, + u32 oldval, u32 newval) { int ret = 0; diff --git a/arch/tile/include/asm/futex.h b/arch/tile/include/asm/futex.h index 664b20aa2584..d03ec124a598 100644 --- a/arch/tile/include/asm/futex.h +++ b/arch/tile/include/asm/futex.h @@ -29,16 +29,16 @@ #include #include -extern struct __get_user futex_set(int __user *v, int i); -extern struct __get_user futex_add(int __user *v, int n); -extern struct __get_user futex_or(int __user *v, int n); -extern struct __get_user futex_andn(int __user *v, int n); -extern struct __get_user futex_cmpxchg(int __user *v, int o, int n); +extern struct __get_user futex_set(u32 __user *v, int i); +extern struct __get_user futex_add(u32 __user *v, int n); +extern struct __get_user futex_or(u32 __user *v, int n); +extern struct __get_user futex_andn(u32 __user *v, int n); +extern struct __get_user futex_cmpxchg(u32 __user *v, int o, int n); #ifndef __tilegx__ -extern struct __get_user futex_xor(int __user *v, int n); +extern struct __get_user futex_xor(u32 __user *v, int n); #else -static inline struct __get_user futex_xor(int __user *uaddr, int n) +static inline struct __get_user futex_xor(u32 __user *uaddr, int n) { struct __get_user asm_ret = __get_user_4(uaddr); if (!asm_ret.err) { @@ -53,7 +53,7 @@ static inline struct __get_user futex_xor(int __user *uaddr, int n) } #endif -static inline int futex_atomic_op_inuser(int encoded_op, int __user *uaddr) 
+static inline int futex_atomic_op_inuser(int encoded_op, u32 __user *uaddr) { int op = (encoded_op >> 28) & 7; int cmp = (encoded_op >> 24) & 15; @@ -65,7 +65,7 @@ static inline int futex_atomic_op_inuser(int encoded_op, int __user *uaddr) if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28)) oparg = 1 << oparg; - if (!access_ok(VERIFY_WRITE, uaddr, sizeof(int))) + if (!access_ok(VERIFY_WRITE, uaddr, sizeof(u32))) return -EFAULT; pagefault_disable(); @@ -119,12 +119,12 @@ static inline int futex_atomic_op_inuser(int encoded_op, int __user *uaddr) return ret; } -static inline int futex_atomic_cmpxchg_inatomic(int *uval, int __user *uaddr, - int oldval, int newval) +static inline int futex_atomic_cmpxchg_inatomic(u32 *uval, u32 __user *uaddr, + u32 oldval, u32 newval) { struct __get_user asm_ret; - if (!access_ok(VERIFY_WRITE, uaddr, sizeof(int))) + if (!access_ok(VERIFY_WRITE, uaddr, sizeof(u32))) return -EFAULT; asm_ret = futex_cmpxchg(uaddr, oldval, newval); diff --git a/arch/x86/include/asm/futex.h b/arch/x86/include/asm/futex.h index 884c0b5676f4..d09bb03653f0 100644 --- a/arch/x86/include/asm/futex.h +++ b/arch/x86/include/asm/futex.h @@ -37,7 +37,7 @@ "+m" (*uaddr), "=&r" (tem) \ : "r" (oparg), "i" (-EFAULT), "1" (0)) -static inline int futex_atomic_op_inuser(int encoded_op, int __user *uaddr) +static inline int futex_atomic_op_inuser(int encoded_op, u32 __user *uaddr) { int op = (encoded_op >> 28) & 7; int cmp = (encoded_op >> 24) & 15; @@ -48,7 +48,7 @@ static inline int futex_atomic_op_inuser(int encoded_op, int __user *uaddr) if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28)) oparg = 1 << oparg; - if (!access_ok(VERIFY_WRITE, uaddr, sizeof(int))) + if (!access_ok(VERIFY_WRITE, uaddr, sizeof(u32))) return -EFAULT; #if defined(CONFIG_X86_32) && !defined(CONFIG_X86_BSWAP) @@ -109,8 +109,8 @@ static inline int futex_atomic_op_inuser(int encoded_op, int __user *uaddr) return ret; } -static inline int futex_atomic_cmpxchg_inatomic(int *uval, int __user *uaddr, - int oldval, int newval) +static inline int futex_atomic_cmpxchg_inatomic(u32 *uval, u32 __user *uaddr, + u32 oldval, u32 newval) { int ret = 0; @@ -120,7 +120,7 @@ static inline int futex_atomic_cmpxchg_inatomic(int *uval, int __user *uaddr, return -ENOSYS; #endif - if (!access_ok(VERIFY_WRITE, uaddr, sizeof(int))) + if (!access_ok(VERIFY_WRITE, uaddr, sizeof(u32))) return -EFAULT; asm volatile("1:\t" LOCK_PREFIX "cmpxchgl %4, %2\n" diff --git a/include/asm-generic/futex.h b/include/asm-generic/futex.h index 132bf5227b44..01f227e14254 100644 --- a/include/asm-generic/futex.h +++ b/include/asm-generic/futex.h @@ -6,7 +6,7 @@ #include static inline int -futex_atomic_op_inuser (int encoded_op, int __user *uaddr) +futex_atomic_op_inuser (int encoded_op, u32 __user *uaddr) { int op = (encoded_op >> 28) & 7; int cmp = (encoded_op >> 24) & 15; @@ -16,7 +16,7 @@ futex_atomic_op_inuser (int encoded_op, int __user *uaddr) if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28)) oparg = 1 << oparg; - if (! access_ok (VERIFY_WRITE, uaddr, sizeof(int))) + if (! 
access_ok (VERIFY_WRITE, uaddr, sizeof(u32))) return -EFAULT; pagefault_disable(); @@ -48,8 +48,8 @@ futex_atomic_op_inuser (int encoded_op, int __user *uaddr) } static inline int -futex_atomic_cmpxchg_inatomic(int *uval, int __user *uaddr, - int oldval, int newval) +futex_atomic_cmpxchg_inatomic(u32 *uval, u32 __user *uaddr, + u32 oldval, u32 newval) { return -ENOSYS; } From d9936bb3952a08d701f7b03f8f62d158f94d8085 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 11 Mar 2011 14:15:35 +0100 Subject: [PATCH 613/706] genirq: Add desc->irq_data accessor We have accessors for all fields in irq_data based on irq_desc, but not for irq_data itself. Signed-off-by: Thomas Gleixner --- include/linux/irqdesc.h | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/include/linux/irqdesc.h b/include/linux/irqdesc.h index 9eb9cd313052..00218371518b 100644 --- a/include/linux/irqdesc.h +++ b/include/linux/irqdesc.h @@ -109,6 +109,11 @@ static inline struct irq_desc *move_irq_desc(struct irq_desc *desc, int node) #ifdef CONFIG_GENERIC_HARDIRQS +static inline struct irq_data *irq_desc_get_irq_data(struct irq_desc *desc) +{ + return &desc->irq_data; +} + static inline struct irq_chip *irq_desc_get_chip(struct irq_desc *desc) { return desc->irq_data.chip; From 86b32122fd54addc9af01f8b919c65d3f49090a3 Mon Sep 17 00:00:00 2001 From: Konrad Rzeszutek Wilk Date: Wed, 9 Mar 2011 22:03:16 -0500 Subject: [PATCH 614/706] xen/e820: Don't mark balloon memory as E820_UNUSABLE when running as guest and fix overflow. If we have a guest that asked for: memory=1024 maxmem=2048 which means we want 1GB now and create pagetables so that we can expand up to 2GB, we would have this E820 layout: [ 0.000000] BIOS-provided physical RAM map: [ 0.000000] Xen: 0000000000000000 - 00000000000a0000 (usable) [ 0.000000] Xen: 00000000000a0000 - 0000000000100000 (reserved) [ 0.000000] Xen: 0000000000100000 - 0000000080800000 (usable) Due to the patch "xen/setup: Inhibit resource API from using System RAM E820 gaps as PCI mem gaps." we would mark the memory past the 1GB mark as unusable, resulting in: [ 0.000000] BIOS-provided physical RAM map: [ 0.000000] Xen: 0000000000000000 - 00000000000a0000 (usable) [ 0.000000] Xen: 00000000000a0000 - 0000000000100000 (reserved) [ 0.000000] Xen: 0000000000100000 - 0000000040000000 (usable) [ 0.000000] Xen: 0000000040000000 - 0000000080800000 (unusable) which meant that we could not balloon up anymore; we could only balloon the guest down. The fix is to run the code introduced by the above mentioned patch only for the initial domain. We will have to revisit this once we start introducing a modified E820 for PCI passthrough so that we can utilize the P2M identity code. We also fix an overflow on 32-bit machines by using ULL instead of UL. [v2: Ian pointed to the overflow issue] Signed-off-by: Konrad Rzeszutek Wilk --- arch/x86/xen/setup.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c index 2a4add9634ce..010ba2e62de5 100644 --- a/arch/x86/xen/setup.c +++ b/arch/x86/xen/setup.c @@ -200,7 +200,8 @@ char * __init xen_memory_setup(void) * used as potential resource for I/O address (happens * when 'allocate_resource' is called). 
*/ - if (delta && end < 0x100000000UL) + if (delta && + (xen_initial_domain() && end < 0x100000000ULL)) e820_add_region(end, delta, E820_UNUSABLE); } From 2e12978a9f7a7abd54e8eb9ce70a7718767b8b2c Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Wed, 22 Dec 2010 14:18:50 +0800 Subject: [PATCH 615/706] futex,plist: Pass the real head of the priority list to plist_del() Some plist_del()s in kernel/futex.c are passed a faked head of the priority list. This happens to work because plist_del() currently uses the head only for checking, so a faked head does not cause a bad result. But it is undocumented usage: /** * plist_del - Remove a @node from plist. * * @node: &struct plist_node pointer - entry to be removed * @head: &struct plist_head pointer - list head */ The documentation says that @head is the list head of the priority list. Several places in the futex code use "plist_del(&q->list, &q->list.plist);" and thus pass a fake head; we need to fix them all. Thanks to Darren Hart for many suggestions. Acked-by: Darren Hart Signed-off-by: Lai Jiangshan LKML-Reference: <4D11984A.5030203@cn.fujitsu.com> Signed-off-by: Steven Rostedt --- kernel/futex.c | 31 +++++++++++++++++++++++-------- 1 file changed, 23 insertions(+), 8 deletions(-) diff --git a/kernel/futex.c b/kernel/futex.c index b766d28accd6..6feeea4f8f15 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -775,6 +775,24 @@ retry: return ret; } +/** + * __unqueue_futex() - Remove the futex_q from its futex_hash_bucket + * @q: The futex_q to unqueue + * + * The q->lock_ptr must not be NULL and must be held by the caller. + */ +static void __unqueue_futex(struct futex_q *q) +{ + struct futex_hash_bucket *hb; + + if (WARN_ON(!q->lock_ptr || !spin_is_locked(q->lock_ptr) + || plist_node_empty(&q->list))) + return; + + hb = container_of(q->lock_ptr, struct futex_hash_bucket, lock); + plist_del(&q->list, &hb->chain); +} + /* * The hash bucket lock must be held when this is called. * Afterwards, the futex_q must not be accessed. @@ -792,7 +810,7 @@ static void wake_futex(struct futex_q *q) */ get_task_struct(p); - plist_del(&q->list, &q->list.plist); + __unqueue_futex(q); /* * The waiting task can free the futex_q as soon as * q->lock_ptr = NULL is written, without taking any locks. A @@ -1100,8 +1118,7 @@ void requeue_pi_wake_futex(struct futex_q *q, union futex_key *key, get_futex_key_refs(key); q->key = *key; - WARN_ON(plist_node_empty(&q->list)); - plist_del(&q->list, &q->list.plist); + __unqueue_futex(q); WARN_ON(!q->rt_waiter); q->rt_waiter = NULL; @@ -1504,8 +1521,7 @@ retry: spin_unlock(lock_ptr); goto retry; } - WARN_ON(plist_node_empty(&q->list)); - plist_del(&q->list, &q->list.plist); + __unqueue_futex(q); BUG_ON(q->pi_state); @@ -1525,8 +1541,7 @@ retry: static void unqueue_me_pi(struct futex_q *q) __releases(q->lock_ptr) { - WARN_ON(plist_node_empty(&q->list)); - plist_del(&q->list, &q->list.plist); + __unqueue_futex(q); BUG_ON(!q->pi_state); free_pi_state(q->pi_state); @@ -2167,7 +2182,7 @@ int handle_early_requeue_pi_wakeup(struct futex_hash_bucket *hb, * We were woken prior to requeue by a timeout or a signal. * Unqueue the futex_q and determine which it was. 
*/ - plist_del(&q->list, &q->list.plist); + plist_del(&q->list, &hb->chain); /* Handle spurious wakeups gracefully */ ret = -EWOULDBLOCK; From 017f2b239dabb2740b91df162e004371b861f371 Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Tue, 21 Dec 2010 17:55:10 +0800 Subject: [PATCH 616/706] futex,plist: Remove debug lock assignment from plist_node The original code uses &plist_node->plist as the fake head of the priority list for plist_del(); the debug locks in that fake head are only needed for CONFIG_DEBUG_PI_LIST. But now we always pass the real head to plist_del(), so the debug locks in plist_node are no longer used and we can remove these assignments. Acked-by: Darren Hart Signed-off-by: Lai Jiangshan LKML-Reference: <4D10797E.7040803@cn.fujitsu.com> Signed-off-by: Steven Rostedt --- kernel/futex.c | 9 --------- 1 file changed, 9 deletions(-) diff --git a/kernel/futex.c b/kernel/futex.c index 6feeea4f8f15..9fe913141ec9 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -1089,9 +1089,6 @@ void requeue_futex(struct futex_q *q, struct futex_hash_bucket *hb1, plist_del(&q->list, &hb1->chain); plist_add(&q->list, &hb2->chain); q->lock_ptr = &hb2->lock; -#ifdef CONFIG_DEBUG_PI_LIST - q->list.plist.spinlock = &hb2->lock; -#endif } get_futex_key_refs(key2); q->key = *key2; @@ -1124,9 +1121,6 @@ void requeue_pi_wake_futex(struct futex_q *q, union futex_key *key, q->rt_waiter = NULL; q->lock_ptr = &hb->lock; -#ifdef CONFIG_DEBUG_PI_LIST - q->list.plist.spinlock = &hb->lock; -#endif wake_up_state(q->task, TASK_NORMAL); } @@ -1474,9 +1468,6 @@ static inline void queue_me(struct futex_q *q, struct futex_hash_bucket *hb) prio = min(current->normal_prio, MAX_RT_PRIO); plist_node_init(&q->list, prio); -#ifdef CONFIG_DEBUG_PI_LIST - q->list.plist.spinlock = &hb->lock; -#endif plist_add(&q->list, &hb->chain); q->task = current; spin_unlock(&hb->lock); From bf6a9b8336ba12672755c2ae898b0abe42c7a5ac Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Tue, 21 Dec 2010 17:55:14 +0800 Subject: [PATCH 617/706] plist: Shrink struct plist_head struct plist_head is used in struct task_struct as well as struct rtmutex. If we can make it smaller, these structures become smaller as well. The field prio_list in struct plist_head is seldom used, and its information can be recovered from the plist_nodes. Removing this field halves the size of struct plist_head, as the sketch below illustrates. 
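To make the size win concrete, here is a minimal userspace sketch (illustrative only -- the _old/_new struct names are not from the kernel, and the CONFIG_DEBUG_PI_LIST debug fields are omitted):

    #include <stdio.h>

    struct list_head { struct list_head *next, *prev; };

    /* layout before this patch: two list heads */
    struct plist_head_old {
            struct list_head prio_list;     /* removed by this patch */
            struct list_head node_list;
    };

    /* layout after this patch: node_list only */
    struct plist_head_new {
            struct list_head node_list;
    };

    int main(void)
    {
            /* prints "old: 32, new: 16" on a typical 64-bit machine */
            printf("old: %zu, new: %zu\n",
                   sizeof(struct plist_head_old),
                   sizeof(struct plist_head_new));
            return 0;
    }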
Signed-off-by: Lai Jiangshan LKML-Reference: <4D107982.9090700@cn.fujitsu.com> Signed-off-by: Steven Rostedt --- include/linux/plist.h | 47 +++++++++++++++++++------------------ lib/plist.c | 54 ++++++++++++++++++++++++++++--------------- 2 files changed, 59 insertions(+), 42 deletions(-) diff --git a/include/linux/plist.h b/include/linux/plist.h index 7254eda078e5..c9b9f322c8d8 100644 --- a/include/linux/plist.h +++ b/include/linux/plist.h @@ -31,15 +31,17 @@ * * Simple ASCII art explanation: * - * |HEAD | - * | | - * |prio_list.prev|<------------------------------------| - * |prio_list.next|<->|pl|<->|pl|<--------------->|pl|<-| - * |10 | |10| |21| |21| |21| |40| (prio) - * | | | | | | | | | | | | - * | | | | | | | | | | | | - * |node_list.next|<->|nl|<->|nl|<->|nl|<->|nl|<->|nl|<-| - * |node_list.prev|<------------------------------------| + * pl:prio_list (only for plist_node) + * nl:node_list + * HEAD| NODE(S) + * | + * ||------------------------------------| + * ||->|pl|<->|pl|<--------------->|pl|<-| + * | |10| |21| |21| |21| |40| (prio) + * | | | | | | | | | | | + * | | | | | | | | | | | + * |->|nl|<->|nl|<->|nl|<->|nl|<->|nl|<->|nl|<-| + * |-------------------------------------------| * * The nodes on the prio_list list are sorted by priority to simplify * the insertion of new nodes. There are no nodes with duplicate @@ -78,7 +80,6 @@ #include struct plist_head { - struct list_head prio_list; struct list_head node_list; #ifdef CONFIG_DEBUG_PI_LIST raw_spinlock_t *rawlock; @@ -88,7 +89,8 @@ struct plist_head { struct plist_node { int prio; - struct plist_head plist; + struct list_head prio_list; + struct list_head node_list; }; #ifdef CONFIG_DEBUG_PI_LIST @@ -100,7 +102,6 @@ struct plist_node { #endif #define _PLIST_HEAD_INIT(head) \ - .prio_list = LIST_HEAD_INIT((head).prio_list), \ .node_list = LIST_HEAD_INIT((head).node_list) /** @@ -133,7 +134,8 @@ struct plist_node { #define PLIST_NODE_INIT(node, __prio) \ { \ .prio = (__prio), \ - .plist = { _PLIST_HEAD_INIT((node).plist) }, \ + .prio_list = LIST_HEAD_INIT((node).prio_list), \ + .node_list = LIST_HEAD_INIT((node).node_list), \ } /** @@ -144,7 +146,6 @@ struct plist_node { static inline void plist_head_init(struct plist_head *head, spinlock_t *lock) { - INIT_LIST_HEAD(&head->prio_list); INIT_LIST_HEAD(&head->node_list); #ifdef CONFIG_DEBUG_PI_LIST head->spinlock = lock; @@ -160,7 +161,6 @@ plist_head_init(struct plist_head *head, spinlock_t *lock) static inline void plist_head_init_raw(struct plist_head *head, raw_spinlock_t *lock) { - INIT_LIST_HEAD(&head->prio_list); INIT_LIST_HEAD(&head->node_list); #ifdef CONFIG_DEBUG_PI_LIST head->rawlock = lock; @@ -176,7 +176,8 @@ plist_head_init_raw(struct plist_head *head, raw_spinlock_t *lock) static inline void plist_node_init(struct plist_node *node, int prio) { node->prio = prio; - plist_head_init(&node->plist, NULL); + INIT_LIST_HEAD(&node->prio_list); + INIT_LIST_HEAD(&node->node_list); } extern void plist_add(struct plist_node *node, struct plist_head *head); @@ -188,7 +189,7 @@ extern void plist_del(struct plist_node *node, struct plist_head *head); * @head: the head for your list */ #define plist_for_each(pos, head) \ - list_for_each_entry(pos, &(head)->node_list, plist.node_list) + list_for_each_entry(pos, &(head)->node_list, node_list) /** * plist_for_each_safe - iterate safely over a plist of given type @@ -199,7 +200,7 @@ extern void plist_del(struct plist_node *node, struct plist_head *head); * Iterate over a plist of given type, safe against removal of list entry. 
*/ #define plist_for_each_safe(pos, n, head) \ - list_for_each_entry_safe(pos, n, &(head)->node_list, plist.node_list) + list_for_each_entry_safe(pos, n, &(head)->node_list, node_list) /** * plist_for_each_entry - iterate over list of given type @@ -208,7 +209,7 @@ extern void plist_del(struct plist_node *node, struct plist_head *head); * @mem: the name of the list_struct within the struct */ #define plist_for_each_entry(pos, head, mem) \ - list_for_each_entry(pos, &(head)->node_list, mem.plist.node_list) + list_for_each_entry(pos, &(head)->node_list, mem.node_list) /** * plist_for_each_entry_safe - iterate safely over list of given type @@ -220,7 +221,7 @@ extern void plist_del(struct plist_node *node, struct plist_head *head); * Iterate over list of given type, safe against removal of list entry. */ #define plist_for_each_entry_safe(pos, n, head, m) \ - list_for_each_entry_safe(pos, n, &(head)->node_list, m.plist.node_list) + list_for_each_entry_safe(pos, n, &(head)->node_list, m.node_list) /** * plist_head_empty - return !0 if a plist_head is empty @@ -237,7 +238,7 @@ static inline int plist_head_empty(const struct plist_head *head) */ static inline int plist_node_empty(const struct plist_node *node) { - return plist_head_empty(&node->plist); + return list_empty(&node->node_list); } /* All functions below assume the plist_head is not empty. */ @@ -285,7 +286,7 @@ static inline int plist_node_empty(const struct plist_node *node) static inline struct plist_node *plist_first(const struct plist_head *head) { return list_entry(head->node_list.next, - struct plist_node, plist.node_list); + struct plist_node, node_list); } /** @@ -297,7 +298,7 @@ static inline struct plist_node *plist_first(const struct plist_head *head) static inline struct plist_node *plist_last(const struct plist_head *head) { return list_entry(head->node_list.prev, - struct plist_node, plist.node_list); + struct plist_node, node_list); } #endif diff --git a/lib/plist.c b/lib/plist.c index 1471988d9190..8c614d0e6c35 100644 --- a/lib/plist.c +++ b/lib/plist.c @@ -59,7 +59,8 @@ static void plist_check_head(struct plist_head *head) WARN_ON_SMP(!raw_spin_is_locked(head->rawlock)); if (head->spinlock) WARN_ON_SMP(!spin_is_locked(head->spinlock)); - plist_check_list(&head->prio_list); + if (!plist_head_empty(head)) + plist_check_list(&plist_first(head)->prio_list); plist_check_list(&head->node_list); } @@ -75,25 +76,33 @@ static void plist_check_head(struct plist_head *head) */ void plist_add(struct plist_node *node, struct plist_head *head) { - struct plist_node *iter; + struct plist_node *first, *iter, *prev = NULL; + struct list_head *node_next = &head->node_list; plist_check_head(head); WARN_ON(!plist_node_empty(node)); + WARN_ON(!list_empty(&node->prio_list)); - list_for_each_entry(iter, &head->prio_list, plist.prio_list) { - if (node->prio < iter->prio) - goto lt_prio; - else if (node->prio == iter->prio) { - iter = list_entry(iter->plist.prio_list.next, - struct plist_node, plist.prio_list); - goto eq_prio; + if (plist_head_empty(head)) + goto ins_node; + + first = iter = plist_first(head); + + do { + if (node->prio < iter->prio) { + node_next = &iter->node_list; + break; } - } -lt_prio: - list_add_tail(&node->plist.prio_list, &iter->plist.prio_list); -eq_prio: - list_add_tail(&node->plist.node_list, &iter->plist.node_list); + prev = iter; + iter = list_entry(iter->prio_list.next, + struct plist_node, prio_list); + } while (iter != first); + + if (!prev || prev->prio != node->prio) + list_add_tail(&node->prio_list, 
&iter->prio_list); +ins_node: + list_add_tail(&node->node_list, node_next); plist_check_head(head); } @@ -108,14 +117,21 @@ void plist_del(struct plist_node *node, struct plist_head *head) { plist_check_head(head); - if (!list_empty(&node->plist.prio_list)) { - struct plist_node *next = plist_first(&node->plist); + if (!list_empty(&node->prio_list)) { + if (node->node_list.next != &head->node_list) { + struct plist_node *next; - list_move_tail(&next->plist.prio_list, &node->plist.prio_list); - list_del_init(&node->plist.prio_list); + next = list_entry(node->node_list.next, + struct plist_node, node_list); + + /* add the next plist_node into prio_list */ + if (list_empty(&next->prio_list)) + list_add(&next->prio_list, &node->prio_list); + } + list_del_init(&node->prio_list); } - list_del_init(&node->plist.node_list); + list_del_init(&node->node_list); plist_check_head(head); } From 6d55da53db3d9b911f69f2ce1e5fb8943eafe057 Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Tue, 21 Dec 2010 17:55:18 +0800 Subject: [PATCH 618/706] plist: Add priority list test Add test code for checking plist when the kernel is booting. Signed-off-by: Lai Jiangshan LKML-Reference: <4D107986.1010302@cn.fujitsu.com> Signed-off-by: Steven Rostedt --- lib/plist.c | 81 ++++++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 80 insertions(+), 1 deletion(-) diff --git a/lib/plist.c b/lib/plist.c index 8c614d0e6c35..0ae7e6431726 100644 --- a/lib/plist.c +++ b/lib/plist.c @@ -28,6 +28,8 @@ #ifdef CONFIG_DEBUG_PI_LIST +static struct plist_head test_head; + static void plist_check_prev_next(struct list_head *t, struct list_head *p, struct list_head *n) { @@ -54,7 +56,7 @@ static void plist_check_list(struct list_head *top) static void plist_check_head(struct plist_head *head) { - WARN_ON(!head->rawlock && !head->spinlock); + WARN_ON(head != &test_head && !head->rawlock && !head->spinlock); if (head->rawlock) WARN_ON_SMP(!raw_spin_is_locked(head->rawlock)); if (head->spinlock) @@ -135,3 +137,80 @@ void plist_del(struct plist_node *node, struct plist_head *head) plist_check_head(head); } + +#ifdef CONFIG_DEBUG_PI_LIST +#include +#include +#include + +static struct plist_node __initdata test_node[241]; + +static void __init plist_test_check(int nr_expect) +{ + struct plist_node *first, *prio_pos, *node_pos; + + if (plist_head_empty(&test_head)) { + BUG_ON(nr_expect != 0); + return; + } + + prio_pos = first = plist_first(&test_head); + plist_for_each(node_pos, &test_head) { + if (nr_expect-- < 0) + break; + if (node_pos == first) + continue; + if (node_pos->prio == prio_pos->prio) { + BUG_ON(!list_empty(&node_pos->prio_list)); + continue; + } + + BUG_ON(prio_pos->prio > node_pos->prio); + BUG_ON(prio_pos->prio_list.next != &node_pos->prio_list); + prio_pos = node_pos; + } + + BUG_ON(nr_expect != 0); + BUG_ON(prio_pos->prio_list.next != &first->prio_list); +} + +static int __init plist_test(void) +{ + int nr_expect = 0, i, loop; + unsigned int r = local_clock(); + + printk(KERN_INFO "start plist test\n"); + plist_head_init(&test_head, NULL); + for (i = 0; i < ARRAY_SIZE(test_node); i++) + plist_node_init(test_node + i, 0); + + for (loop = 0; loop < 1000; loop++) { + r = r * 193939 % 47629; + i = r % ARRAY_SIZE(test_node); + if (plist_node_empty(test_node + i)) { + r = r * 193939 % 47629; + test_node[i].prio = r % 99; + plist_add(test_node + i, &test_head); + nr_expect++; + } else { + plist_del(test_node + i, &test_head); + nr_expect--; + } + plist_test_check(nr_expect); + } + + for (i = 0; i < 
ARRAY_SIZE(test_node); i++) { + if (plist_node_empty(test_node + i)) + continue; + plist_del(test_node + i, &test_head); + nr_expect--; + plist_test_check(nr_expect); + } + + printk(KERN_INFO "end plist test\n"); + return 0; +} + +module_init(plist_test); + +#endif From 371c394af27ab7d1e58a66bc19d9f1f3ac1f67b4 Mon Sep 17 00:00:00 2001 From: Alexander van Heukelum Date: Fri, 11 Mar 2011 21:59:38 +0100 Subject: [PATCH 619/706] x86, binutils, xen: Fix another wrong size directive The latest binutils (2.21.0.20110302/Ubuntu) breaks the build yet another time under CONFIG_XEN=y, due to a .size directive that refers to a slightly differently named (hence, to the now very strict and unforgiving assembler, non-existent) symbol. [ mingo: This unnecessary build breakage caused by new binutils version 2.21 gets escalated back several kernel releases spanning several years of Linux history, affecting over 130,000 upstream kernel commits (!), on CONFIG_XEN=y 64-bit kernels (i.e. essentially affecting all major Linux distro kernel configs). Git annotate tells us that this slight debug symbol code mismatch bug was introduced in 2008 in commit 3d75e1b8: 3d75e1b8 (Jeremy Fitzhardinge 2008-07-08 15:06:49 -0700 1231) ENTRY(xen_do_hypervisor_callback) # do_hypervisor_callback(struct *pt_regs) The 'bug' is just a slight asymmetry in ENTRY()/END() debug-symbols sequences, with lots of assembly code between the ENTRY() and the END(): ENTRY(xen_do_hypervisor_callback) # do_hypervisor_callback(struct *pt_regs) ... END(do_hypervisor_callback) Human reviewers almost never catch such small mismatches, and binutils never even warned about it either. This new binutils version thus breaks the Xen build on all upstream kernels since v2.6.27, out of the blue. This makes a straightforward Git bisection of all 64-bit Xen-enabled kernels impossible on such binutils, for a bisection window of over a hundred thousand historic commits. (!) This is a major fail on the side of binutils and binutils needs to turn this show-stopper build failure into a warning ASAP. ] Signed-off-by: Alexander van Heukelum Cc: Jeremy Fitzhardinge Cc: Jan Beulich Cc: H.J. Lu Cc: Linus Torvalds Cc: Andrew Morton Cc: "H. Peter Anvin" Cc: Kees Cook LKML-Reference: <1299877178-26063-1-git-send-email-heukelum@fastmail.fm> Signed-off-by: Ingo Molnar --- arch/x86/kernel/entry_64.S | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index aed1ffbeb0c9..bbd5c80cb09b 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -1248,7 +1248,7 @@ ENTRY(xen_do_hypervisor_callback) # do_hypervisor_callback(struct *pt_regs) decl PER_CPU_VAR(irq_count) jmp error_exit CFI_ENDPROC -END(do_hypervisor_callback) +END(xen_do_hypervisor_callback) /* * Hypervisor uses this for application faults while it executes. From d209a699a0b975ad47f399d70ddc3791f1b84496 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 11 Mar 2011 21:22:14 +0100 Subject: [PATCH 620/706] genirq: Add chip flag to force mask on suspend On suspend we disable all interrupts in the core code, but this does not mask the interrupt line in the default implementation as we use a lazy disable approach. That means we mark the interrupt disabled, but leave the hardware unmasked. That's an optimization because we avoid the hardware access for the common case where no interrupt happens after we marked it disabled.
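A minimal sketch of this lazy disable scheme (illustrative flags only, not the real irq_desc/irq_data state):

    struct lazy_irq {
            unsigned int disabled:1;        /* marked disabled by software */
            unsigned int pending:1;         /* fired while marked disabled */
            unsigned int hw_masked:1;       /* actually masked in hardware */
    };

    static void lazy_disable_irq(struct lazy_irq *irq)
    {
            irq->disabled = 1;              /* note: no hardware access */
    }

    /* invoked by the flow handler when the line fires anyway */
    static void lazy_handle_irq(struct lazy_irq *irq)
    {
            if (irq->disabled) {
                    irq->hw_masked = 1;     /* mask at the hardware level */
                    irq->pending = 1;       /* remember it for re-enable */
                    return;
            }
            /* ... normal handling ... */
    }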
If an interrupt happens, then the interrupt flow handler masks the line at the hardware level and marks it pending. Suspend makes use of this delayed disable as it "disables" all interrupts when preparing the suspend transition. Right before the system goes into hardware suspend state it checks whether one of the interrupts which is marked as a wakeup interrupt came in after disabling it. Most interrupt chips have a separate register which selects the interrupts that can wake up the system from suspend, so we don't have to mask any of the non wakeup interrupts. But now we have to deal with brilliantly designed hardware which lacks such a wakeup configuration facility. For such hardware it's necessary to mask all non wakeup interrupts before going into suspend in order to avoid the wakeup from random interrupts. Rather than working around this in the affected interrupt chip implementations we can solve this elegantly in the core code itself. Add a flag IRQCHIP_MASK_ON_SUSPEND which can be set by the irq chip implementation to indicate that the interrupts which are not selected as wakeup sources must be masked in the suspend path. Mask them in the loop which checks the wakeup interrupts pending flag. Signed-off-by: Thomas Gleixner Reviewed-by: Abhijeet Dharmapurikar LKML-Reference: --- include/linux/irq.h | 2 ++ kernel/irq/pm.c | 22 ++++++++++++++++++---- 2 files changed, 20 insertions(+), 4 deletions(-) diff --git a/include/linux/irq.h b/include/linux/irq.h index ff62d0145b8f..1d3577f30d45 100644 --- a/include/linux/irq.h +++ b/include/linux/irq.h @@ -330,10 +330,12 @@ struct irq_chip { * * IRQCHIP_SET_TYPE_MASKED: Mask before calling chip.irq_set_type() * IRQCHIP_EOI_IF_HANDLED: Only issue irq_eoi() when irq was handled + * IRQCHIP_MASK_ON_SUSPEND: Mask non wake irqs in the suspend path */ enum { IRQCHIP_SET_TYPE_MASKED = (1 << 0), IRQCHIP_EOI_IF_HANDLED = (1 << 1), + IRQCHIP_MASK_ON_SUSPEND = (1 << 2), }; /* This include will go away once we isolated irq_desc usage to core code */ diff --git a/kernel/irq/pm.c b/kernel/irq/pm.c index 1329f0eff49e..f76fc00c9877 100644 --- a/kernel/irq/pm.c +++ b/kernel/irq/pm.c @@ -68,10 +68,24 @@ int check_wakeup_irqs(void) struct irq_desc *desc; int irq; - for_each_irq_desc(irq, desc) - if (irqd_is_wakeup_set(&desc->irq_data) && - (desc->istate & IRQS_PENDING)) - return -EBUSY; + for_each_irq_desc(irq, desc) { + if (irqd_is_wakeup_set(&desc->irq_data)) { + if (desc->istate & IRQS_PENDING) + return -EBUSY; + continue; + } + /* + * Check the non wakeup interrupts whether they need + * to be masked before finally going into suspend + * state. That's for hardware which has no wakeup + * source configuration facility. The chip + * implementation indicates that with + * IRQCHIP_MASK_ON_SUSPEND. + */ + if (desc->istate & IRQS_SUSPENDED && + irq_desc_get_chip(desc)->flags & IRQCHIP_MASK_ON_SUSPEND) + mask_irq(desc); + } return 0; } From 56396e6823fe9b42fe9cf9403d6ed67756255f70 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 11 Mar 2011 10:33:31 +0100 Subject: [PATCH 621/706] x86-64, NUMA: Don't call numa_set_distance() for all possible node combinations during emulation The distance transformation in numa_emulation() used to call numa_set_distance() for all MAX_NUMNODES * MAX_NUMNODES node combinations, regardless of which nodes are enabled. As numa_set_distance() ignores all out-of-bound distance settings, this doesn't cause any problem other than looping unnecessarily many times during boot. 
However, as MAX_NUMNODES * MAX_NUMNODES can be pretty high, update the code such that it iterates through only the enabled combinations. Yinghai Lu identified the issue and provided an initial patch to address it; however, that patch was incorrect in that it didn't build the emulated distance table when there was no physical distance table, and it was unnecessarily complex. http://thread.gmane.org/gmane.linux.kernel/1107986/focus=1107988 Signed-off-by: Tejun Heo Reported-by: Yinghai Lu Acked-by: Yinghai Lu --- arch/x86/mm/numa_emulation.c | 23 ++++++++++++----------- 1 file changed, 12 insertions(+), 11 deletions(-) diff --git a/arch/x86/mm/numa_emulation.c b/arch/x86/mm/numa_emulation.c index 3696be0c2204..ad091e4cff17 100644 --- a/arch/x86/mm/numa_emulation.c +++ b/arch/x86/mm/numa_emulation.c @@ -301,7 +301,7 @@ void __init numa_emulation(struct numa_meminfo *numa_meminfo, int numa_dist_cnt) const u64 max_addr = max_pfn << PAGE_SHIFT; u8 *phys_dist = NULL; size_t phys_size = numa_dist_cnt * numa_dist_cnt * sizeof(phys_dist[0]); - int dfl_phys_nid; + int max_emu_nid, dfl_phys_nid; int i, j, ret; if (!emu_cmdline) @@ -358,12 +358,17 @@ void __init numa_emulation(struct numa_meminfo *numa_meminfo, int numa_dist_cnt) node_distance(i, j); } - /* determine the default phys nid to use for unmapped nodes */ + /* + * Determine the max emulated nid and the default phys nid to use + * for unmapped nodes. + */ + max_emu_nid = 0; dfl_phys_nid = NUMA_NO_NODE; for (i = 0; i < ARRAY_SIZE(emu_nid_to_phys); i++) { if (emu_nid_to_phys[i] != NUMA_NO_NODE) { - dfl_phys_nid = emu_nid_to_phys[i]; - break; + max_emu_nid = i; + if (dfl_phys_nid == NUMA_NO_NODE) + dfl_phys_nid = emu_nid_to_phys[i]; } } if (dfl_phys_nid == NUMA_NO_NODE) { @@ -393,14 +398,10 @@ void __init numa_emulation(struct numa_meminfo *numa_meminfo, int numa_dist_cnt) if (emu_nid_to_phys[i] == NUMA_NO_NODE) emu_nid_to_phys[i] = dfl_phys_nid; - /* - * Transform distance table. numa_set_distance() ignores all - * out-of-bound distances. Just call it for every possible node - * combination. - */ + /* transform distance table */ numa_reset_distance(); - for (i = 0; i < MAX_NUMNODES; i++) { - for (j = 0; j < MAX_NUMNODES; j++) { + for (i = 0; i < max_emu_nid + 1; i++) { + for (j = 0; j < max_emu_nid + 1; j++) { int physi = emu_nid_to_phys[i]; int physj = emu_nid_to_phys[j]; int dist; From 2c778651f73d92edb847e65d371bb29b17c7ca60 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sat, 12 Mar 2011 12:20:43 +0100 Subject: [PATCH 622/706] x86: Cleanup the genirq name space genirq is switching to a consistent name space for the irq related functions. Convert x86. Conversion was done with coccinelle; the renames it applies are summarized below. 
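The renames applied by the conversion, summarized from the diffs that follow (only the accessors touched in these files):

    old name                            new genirq name
    --------                            ---------------
    set_irq_chip()                      irq_set_chip()
    set_irq_chip_data()                 irq_set_chip_data()
    get_irq_chip_data()                 irq_get_chip_data()
    set_irq_chip_and_handler_name()     irq_set_chip_and_handler_name()
    set_irq_handler()                   irq_set_handler()
    set_irq_data()                      irq_set_handler_data()
    get_irq_data()                      irq_get_handler_data()
    set_irq_msi()                       irq_set_msi_desc()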
Signed-off-by: Thomas Gleixner --- arch/x86/kernel/apic/io_apic.c | 75 +++++++++++++------------- arch/x86/kernel/hpet.c | 2 +- arch/x86/kernel/i8259.c | 2 +- arch/x86/kernel/irqinit.c | 2 +- arch/x86/lguest/boot.c | 4 +- arch/x86/pci/xen.c | 8 +-- arch/x86/platform/uv/uv_irq.c | 4 +- arch/x86/platform/visws/visws_quirks.c | 2 +- 8 files changed, 51 insertions(+), 48 deletions(-) diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 8d23e831a45e..7a88b04202e2 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -187,7 +187,7 @@ int __init arch_early_irq_init(void) irq_reserve_irqs(0, legacy_pic->nr_legacy_irqs); for (i = 0; i < count; i++) { - set_irq_chip_data(i, &cfg[i]); + irq_set_chip_data(i, &cfg[i]); zalloc_cpumask_var_node(&cfg[i].domain, GFP_KERNEL, node); zalloc_cpumask_var_node(&cfg[i].old_domain, GFP_KERNEL, node); /* @@ -206,7 +206,7 @@ int __init arch_early_irq_init(void) #ifdef CONFIG_SPARSE_IRQ static struct irq_cfg *irq_cfg(unsigned int irq) { - return get_irq_chip_data(irq); + return irq_get_chip_data(irq); } static struct irq_cfg *alloc_irq_cfg(unsigned int irq, int node) @@ -232,7 +232,7 @@ static void free_irq_cfg(unsigned int at, struct irq_cfg *cfg) { if (!cfg) return; - set_irq_chip_data(at, NULL); + irq_set_chip_data(at, NULL); free_cpumask_var(cfg->domain); free_cpumask_var(cfg->old_domain); kfree(cfg); @@ -262,14 +262,14 @@ static struct irq_cfg *alloc_irq_and_cfg_at(unsigned int at, int node) if (res < 0) { if (res != -EEXIST) return NULL; - cfg = get_irq_chip_data(at); + cfg = irq_get_chip_data(at); if (cfg) return cfg; } cfg = alloc_irq_cfg(at, node); if (cfg) - set_irq_chip_data(at, cfg); + irq_set_chip_data(at, cfg); else irq_free_desc(at); return cfg; @@ -1185,7 +1185,7 @@ void __setup_vector_irq(int cpu) raw_spin_lock(&vector_lock); /* Mark the inuse vectors */ for_each_active_irq(irq) { - cfg = get_irq_chip_data(irq); + cfg = irq_get_chip_data(irq); if (!cfg) continue; /* @@ -1249,25 +1249,24 @@ static void ioapic_register_intr(unsigned int irq, unsigned long trigger) else irq_clear_status_flags(irq, IRQ_LEVEL); - if (irq_remapped(get_irq_chip_data(irq))) { + if (irq_remapped(irq_get_chip_data(irq))) { irq_set_status_flags(irq, IRQ_MOVE_PCNTXT); if (trigger) - set_irq_chip_and_handler_name(irq, &ir_ioapic_chip, + irq_set_chip_and_handler_name(irq, &ir_ioapic_chip, handle_fasteoi_irq, - "fasteoi"); + "fasteoi"); else - set_irq_chip_and_handler_name(irq, &ir_ioapic_chip, + irq_set_chip_and_handler_name(irq, &ir_ioapic_chip, handle_edge_irq, "edge"); return; } if ((trigger == IOAPIC_AUTO && IO_APIC_irq_trigger(irq)) || trigger == IOAPIC_LEVEL) - set_irq_chip_and_handler_name(irq, &ioapic_chip, - handle_fasteoi_irq, - "fasteoi"); + irq_set_chip_and_handler_name(irq, &ioapic_chip, + handle_fasteoi_irq, "fasteoi"); else - set_irq_chip_and_handler_name(irq, &ioapic_chip, + irq_set_chip_and_handler_name(irq, &ioapic_chip, handle_edge_irq, "edge"); } @@ -1491,7 +1490,8 @@ static void __init setup_timer_IRQ0_pin(unsigned int apic_id, unsigned int pin, * The timer IRQ doesn't have to know that behind the * scene we may have a 8259A-master in AEOI mode ... 
*/ - set_irq_chip_and_handler_name(0, &ioapic_chip, handle_edge_irq, "edge"); + irq_set_chip_and_handler_name(0, &ioapic_chip, handle_edge_irq, + "edge"); /* * Add it to the IO-APIC irq-routing table: @@ -1598,7 +1598,7 @@ __apicdebuginit(void) print_IO_APIC(void) for_each_active_irq(irq) { struct irq_pin_list *entry; - cfg = get_irq_chip_data(irq); + cfg = irq_get_chip_data(irq); if (!cfg) continue; entry = cfg->irq_2_pin; @@ -2364,7 +2364,7 @@ static void irq_complete_move(struct irq_cfg *cfg) void irq_force_complete_move(int irq) { - struct irq_cfg *cfg = get_irq_chip_data(irq); + struct irq_cfg *cfg = irq_get_chip_data(irq); if (!cfg) return; @@ -2587,7 +2587,7 @@ static inline void init_IO_APIC_traps(void) * 0x80, because int 0x80 is hm, kind of importantish. ;) */ for_each_active_irq(irq) { - cfg = get_irq_chip_data(irq); + cfg = irq_get_chip_data(irq); if (IO_APIC_IRQ(irq) && cfg && !cfg->vector) { /* * Hmm.. We don't have an entry for this, @@ -2598,7 +2598,7 @@ static inline void init_IO_APIC_traps(void) legacy_pic->make_irq(irq); else /* Strange. Oh, well.. */ - set_irq_chip(irq, &no_irq_chip); + irq_set_chip(irq, &no_irq_chip); } } } @@ -2638,7 +2638,7 @@ static struct irq_chip lapic_chip __read_mostly = { static void lapic_register_intr(int irq) { irq_clear_status_flags(irq, IRQ_LEVEL); - set_irq_chip_and_handler_name(irq, &lapic_chip, handle_edge_irq, + irq_set_chip_and_handler_name(irq, &lapic_chip, handle_edge_irq, "edge"); } @@ -2722,7 +2722,7 @@ int timer_through_8259 __initdata; */ static inline void __init check_timer(void) { - struct irq_cfg *cfg = get_irq_chip_data(0); + struct irq_cfg *cfg = irq_get_chip_data(0); int node = cpu_to_node(0); int apic1, pin1, apic2, pin2; unsigned long flags; @@ -3033,7 +3033,7 @@ unsigned int create_irq_nr(unsigned int from, int node) raw_spin_unlock_irqrestore(&vector_lock, flags); if (ret) { - set_irq_chip_data(irq, cfg); + irq_set_chip_data(irq, cfg); irq_clear_status_flags(irq, IRQ_NOREQUEST); } else { free_irq_at(irq, cfg); @@ -3058,7 +3058,7 @@ int create_irq(void) void destroy_irq(unsigned int irq) { - struct irq_cfg *cfg = get_irq_chip_data(irq); + struct irq_cfg *cfg = irq_get_chip_data(irq); unsigned long flags; irq_set_status_flags(irq, IRQ_NOREQUEST|IRQ_NOPROBE); @@ -3092,7 +3092,7 @@ static int msi_compose_msg(struct pci_dev *pdev, unsigned int irq, dest = apic->cpu_mask_to_apicid_and(cfg->domain, apic->target_cpus()); - if (irq_remapped(get_irq_chip_data(irq))) { + if (irq_remapped(irq_get_chip_data(irq))) { struct irte irte; int ir_index; u16 sub_handle; @@ -3271,14 +3271,16 @@ static int setup_msi_irq(struct pci_dev *dev, struct msi_desc *msidesc, int irq) if (ret < 0) return ret; - set_irq_msi(irq, msidesc); + irq_set_msi_desc(irq, msidesc); write_msi_msg(irq, &msg); - if (irq_remapped(get_irq_chip_data(irq))) { + if (irq_remapped(irq_get_chip_data(irq))) { irq_set_status_flags(irq, IRQ_MOVE_PCNTXT); - set_irq_chip_and_handler_name(irq, &msi_ir_chip, handle_edge_irq, "edge"); + irq_set_chip_and_handler_name(irq, &msi_ir_chip, + handle_edge_irq, "edge"); } else - set_irq_chip_and_handler_name(irq, &msi_chip, handle_edge_irq, "edge"); + irq_set_chip_and_handler_name(irq, &msi_chip, + handle_edge_irq, "edge"); dev_printk(KERN_DEBUG, &dev->dev, "irq %d for MSI/MSI-X\n", irq); @@ -3396,8 +3398,8 @@ int arch_setup_dmar_msi(unsigned int irq) if (ret < 0) return ret; dmar_msi_write(irq, &msg); - set_irq_chip_and_handler_name(irq, &dmar_msi_type, handle_edge_irq, - "edge"); + irq_set_chip_and_handler_name(irq, &dmar_msi_type, 
handle_edge_irq, + "edge"); return 0; } #endif @@ -3474,13 +3476,13 @@ int arch_setup_hpet_msi(unsigned int irq, unsigned int id) if (ret < 0) return ret; - hpet_msi_write(get_irq_data(irq), &msg); + hpet_msi_write(irq_get_handler_data(irq), &msg); irq_set_status_flags(irq, IRQ_MOVE_PCNTXT); - if (irq_remapped(get_irq_chip_data(irq))) - set_irq_chip_and_handler_name(irq, &ir_hpet_msi_type, + if (irq_remapped(irq_get_chip_data(irq))) + irq_set_chip_and_handler_name(irq, &ir_hpet_msi_type, handle_edge_irq, "edge"); else - set_irq_chip_and_handler_name(irq, &hpet_msi_type, + irq_set_chip_and_handler_name(irq, &hpet_msi_type, handle_edge_irq, "edge"); return 0; @@ -3569,7 +3571,7 @@ int arch_setup_ht_irq(unsigned int irq, struct pci_dev *dev) write_ht_irq_msg(irq, &msg); - set_irq_chip_and_handler_name(irq, &ht_irq_chip, + irq_set_chip_and_handler_name(irq, &ht_irq_chip, handle_edge_irq, "edge"); dev_printk(KERN_DEBUG, &dev->dev, "irq %d for HT\n", irq); @@ -4054,5 +4056,6 @@ void __init pre_init_apic_IRQ0(void) setup_local_APIC(); io_apic_setup_irq_pin(0, 0, &attr); - set_irq_chip_and_handler_name(0, &ioapic_chip, handle_edge_irq, "edge"); + irq_set_chip_and_handler_name(0, &ioapic_chip, handle_edge_irq, + "edge"); } diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c index 4ff5968f12d2..bfe8f729e086 100644 --- a/arch/x86/kernel/hpet.c +++ b/arch/x86/kernel/hpet.c @@ -503,7 +503,7 @@ static int hpet_assign_irq(struct hpet_dev *dev) if (!irq) return -EINVAL; - set_irq_data(irq, dev); + irq_set_handler_data(irq, dev); if (hpet_setup_msi_irq(irq)) return -EINVAL; diff --git a/arch/x86/kernel/i8259.c b/arch/x86/kernel/i8259.c index 20757cb2efa3..d9ca749c123b 100644 --- a/arch/x86/kernel/i8259.c +++ b/arch/x86/kernel/i8259.c @@ -112,7 +112,7 @@ static void make_8259A_irq(unsigned int irq) { disable_irq_nosync(irq); io_apic_irqs &= ~(1<init(0); for (i = 0; i < legacy_pic->nr_legacy_irqs; i++) - set_irq_chip_and_handler_name(i, chip, handle_level_irq, name); + irq_set_chip_and_handler_name(i, chip, handle_level_irq, name); } void __init init_IRQ(void) diff --git a/arch/x86/lguest/boot.c b/arch/x86/lguest/boot.c index eba687f0cc0c..b9ec1c74943c 100644 --- a/arch/x86/lguest/boot.c +++ b/arch/x86/lguest/boot.c @@ -847,7 +847,7 @@ static void __init lguest_init_IRQ(void) void lguest_setup_irq(unsigned int irq) { irq_alloc_desc_at(irq, 0); - set_irq_chip_and_handler_name(irq, &lguest_irq_controller, + irq_set_chip_and_handler_name(irq, &lguest_irq_controller, handle_level_irq, "level"); } @@ -995,7 +995,7 @@ static void lguest_time_irq(unsigned int irq, struct irq_desc *desc) static void lguest_time_init(void) { /* Set up the timer interrupt (0) to go to our simple timer routine */ - set_irq_handler(0, lguest_time_irq); + irq_set_handler(0, lguest_time_irq); clocksource_register(&lguest_clock); diff --git a/arch/x86/pci/xen.c b/arch/x86/pci/xen.c index 25cd4a07d09f..2bdcc36e3166 100644 --- a/arch/x86/pci/xen.c +++ b/arch/x86/pci/xen.c @@ -104,7 +104,7 @@ static int xen_hvm_setup_msi_irqs(struct pci_dev *dev, int nvec, int type) "msi-x" : "msi", &irq, &pirq, XEN_ALLOC_IRQ); if (irq < 0) goto error; - ret = set_irq_msi(irq, msidesc); + ret = irq_set_msi_desc(irq, msidesc); if (ret < 0) goto error_while; printk(KERN_DEBUG "xen: msi already setup: msi --> irq=%d" @@ -117,7 +117,7 @@ static int xen_hvm_setup_msi_irqs(struct pci_dev *dev, int nvec, int type) goto error; printk(KERN_DEBUG "xen: msi --> irq=%d, pirq=%d\n", irq, pirq); xen_msi_compose_msg(dev, pirq, &msg); - ret = set_irq_msi(irq, 
msidesc); + ret = irq_set_msi_desc(irq, msidesc); if (ret < 0) goto error_while; write_msi_msg(irq, &msg); @@ -165,7 +165,7 @@ static int xen_setup_msi_irqs(struct pci_dev *dev, int nvec, int type) goto free; } - ret = set_irq_msi(irq, msidesc); + ret = irq_set_msi_desc(irq, msidesc); if (ret) goto error_while; i++; @@ -210,7 +210,7 @@ static int xen_initdom_setup_msi_irqs(struct pci_dev *dev, int nvec, int type) if (irq < 0) return -1; - ret = set_irq_msi(irq, msidesc); + ret = irq_set_msi_desc(irq, msidesc); if (ret) goto error; } diff --git a/arch/x86/platform/uv/uv_irq.c b/arch/x86/platform/uv/uv_irq.c index 7b24460917d5..374a05d8ad22 100644 --- a/arch/x86/platform/uv/uv_irq.c +++ b/arch/x86/platform/uv/uv_irq.c @@ -131,7 +131,7 @@ arch_enable_uv_irq(char *irq_name, unsigned int irq, int cpu, int mmr_blade, unsigned long mmr_offset, int limit) { const struct cpumask *eligible_cpu = cpumask_of(cpu); - struct irq_cfg *cfg = get_irq_chip_data(irq); + struct irq_cfg *cfg = irq_get_chip_data(irq); unsigned long mmr_value; struct uv_IO_APIC_route_entry *entry; int mmr_pnode, err; @@ -148,7 +148,7 @@ arch_enable_uv_irq(char *irq_name, unsigned int irq, int cpu, int mmr_blade, else irq_set_status_flags(irq, IRQ_MOVE_PCNTXT); - set_irq_chip_and_handler_name(irq, &uv_irq_chip, handle_percpu_irq, + irq_set_chip_and_handler_name(irq, &uv_irq_chip, handle_percpu_irq, irq_name); mmr_value = 0; diff --git a/arch/x86/platform/visws/visws_quirks.c b/arch/x86/platform/visws/visws_quirks.c index 632037671746..dcc7aea0ff64 100644 --- a/arch/x86/platform/visws/visws_quirks.c +++ b/arch/x86/platform/visws/visws_quirks.c @@ -606,7 +606,7 @@ static void __init visws_pre_intr_init(void) chip = &cobalt_irq_type; if (chip) - set_irq_chip(i, chip); + irq_set_chip(i, chip); } setup_irq(CO_IRQ_8259, &master_action); From c60eaf25cd211d2282a6edddb3ce26b1e5795097 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 11 Mar 2011 13:17:16 +0100 Subject: [PATCH 623/706] x86: ioapic: Simplify irq chip and handler setup Use pointers instead of ugly multiline if/else constructs. Signed-off-by: Thomas Gleixner --- arch/x86/kernel/apic/io_apic.c | 48 +++++++++++++++------------------- 1 file changed, 21 insertions(+), 27 deletions(-) diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 7a88b04202e2..224edce72b87 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -1242,32 +1242,28 @@ static inline int IO_APIC_irq_trigger(int irq) static void ioapic_register_intr(unsigned int irq, unsigned long trigger) { + struct irq_chip *chip = &ioapic_chip; + irq_flow_handler_t hdl; + bool fasteoi; if ((trigger == IOAPIC_AUTO && IO_APIC_irq_trigger(irq)) || - trigger == IOAPIC_LEVEL) + trigger == IOAPIC_LEVEL) { irq_set_status_flags(irq, IRQ_LEVEL); - else + fasteoi = true; + } else { irq_clear_status_flags(irq, IRQ_LEVEL); + fasteoi = false; + } if (irq_remapped(irq_get_chip_data(irq))) { irq_set_status_flags(irq, IRQ_MOVE_PCNTXT); - if (trigger) - irq_set_chip_and_handler_name(irq, &ir_ioapic_chip, - handle_fasteoi_irq, - "fasteoi"); - else - irq_set_chip_and_handler_name(irq, &ir_ioapic_chip, - handle_edge_irq, "edge"); - return; + chip = &ir_ioapic_chip; + fasteoi = trigger != 0; } - if ((trigger == IOAPIC_AUTO && IO_APIC_irq_trigger(irq)) || - trigger == IOAPIC_LEVEL) - irq_set_chip_and_handler_name(irq, &ioapic_chip, - handle_fasteoi_irq, "fasteoi"); - else - irq_set_chip_and_handler_name(irq, &ioapic_chip, - handle_edge_irq, "edge"); + hdl = fasteoi ? 
handle_fasteoi_irq : handle_edge_irq; + irq_set_chip_and_handler_name(irq, chip, hdl, + fasteoi ? "fasteoi" : "edge"); } static int setup_ioapic_entry(int apic_id, int irq, @@ -3264,6 +3260,7 @@ static int msi_alloc_irte(struct pci_dev *dev, int irq, int nvec) static int setup_msi_irq(struct pci_dev *dev, struct msi_desc *msidesc, int irq) { + struct irq_chip *chip = &msi_chip; struct msi_msg msg; int ret; @@ -3276,11 +3273,10 @@ static int setup_msi_irq(struct pci_dev *dev, struct msi_desc *msidesc, int irq) if (irq_remapped(irq_get_chip_data(irq))) { irq_set_status_flags(irq, IRQ_MOVE_PCNTXT); - irq_set_chip_and_handler_name(irq, &msi_ir_chip, - handle_edge_irq, "edge"); - } else - irq_set_chip_and_handler_name(irq, &msi_chip, - handle_edge_irq, "edge"); + chip = &msi_ir_chip; + } + + irq_set_chip_and_handler_name(irq, chip, handle_edge_irq, "edge"); dev_printk(KERN_DEBUG, &dev->dev, "irq %d for MSI/MSI-X\n", irq); @@ -3457,6 +3453,7 @@ static struct irq_chip hpet_msi_type = { int arch_setup_hpet_msi(unsigned int irq, unsigned int id) { + struct irq_chip *chip = &hpet_msi_type; struct msi_msg msg; int ret; @@ -3479,12 +3476,9 @@ int arch_setup_hpet_msi(unsigned int irq, unsigned int id) hpet_msi_write(irq_get_handler_data(irq), &msg); irq_set_status_flags(irq, IRQ_MOVE_PCNTXT); if (irq_remapped(irq_get_chip_data(irq))) - irq_set_chip_and_handler_name(irq, &ir_hpet_msi_type, - handle_edge_irq, "edge"); - else - irq_set_chip_and_handler_name(irq, &hpet_msi_type, - handle_edge_irq, "edge"); + chip = &ir_hpet_msi_type; + irq_set_chip_and_handler_name(irq, chip, handle_edge_irq, "edge"); return 0; } #endif From 5451ddc5621550a2f4f82ddeac938b3ca392525f Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sat, 5 Feb 2011 15:35:51 +0100 Subject: [PATCH 624/706] x86: ioapic: Use irq_data->state Use the state information in irq_data. That avoids a radix-tree lookup from ack_apic_level() and simplifies setup_ioapic_dest(). 
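The essence of the change, lifted from the ack_apic_level() hunk below (context trimmed):

    /* before: a sparse-irq radix tree lookup just to test one flag */
    if (unlikely(irq_to_desc(irq)->status & IRQ_MOVE_PENDING)) {
            do_unmask_irq = 1;
            mask_ioapic(cfg);
    }

    /* after: the flag is read via the irq_data we were handed anyway */
    if (unlikely(irqd_is_setaffinity_pending(data))) {
            do_unmask_irq = 1;
            mask_ioapic(cfg);
    }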
Signed-off-by: Thomas Gleixner --- arch/x86/kernel/apic/io_apic.c | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 224edce72b87..e481e00a1b65 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -2431,7 +2431,7 @@ static void ack_apic_level(struct irq_data *data) irq_complete_move(cfg); #ifdef CONFIG_GENERIC_PENDING_IRQ /* If we are moving the irq we need to mask it */ - if (unlikely(irq_to_desc(irq)->status & IRQ_MOVE_PENDING)) { + if (unlikely(irqd_is_setaffinity_pending(data))) { do_unmask_irq = 1; mask_ioapic(cfg); } @@ -3822,8 +3822,8 @@ int acpi_get_override_irq(u32 gsi, int *trigger, int *polarity) void __init setup_ioapic_dest(void) { int pin, ioapic, irq, irq_entry; - struct irq_desc *desc; const struct cpumask *mask; + struct irq_data *idata; if (skip_ioapic_setup == 1) return; @@ -3838,21 +3838,20 @@ void __init setup_ioapic_dest(void) if ((ioapic > 0) && (irq > 16)) continue; - desc = irq_to_desc(irq); + idata = irq_get_irq_data(irq); /* * Honour affinities which have been set in early boot */ - if (desc->status & - (IRQ_NO_BALANCING | IRQ_AFFINITY_SET)) - mask = desc->irq_data.affinity; + if (!irqd_can_balance(idata) || irqd_affinity_was_set(idata)) + mask = idata->affinity; else mask = apic->target_cpus(); if (intr_remapping_enabled) - ir_ioapic_set_affinity(&desc->irq_data, mask, false); + ir_ioapic_set_affinity(idata, mask, false); else - ioapic_set_affinity(&desc->irq_data, mask, false); + ioapic_set_affinity(idata, mask, false); } } From 51c43ac6e4540786a6d79ea318b30f7bfa615ec7 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 10 Feb 2011 21:40:36 +0100 Subject: [PATCH 625/706] x86: Use the proper accessors in fixup_irqs() Signed-off-by: Thomas Gleixner --- arch/x86/kernel/irq.c | 23 +++++++++++++---------- 1 file changed, 13 insertions(+), 10 deletions(-) diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c index 78793efd3180..00bf99df583c 100644 --- a/arch/x86/kernel/irq.c +++ b/arch/x86/kernel/irq.c @@ -293,6 +293,7 @@ void fixup_irqs(void) static int warned; struct irq_desc *desc; struct irq_data *data; + struct irq_chip *chip; for_each_irq_desc(irq, desc) { int break_affinity = 0; @@ -307,7 +308,7 @@ void fixup_irqs(void) /* interrupt's are disabled at this point */ raw_spin_lock(&desc->lock); - data = &desc->irq_data; + data = irq_desc_get_irq_data(desc); affinity = data->affinity; if (!irq_has_action(irq) || cpumask_subset(affinity, cpu_online_mask)) { @@ -327,16 +328,17 @@ void fixup_irqs(void) affinity = cpu_all_mask; } - if (!(desc->status & IRQ_MOVE_PCNTXT) && data->chip->irq_mask) - data->chip->irq_mask(data); + chip = irq_data_get_irq_chip(data); + if (!irqd_can_move_in_process_context(data) && chip->irq_mask) + chip->irq_mask(data); - if (data->chip->irq_set_affinity) - data->chip->irq_set_affinity(data, affinity, true); + if (chip->irq_set_affinity) + chip->irq_set_affinity(data, affinity, true); else if (!(warned++)) set_affinity = 0; - if (!(desc->status & IRQ_MOVE_PCNTXT) && data->chip->irq_unmask) - data->chip->irq_unmask(data); + if (!irqd_can_move_in_process_context(data) && chip->irq_unmask) + chip->irq_unmask(data); raw_spin_unlock(&desc->lock); @@ -368,10 +370,11 @@ void fixup_irqs(void) irq = __this_cpu_read(vector_irq[vector]); desc = irq_to_desc(irq); - data = &desc->irq_data; + data = irq_desc_get_irq_data(desc); + chip = irq_data_get_irq_chip(data); raw_spin_lock(&desc->lock); - if 
(data->chip->irq_retrigger) - data->chip->irq_retrigger(data); + if (chip->irq_retrigger) + chip->irq_retrigger(data); raw_spin_unlock(&desc->lock); } } From 08221110e88ae101acf2464154f98e6d1b1ab21c Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 4 Feb 2011 18:56:11 +0100 Subject: [PATCH 626/706] x86: ioapic: Use new move_irq functions Use the functions which take irq_data. We already have a pointer to irq_data. That avoids a sparse irq lookup in move_*_irq. Signed-off-by: Thomas Gleixner --- arch/x86/kernel/apic/io_apic.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index e481e00a1b65..e9d4b963ba0e 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -2374,7 +2374,7 @@ static inline void irq_complete_move(struct irq_cfg *cfg) { } static void ack_apic_edge(struct irq_data *data) { irq_complete_move(data->chip_data); - move_native_irq(data->irq); + irq_move_irq(data); ack_APIC_irq(); } @@ -2520,7 +2520,7 @@ static void ack_apic_level(struct irq_data *data) * and you can go talk to the chipset vendor about it. */ if (!io_apic_level_ack_pending(cfg)) - move_masked_irq(irq); + irq_move_masked_irq(data); unmask_ioapic(cfg); } } From 1a0e62a49ad417712cfa79a395f6c39f67aadb44 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sat, 12 Mar 2011 13:47:18 +0100 Subject: [PATCH 627/706] x86: ioapic: Avoid redundant lookup of irq_cfg The caller of ioapic_register_intr() has a pointer to the irq_cfg for the irq already. Hand it in to avoid a full lookup. In msi_compose_msg() the pointer to irq_cfg is already available. No need to look it up again. Signed-off-by: Thomas Gleixner --- arch/x86/kernel/apic/io_apic.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index e9d4b963ba0e..4b5ebd26f565 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -1240,7 +1240,8 @@ static inline int IO_APIC_irq_trigger(int irq) } #endif -static void ioapic_register_intr(unsigned int irq, unsigned long trigger) +static void ioapic_register_intr(unsigned int irq, struct irq_cfg *cfg, + unsigned long trigger) { struct irq_chip *chip = &ioapic_chip; irq_flow_handler_t hdl; @@ -1255,7 +1256,7 @@ static void ioapic_register_intr(unsigned int irq, unsigned long trigger) fasteoi = false; } - if (irq_remapped(irq_get_chip_data(irq))) { + if (irq_remapped(cfg)) { irq_set_status_flags(irq, IRQ_MOVE_PCNTXT); chip = &ir_ioapic_chip; fasteoi = trigger != 0; @@ -1361,7 +1362,7 @@ static void setup_ioapic_irq(int apic_id, int pin, unsigned int irq, return; } - ioapic_register_intr(irq, trigger); + ioapic_register_intr(irq, cfg, trigger); if (irq < legacy_pic->nr_legacy_irqs) legacy_pic->mask(irq); @@ -3088,7 +3089,7 @@ static int msi_compose_msg(struct pci_dev *pdev, unsigned int irq, dest = apic->cpu_mask_to_apicid_and(cfg->domain, apic->target_cpus()); - if (irq_remapped(irq_get_chip_data(irq))) { + if (irq_remapped(cfg)) { struct irte irte; int ir_index; u16 sub_handle; From 517e49815677b43b26d3167aadca83919ef36a45 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 16 Dec 2010 17:59:57 +0100 Subject: [PATCH 628/706] x86: Use generic show_interrupts Signed-off-by: Thomas Gleixner --- arch/x86/Kconfig | 1 + arch/x86/kernel/irq.c | 57 ++----------------------------------------- 2 files changed, 3 insertions(+), 55 deletions(-) diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 
867a8cf04faa..ad74608349bd 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -66,6 +66,7 @@ config X86 select HAVE_SPARSE_IRQ select GENERIC_IRQ_PROBE select GENERIC_PENDING_IRQ if SMP + select GENERIC_IRQ_SHOW select USE_GENERIC_SMP_HELPERS if SMP config INSTRUCTION_DECODER diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c index 00bf99df583c..5ee693faa111 100644 --- a/arch/x86/kernel/irq.c +++ b/arch/x86/kernel/irq.c @@ -44,9 +44,9 @@ void ack_bad_irq(unsigned int irq) #define irq_stats(x) (&per_cpu(irq_stat, x)) /* - * /proc/interrupts printing: + * /proc/interrupts printing for arch specific interrupts */ -static int show_other_interrupts(struct seq_file *p, int prec) +int arch_show_interrupts(struct seq_file *p, int prec) { int j; @@ -122,59 +122,6 @@ static int show_other_interrupts(struct seq_file *p, int prec) return 0; } -int show_interrupts(struct seq_file *p, void *v) -{ - unsigned long flags, any_count = 0; - int i = *(loff_t *) v, j, prec; - struct irqaction *action; - struct irq_desc *desc; - - if (i > nr_irqs) - return 0; - - for (prec = 3, j = 1000; prec < 10 && j <= nr_irqs; ++prec) - j *= 10; - - if (i == nr_irqs) - return show_other_interrupts(p, prec); - - /* print header */ - if (i == 0) { - seq_printf(p, "%*s", prec + 8, ""); - for_each_online_cpu(j) - seq_printf(p, "CPU%-8d", j); - seq_putc(p, '\n'); - } - - desc = irq_to_desc(i); - if (!desc) - return 0; - - raw_spin_lock_irqsave(&desc->lock, flags); - for_each_online_cpu(j) - any_count |= kstat_irqs_cpu(i, j); - action = desc->action; - if (!action && !any_count) - goto out; - - seq_printf(p, "%*d: ", prec, i); - for_each_online_cpu(j) - seq_printf(p, "%10u ", kstat_irqs_cpu(i, j)); - seq_printf(p, " %8s", desc->irq_data.chip->name); - seq_printf(p, "-%-8s", desc->name); - - if (action) { - seq_printf(p, " %s", action->name); - while ((action = action->next) != NULL) - seq_printf(p, ", %s", action->name); - } - - seq_putc(p, '\n'); -out: - raw_spin_unlock_irqrestore(&desc->lock, flags); - return 0; -} - /* * /proc/stat helpers */ From 9bbbff25b31bbfdb512563cc5a14bcde5bf29bdf Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 27 Jan 2011 18:17:01 +0100 Subject: [PATCH 629/706] x86: Mark low level interrupts IRQF_NO_THREAD These cannot be threaded. 
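[ editor's note: a sketch of how a driver would mark a handler the same
  way through request_irq(); the demo module and irq number are made up,
  request_irq() and IRQF_NO_THREAD are the real interfaces this series
  relies on:

	#include <linux/interrupt.h>
	#include <linux/module.h>

	#define DEMO_IRQ 5 /* hypothetical line */

	static irqreturn_t demo_intr(int irq, void *dev_id)
	{
		return IRQ_HANDLED; /* low level glue, nothing to thread */
	}

	static int __init demo_init(void)
	{
		/*
		 * IRQF_NO_THREAD keeps the handler in hard irq context even
		 * when booting with "threadirqs" once IRQ_FORCED_THREADING
		 * is enabled by the next patch.
		 */
		return request_irq(DEMO_IRQ, demo_intr, IRQF_NO_THREAD,
				   "demo", NULL);
	}

	static void __exit demo_exit(void)
	{
		free_irq(DEMO_IRQ, NULL);
	}

	module_init(demo_init);
	module_exit(demo_exit);
	MODULE_LICENSE("GPL"); ]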
Signed-off-by: Thomas Gleixner --- arch/x86/kernel/irqinit.c | 2 ++ arch/x86/platform/visws/visws_quirks.c | 2 ++ 2 files changed, 4 insertions(+) diff --git a/arch/x86/kernel/irqinit.c b/arch/x86/kernel/irqinit.c index 3dd751ce6cc2..1cc302d16fb4 100644 --- a/arch/x86/kernel/irqinit.c +++ b/arch/x86/kernel/irqinit.c @@ -71,6 +71,7 @@ static irqreturn_t math_error_irq(int cpl, void *dev_id) static struct irqaction fpu_irq = { .handler = math_error_irq, .name = "fpu", + .flags = IRQF_NO_THREAD, }; #endif @@ -80,6 +81,7 @@ static struct irqaction fpu_irq = { static struct irqaction irq2 = { .handler = no_action, .name = "cascade", + .flags = IRQF_NO_THREAD, }; DEFINE_PER_CPU(vector_irq_t, vector_irq) = { diff --git a/arch/x86/platform/visws/visws_quirks.c b/arch/x86/platform/visws/visws_quirks.c index dcc7aea0ff64..fe4cf8294878 100644 --- a/arch/x86/platform/visws/visws_quirks.c +++ b/arch/x86/platform/visws/visws_quirks.c @@ -569,11 +569,13 @@ out_unlock: static struct irqaction master_action = { .handler = piix4_master_intr, .name = "PIIX4-8259", + .flags = IRQF_NO_THREAD, }; static struct irqaction cascade_action = { .handler = no_action, .name = "cascade", + .flags = IRQF_NO_THREAD, }; static inline void set_piix4_virtual_irq_type(void) From c0185808eb85139f45dbfd0de66963c498d0c4db Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 7 Feb 2011 02:24:08 +0100 Subject: [PATCH 630/706] x86: Enable forced interrupt threading support All non-threadable interrupts are marked. Enable forced irq threading support. Signed-off-by: Thomas Gleixner --- arch/x86/Kconfig | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index ad74608349bd..e7ef769eabf2 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -67,6 +67,7 @@ config X86 select GENERIC_IRQ_PROBE select GENERIC_PENDING_IRQ if SMP select GENERIC_IRQ_SHOW + select IRQ_FORCED_THREADING select USE_GENERIC_SMP_HELPERS if SMP config INSTRUCTION_DECODER From 6e6823d17b157f185be09f4c70181299f9273f0b Mon Sep 17 00:00:00 2001 From: Torben Hohn Date: Thu, 3 Mar 2011 18:26:14 +0100 Subject: [PATCH 631/706] posix-clocks: Check write permissions in posix syscalls pc_clock_settime() and pc_clock_adjtime() do not check whether the fd was opened in write mode, so a clock can be set with a read-only fd.
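[ editor's note: a user-space sketch of the hole being closed, assuming
  a dynamic posix clock character device such as /dev/ptp0 is present;
  the FD_TO_CLOCKID encoding is the one documented for PTP clocks.
  Before this patch the call could succeed on a read-only fd, with it
  the kernel fails with EACCES:

	#include <fcntl.h>
	#include <stdio.h>
	#include <time.h>
	#include <unistd.h>

	#define CLOCKFD 3
	#define FD_TO_CLOCKID(fd) ((~(clockid_t) (fd) << 3) | CLOCKFD)

	int main(void)
	{
		struct timespec ts = { .tv_sec = 0, .tv_nsec = 0 };
		int fd = open("/dev/ptp0", O_RDONLY); /* no write mode */

		if (fd < 0) {
			perror("open");
			return 1;
		}
		if (clock_settime(FD_TO_CLOCKID(fd), &ts))
			perror("clock_settime"); /* expect EACCES */
		close(fd);
		return 0;
	} ]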
[ tglx: We deliberately do not return -EPERM as we want this to be distinguishable from the capability-based permission check ] Signed-off-by: Torben Hohn LKML-Reference: <1299173174-348-4-git-send-email-torbenh@gmx.de> Cc: Richard Cochran Cc: John Stultz Cc: Thomas Gleixner --- kernel/time/posix-clock.c | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/kernel/time/posix-clock.c b/kernel/time/posix-clock.c index 04498cbf6002..25028dd4fa18 100644 --- a/kernel/time/posix-clock.c +++ b/kernel/time/posix-clock.c @@ -287,11 +287,16 @@ static int pc_clock_adjtime(clockid_t id, struct timex *tx) if (err) return err; + if ((cd.fp->f_mode & FMODE_WRITE) == 0) { + err = -EACCES; + goto out; + } + if (cd.clk->ops.clock_adjtime) err = cd.clk->ops.clock_adjtime(cd.clk, tx); else err = -EOPNOTSUPP; - +out: put_clock_desc(&cd); return err; @@ -344,11 +349,16 @@ static int pc_clock_settime(clockid_t id, const struct timespec *ts) if (err) return err; + if ((cd.fp->f_mode & FMODE_WRITE) == 0) { + err = -EACCES; + goto out; + } + if (cd.clk->ops.clock_settime) err = cd.clk->ops.clock_settime(cd.clk, ts); else err = -EOPNOTSUPP; - +out: put_clock_desc(&cd); return err; From 586ce098a23b6ab7383df853a84ae3d48dc889aa Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sun, 13 Mar 2011 01:50:58 -0500 Subject: [PATCH 632/706] compat breakage in preadv() and pwritev() Fix for a dumb preadv()/pwritev() compat bug - unlike the native variants, the compat_... ones forget to check FMODE_P{READ,WRITE}, so e.g. on a pipe the native preadv() will fail with -ESPIPE while the compat one will act as readv() and succeed. Not critical, but it's a clear bug with a trivial fix. Signed-off-by: Al Viro --- fs/compat.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/fs/compat.c b/fs/compat.c index f6fd0a00e6cc..691c3fd8ce1d 100644 --- a/fs/compat.c +++ b/fs/compat.c @@ -1228,7 +1228,9 @@ compat_sys_preadv(unsigned long fd, const struct compat_iovec __user *vec, file = fget_light(fd, &fput_needed); if (!file) return -EBADF; - ret = compat_readv(file, vec, vlen, &pos); + ret = -ESPIPE; + if (file->f_mode & FMODE_PREAD) + ret = compat_readv(file, vec, vlen, &pos); fput_light(file, fput_needed); return ret; } @@ -1285,7 +1287,9 @@ compat_sys_pwritev(unsigned long fd, const struct compat_iovec __user *vec, file = fget_light(fd, &fput_needed); if (!file) return -EBADF; - ret = compat_writev(file, vec, vlen, &pos); + ret = -ESPIPE; + if (file->f_mode & FMODE_PWRITE) + ret = compat_writev(file, vec, vlen, &pos); fput_light(file, fput_needed); return ret; } From 15a9155fe3e8215c02b80df51ec2cac7c0d726ad Mon Sep 17 00:00:00 2001 From: Al Viro Date: Wed, 16 Feb 2011 15:08:54 -0500 Subject: [PATCH 633/706] fix race in audit_get_nd() don't rely on pathname resolution ending up twice at the same point... Signed-off-by: Al Viro --- kernel/audit_watch.c | 87 +++++++++++++++++--------------------------- 1 file changed, 33 insertions(+), 54 deletions(-) diff --git a/kernel/audit_watch.c b/kernel/audit_watch.c index d2e3c7866460..20b9fe6907d0 100644 --- a/kernel/audit_watch.c +++ b/kernel/audit_watch.c @@ -144,9 +144,9 @@ int audit_watch_compare(struct audit_watch *watch, unsigned long ino, dev_t dev) } /* Initialize a parent watch entry. 
*/ -static struct audit_parent *audit_init_parent(struct nameidata *ndp) +static struct audit_parent *audit_init_parent(struct path *path) { - struct inode *inode = ndp->path.dentry->d_inode; + struct inode *inode = path->dentry->d_inode; struct audit_parent *parent; int ret; @@ -353,53 +353,40 @@ static void audit_remove_parent_watches(struct audit_parent *parent) } /* Get path information necessary for adding watches. */ -static int audit_get_nd(char *path, struct nameidata **ndp, struct nameidata **ndw) +static int audit_get_nd(struct audit_watch *watch, struct path *parent) { - struct nameidata *ndparent, *ndwatch; + struct nameidata nd; + struct dentry *d; int err; - ndparent = kmalloc(sizeof(*ndparent), GFP_KERNEL); - if (unlikely(!ndparent)) - return -ENOMEM; - - ndwatch = kmalloc(sizeof(*ndwatch), GFP_KERNEL); - if (unlikely(!ndwatch)) { - kfree(ndparent); - return -ENOMEM; - } - - err = path_lookup(path, LOOKUP_PARENT, ndparent); - if (err) { - kfree(ndparent); - kfree(ndwatch); + err = path_lookup(watch->path, LOOKUP_PARENT, &nd); + if (err) return err; + + if (nd.last_type != LAST_NORM) { + path_put(&nd.path); + return -EINVAL; } - err = path_lookup(path, 0, ndwatch); - if (err) { - kfree(ndwatch); - ndwatch = NULL; + mutex_lock_nested(&nd.path.dentry->d_inode->i_mutex, I_MUTEX_PARENT); + d = lookup_one_len(nd.last.name, nd.path.dentry, nd.last.len); + if (IS_ERR(d)) { + mutex_unlock(&nd.path.dentry->d_inode->i_mutex); + path_put(&nd.path); + return PTR_ERR(d); } + if (d->d_inode) { + /* update watch filter fields */ + watch->dev = d->d_inode->i_sb->s_dev; + watch->ino = d->d_inode->i_ino; + } + mutex_unlock(&nd.path.dentry->d_inode->i_mutex); - *ndp = ndparent; - *ndw = ndwatch; - + *parent = nd.path; + dput(d); return 0; } -/* Release resources used for watch path information. */ -static void audit_put_nd(struct nameidata *ndp, struct nameidata *ndw) -{ - if (ndp) { - path_put(&ndp->path); - kfree(ndp); - } - if (ndw) { - path_put(&ndw->path); - kfree(ndw); - } -} - /* Associate the given rule with an existing parent. * Caller must hold audit_filter_mutex. */ static void audit_add_to_parent(struct audit_krule *krule, @@ -440,31 +427,24 @@ int audit_add_watch(struct audit_krule *krule, struct list_head **list) { struct audit_watch *watch = krule->watch; struct audit_parent *parent; - struct nameidata *ndp = NULL, *ndw = NULL; + struct path parent_path; int h, ret = 0; mutex_unlock(&audit_filter_mutex); /* Avoid calling path_lookup under audit_filter_mutex. 
*/ - ret = audit_get_nd(watch->path, &ndp, &ndw); - if (ret) { - /* caller expects mutex locked */ - mutex_lock(&audit_filter_mutex); - goto error; - } + ret = audit_get_nd(watch, &parent_path); + /* caller expects mutex locked */ mutex_lock(&audit_filter_mutex); - /* update watch filter fields */ - if (ndw) { - watch->dev = ndw->path.dentry->d_inode->i_sb->s_dev; - watch->ino = ndw->path.dentry->d_inode->i_ino; - } + if (ret) + return ret; /* either find an old parent or attach a new one */ - parent = audit_find_parent(ndp->path.dentry->d_inode); + parent = audit_find_parent(parent_path.dentry->d_inode); if (!parent) { - parent = audit_init_parent(ndp); + parent = audit_init_parent(&parent_path); if (IS_ERR(parent)) { ret = PTR_ERR(parent); goto error; @@ -479,9 +459,8 @@ int audit_add_watch(struct audit_krule *krule, struct list_head **list) h = audit_hash_ino((u32)watch->ino); *list = &audit_inode_hash[h]; error: - audit_put_nd(ndp, ndw); /* NULL args OK */ + path_put(&parent_path); return ret; - } void audit_remove_watch_rule(struct audit_krule *krule) From c9c6cac0c2bdbda42e7b804838648d0bc60ddb13 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Wed, 16 Feb 2011 15:15:47 -0500 Subject: [PATCH 634/706] kill path_lookup() all remaining callers pass LOOKUP_PARENT to it, so flags argument can die; renamed to kern_path_parent() Signed-off-by: Al Viro --- arch/powerpc/platforms/cell/spufs/syscalls.c | 2 +- fs/namei.c | 7 +++---- fs/ocfs2/refcounttree.c | 2 +- include/linux/namei.h | 2 +- kernel/audit_watch.c | 2 +- net/unix/af_unix.c | 2 +- 6 files changed, 8 insertions(+), 9 deletions(-) diff --git a/arch/powerpc/platforms/cell/spufs/syscalls.c b/arch/powerpc/platforms/cell/spufs/syscalls.c index 187a7d32f86a..a3d2ce54ea2e 100644 --- a/arch/powerpc/platforms/cell/spufs/syscalls.c +++ b/arch/powerpc/platforms/cell/spufs/syscalls.c @@ -70,7 +70,7 @@ static long do_spu_create(const char __user *pathname, unsigned int flags, if (!IS_ERR(tmp)) { struct nameidata nd; - ret = path_lookup(tmp, LOOKUP_PARENT, &nd); + ret = kern_path_parent(tmp, &nd); if (!ret) { nd.flags |= LOOKUP_OPEN | LOOKUP_CREATE; ret = spufs_create(&nd, flags, mode, neighbor); diff --git a/fs/namei.c b/fs/namei.c index a4689eb2df28..1d6bc8151553 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -1747,10 +1747,9 @@ static int do_path_lookup(int dfd, const char *name, return retval; } -int path_lookup(const char *name, unsigned int flags, - struct nameidata *nd) +int kern_path_parent(const char *name, struct nameidata *nd) { - return do_path_lookup(AT_FDCWD, name, flags, nd); + return do_path_lookup(AT_FDCWD, name, LOOKUP_PARENT, nd); } int kern_path(const char *name, unsigned int flags, struct path *path) @@ -3586,7 +3585,7 @@ EXPORT_SYMBOL(page_readlink); EXPORT_SYMBOL(__page_symlink); EXPORT_SYMBOL(page_symlink); EXPORT_SYMBOL(page_symlink_inode_operations); -EXPORT_SYMBOL(path_lookup); +EXPORT_SYMBOL(kern_path_parent); EXPORT_SYMBOL(kern_path); EXPORT_SYMBOL(vfs_path_lookup); EXPORT_SYMBOL(inode_permission); diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c index 19ebc5aad391..29623da133cc 100644 --- a/fs/ocfs2/refcounttree.c +++ b/fs/ocfs2/refcounttree.c @@ -4379,7 +4379,7 @@ static int ocfs2_user_path_parent(const char __user *path, if (IS_ERR(s)) return PTR_ERR(s); - error = path_lookup(s, LOOKUP_PARENT, nd); + error = kern_path_parent(s, nd); if (error) putname(s); else diff --git a/include/linux/namei.h b/include/linux/namei.h index f276d4fa01fc..58ce3433d4ec 100644 --- a/include/linux/namei.h +++ 
b/include/linux/namei.h @@ -72,7 +72,7 @@ extern int user_path_at(int, const char __user *, unsigned, struct path *); extern int kern_path(const char *, unsigned, struct path *); -extern int path_lookup(const char *, unsigned, struct nameidata *); +extern int kern_path_parent(const char *, struct nameidata *); extern int vfs_path_lookup(struct dentry *, struct vfsmount *, const char *, unsigned int, struct nameidata *); diff --git a/kernel/audit_watch.c b/kernel/audit_watch.c index 20b9fe6907d0..e683869365d9 100644 --- a/kernel/audit_watch.c +++ b/kernel/audit_watch.c @@ -359,7 +359,7 @@ static int audit_get_nd(struct audit_watch *watch, struct path *parent) struct dentry *d; int err; - err = path_lookup(watch->path, LOOKUP_PARENT, &nd); + err = kern_path_parent(watch->path, &nd); if (err) return err; diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index dd419d286204..d8c04a602cf1 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -850,7 +850,7 @@ static int unix_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) * Get the parent directory, calculate the hash for last * component. */ - err = path_lookup(sunaddr->sun_path, LOOKUP_PARENT, &nd); + err = kern_path_parent(sunaddr->sun_path, &nd); if (err) goto out_mknod_parent; From 52094c8a0610cf57920ad4c6c57470ae2ccbbd25 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Mon, 21 Feb 2011 21:34:47 -0500 Subject: [PATCH 635/706] take RCU-dependent stuff around exec_permission() into a new helper Signed-off-by: Al Viro --- fs/namei.c | 25 ++++++++++++++----------- 1 file changed, 14 insertions(+), 11 deletions(-) diff --git a/fs/namei.c b/fs/namei.c index 1d6bc8151553..8c704465f6ce 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -1322,6 +1322,18 @@ fail: return PTR_ERR(dentry); } +static inline int may_lookup(struct nameidata *nd) +{ + if (nd->flags & LOOKUP_RCU) { + int err = exec_permission(nd->inode, IPERM_FLAG_RCU); + if (err != -ECHILD) + return err; + if (nameidata_drop_rcu(nd)) + return -ECHILD; + } + return exec_permission(nd->inode, 0); +} + /* * Name resolution. * This is the basic name resolution function, turning a pathname into @@ -1352,17 +1364,8 @@ static int link_path_walk(const char *name, struct nameidata *nd) unsigned int c; nd->flags |= LOOKUP_CONTINUE; - if (nd->flags & LOOKUP_RCU) { - err = exec_permission(nd->inode, IPERM_FLAG_RCU); - if (err == -ECHILD) { - if (nameidata_drop_rcu(nd)) - return -ECHILD; - goto exec_again; - } - } else { -exec_again: - err = exec_permission(nd->inode, 0); - } + + err = may_lookup(nd); if (err) break; From ee0827cd6b42b0385dc1a116cd853ac1b739f711 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Mon, 21 Feb 2011 23:38:09 -0500 Subject: [PATCH 636/706] sanitize path_walk() mess New helper: path_lookupat(). Basically, what do_path_lookup() boils down to modulo the -ECHILD/-ESTALE handling. The path_walk* family is gone; vfs_path_lookup() is using link_path_walk() directly, do_path_lookup() and do_filp_open() are using path_lookupat().
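[ editor's note: condensed from the do_path_lookup() hunk below, the
  control flow that replaces the path_walk* family is a three-step
  retry ladder (audit hooks omitted):

	static int do_path_lookup(int dfd, const char *name,
				  unsigned int flags, struct nameidata *nd)
	{
		int retval = path_lookupat(dfd, name, flags | LOOKUP_RCU, nd);
		if (unlikely(retval == -ECHILD))	/* RCU walk gave up */
			retval = path_lookupat(dfd, name, flags, nd);
		if (unlikely(retval == -ESTALE))	/* stale fs object */
			retval = path_lookupat(dfd, name,
					       flags | LOOKUP_REVAL, nd);
		return retval;
	} ]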
Signed-off-by: Al Viro --- fs/namei.c | 148 ++++++++++++++++++++--------------------------------- 1 file changed, 56 insertions(+), 92 deletions(-) diff --git a/fs/namei.c b/fs/namei.c index 8c704465f6ce..f5de5bb1a61f 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -1520,59 +1520,6 @@ return_err: return err; } -static inline int path_walk_rcu(const char *name, struct nameidata *nd) -{ - current->total_link_count = 0; - - return link_path_walk(name, nd); -} - -static inline int path_walk_simple(const char *name, struct nameidata *nd) -{ - current->total_link_count = 0; - - return link_path_walk(name, nd); -} - -static int path_walk(const char *name, struct nameidata *nd) -{ - struct path save = nd->path; - int result; - - current->total_link_count = 0; - - /* make sure the stuff we saved doesn't go away */ - path_get(&save); - - result = link_path_walk(name, nd); - if (result == -ESTALE) { - /* nd->path had been dropped */ - current->total_link_count = 0; - nd->path = save; - nd->inode = save.dentry->d_inode; - path_get(&nd->path); - nd->flags |= LOOKUP_REVAL; - result = link_path_walk(name, nd); - } - - path_put(&save); - - return result; -} - -static void path_finish_rcu(struct nameidata *nd) -{ - if (nd->flags & LOOKUP_RCU) { - /* RCU dangling. Cancel it. */ - nd->flags &= ~LOOKUP_RCU; - nd->root.mnt = NULL; - rcu_read_unlock(); - br_read_unlock(vfsmount_lock); - } - if (nd->file) - fput(nd->file); -} - static int path_init_rcu(int dfd, const char *name, unsigned int flags, struct nameidata *nd) { int retval = 0; @@ -1697,7 +1644,7 @@ out_fail: } /* Returns 0 and nd will be valid on success; Retuns error, otherwise. */ -static int do_path_lookup(int dfd, const char *name, +static int path_lookupat(int dfd, const char *name, unsigned int flags, struct nameidata *nd) { int retval; @@ -1716,29 +1663,45 @@ static int do_path_lookup(int dfd, const char *name, * be handled by restarting a traditional ref-walk (which will always * be able to complete). */ - retval = path_init_rcu(dfd, name, flags, nd); + if (flags & LOOKUP_RCU) + retval = path_init_rcu(dfd, name, flags, nd); + else + retval = path_init(dfd, name, flags, nd); + if (unlikely(retval)) return retval; - retval = path_walk_rcu(name, nd); - path_finish_rcu(nd); + + current->total_link_count = 0; + retval = link_path_walk(name, nd); + + if (nd->flags & LOOKUP_RCU) { + /* RCU dangling. Cancel it. 
*/ + nd->flags &= ~LOOKUP_RCU; + nd->root.mnt = NULL; + rcu_read_unlock(); + br_read_unlock(vfsmount_lock); + } + + if (nd->file) { + fput(nd->file); + nd->file = NULL; + } + if (nd->root.mnt) { path_put(&nd->root); nd->root.mnt = NULL; } + return retval; +} - if (unlikely(retval == -ECHILD || retval == -ESTALE)) { - /* slower, locked walk */ - if (retval == -ESTALE) - flags |= LOOKUP_REVAL; - retval = path_init(dfd, name, flags, nd); - if (unlikely(retval)) - return retval; - retval = path_walk(name, nd); - if (nd->root.mnt) { - path_put(&nd->root); - nd->root.mnt = NULL; - } - } +static int do_path_lookup(int dfd, const char *name, + unsigned int flags, struct nameidata *nd) +{ + int retval = path_lookupat(dfd, name, flags | LOOKUP_RCU, nd); + if (unlikely(retval == -ECHILD)) + retval = path_lookupat(dfd, name, flags, nd); + if (unlikely(retval == -ESTALE)) + retval = path_lookupat(dfd, name, flags | LOOKUP_REVAL, nd); if (likely(!retval)) { if (unlikely(!audit_dummy_context())) { @@ -1746,7 +1709,6 @@ static int do_path_lookup(int dfd, const char *name, audit_inode(name, nd->path.dentry); } } - return retval; } @@ -1776,7 +1738,7 @@ int vfs_path_lookup(struct dentry *dentry, struct vfsmount *mnt, const char *name, unsigned int flags, struct nameidata *nd) { - int retval; + int result; /* same as do_path_lookup */ nd->last_type = LAST_ROOT; @@ -1790,15 +1752,27 @@ int vfs_path_lookup(struct dentry *dentry, struct vfsmount *mnt, path_get(&nd->root); nd->inode = nd->path.dentry->d_inode; - retval = path_walk(name, nd); - if (unlikely(!retval && !audit_dummy_context() && nd->path.dentry && + current->total_link_count = 0; + + result = link_path_walk(name, nd); + if (result == -ESTALE) { + /* nd->path had been dropped */ + current->total_link_count = 0; + nd->path.dentry = dentry; + nd->path.mnt = mnt; + nd->inode = dentry->d_inode; + path_get(&nd->path); + nd->flags |= LOOKUP_REVAL; + result = link_path_walk(name, nd); + } + if (unlikely(!result && !audit_dummy_context() && nd->path.dentry && nd->inode)) audit_inode(name, nd->path.dentry); path_put(&nd->root); nd->root.mnt = NULL; - return retval; + return result; } static struct dentry *__lookup_hash(struct qstr *name, @@ -2483,24 +2457,14 @@ out_filp2: creat: /* OK, have to create the file. Find the parent. */ - error = path_init_rcu(dfd, pathname, - LOOKUP_PARENT | (flags & LOOKUP_REVAL), &nd); - if (error) - goto out_filp; - error = path_walk_rcu(pathname, &nd); - path_finish_rcu(&nd); - if (unlikely(error == -ECHILD || error == -ESTALE)) { - /* slower, locked walk */ - if (error == -ESTALE) { + error = path_lookupat(dfd, pathname, LOOKUP_PARENT | LOOKUP_RCU, &nd); + if (unlikely(error == -ECHILD)) + error = path_lookupat(dfd, pathname, LOOKUP_PARENT, &nd); + if (unlikely(error == -ESTALE)) { reval: - flags |= LOOKUP_REVAL; - } - error = path_init(dfd, pathname, - LOOKUP_PARENT | (flags & LOOKUP_REVAL), &nd); - if (error) - goto out_filp; - - error = path_walk_simple(pathname, &nd); + flags |= LOOKUP_REVAL; + error = path_lookupat(dfd, pathname, + LOOKUP_PARENT | LOOKUP_REVAL, &nd); } if (unlikely(error)) goto out_filp; From e41f7d4ee5bdb00da7d327a00b0ab9c4a2e9eaa3 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Tue, 22 Feb 2011 14:02:58 -0500 Subject: [PATCH 637/706] merge path_init and path_init_rcu Actual dependency on whether we want RCU or not is in 3 small areas (as it ought to be) and everything around those is the same in both versions. 
Since each function has only one caller and those callers are on two sides of if (flags & LOOKUP_RCU), it's easier and cleaner to merge them and pull the checks inside. Signed-off-by: Al Viro --- fs/namei.c | 120 ++++++++++++++++------------------------------------- 1 file changed, 36 insertions(+), 84 deletions(-) diff --git a/fs/namei.c b/fs/namei.c index f5de5bb1a61f..b9e537980ef5 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -1520,80 +1520,6 @@ return_err: return err; } -static int path_init_rcu(int dfd, const char *name, unsigned int flags, struct nameidata *nd) -{ - int retval = 0; - int fput_needed; - struct file *file; - - nd->last_type = LAST_ROOT; /* if there are only slashes... */ - nd->flags = flags | LOOKUP_RCU; - nd->depth = 0; - nd->root.mnt = NULL; - nd->file = NULL; - - if (*name=='/') { - struct fs_struct *fs = current->fs; - unsigned seq; - - br_read_lock(vfsmount_lock); - rcu_read_lock(); - - do { - seq = read_seqcount_begin(&fs->seq); - nd->root = fs->root; - nd->path = nd->root; - nd->seq = __read_seqcount_begin(&nd->path.dentry->d_seq); - } while (read_seqcount_retry(&fs->seq, seq)); - - } else if (dfd == AT_FDCWD) { - struct fs_struct *fs = current->fs; - unsigned seq; - - br_read_lock(vfsmount_lock); - rcu_read_lock(); - - do { - seq = read_seqcount_begin(&fs->seq); - nd->path = fs->pwd; - nd->seq = __read_seqcount_begin(&nd->path.dentry->d_seq); - } while (read_seqcount_retry(&fs->seq, seq)); - - } else { - struct dentry *dentry; - - file = fget_light(dfd, &fput_needed); - retval = -EBADF; - if (!file) - goto out_fail; - - dentry = file->f_path.dentry; - - retval = -ENOTDIR; - if (!S_ISDIR(dentry->d_inode->i_mode)) - goto fput_fail; - - retval = file_permission(file, MAY_EXEC); - if (retval) - goto fput_fail; - - nd->path = file->f_path; - if (fput_needed) - nd->file = file; - - nd->seq = __read_seqcount_begin(&nd->path.dentry->d_seq); - br_read_lock(vfsmount_lock); - rcu_read_lock(); - } - nd->inode = nd->path.dentry->d_inode; - return 0; - -fput_fail: - fput_light(file, fput_needed); -out_fail: - return retval; -} - static int path_init(int dfd, const char *name, unsigned int flags, struct nameidata *nd) { int retval = 0; @@ -1604,13 +1530,34 @@ static int path_init(int dfd, const char *name, unsigned int flags, struct namei nd->flags = flags; nd->depth = 0; nd->root.mnt = NULL; + nd->file = NULL; if (*name=='/') { - set_root(nd); + if (flags & LOOKUP_RCU) { + br_read_lock(vfsmount_lock); + rcu_read_lock(); + set_root_rcu(nd); + } else { + set_root(nd); + path_get(&nd->root); + } nd->path = nd->root; - path_get(&nd->root); } else if (dfd == AT_FDCWD) { - get_fs_pwd(current->fs, &nd->path); + if (flags & LOOKUP_RCU) { + struct fs_struct *fs = current->fs; + unsigned seq; + + br_read_lock(vfsmount_lock); + rcu_read_lock(); + + do { + seq = read_seqcount_begin(&fs->seq); + nd->path = fs->pwd; + nd->seq = __read_seqcount_begin(&nd->path.dentry->d_seq); + } while (read_seqcount_retry(&fs->seq, seq)); + } else { + get_fs_pwd(current->fs, &nd->path); + } } else { struct dentry *dentry; @@ -1630,10 +1577,18 @@ static int path_init(int dfd, const char *name, unsigned int flags, struct namei goto fput_fail; nd->path = file->f_path; - path_get(&file->f_path); - - fput_light(file, fput_needed); + if (flags & LOOKUP_RCU) { + if (fput_needed) + nd->file = file; + nd->seq = __read_seqcount_begin(&nd->path.dentry->d_seq); + br_read_lock(vfsmount_lock); + rcu_read_lock(); + } else { + path_get(&file->f_path); + fput_light(file, fput_needed); + } } + nd->inode = 
nd->path.dentry->d_inode; return 0; @@ -1663,10 +1618,7 @@ static int path_lookupat(int dfd, const char *name, * be handled by restarting a traditional ref-walk (which will always * be able to complete). */ - if (flags & LOOKUP_RCU) - retval = path_init_rcu(dfd, name, flags, nd); - else - retval = path_init(dfd, name, flags, nd); + retval = path_init(dfd, name, flags, nd); if (unlikely(retval)) return retval; From fe479a580dc9c737c4eb49ff7fdb31d41d2c7003 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Tue, 22 Feb 2011 15:10:03 -0500 Subject: [PATCH 638/706] merge component type recognition no need to do it in three places... Signed-off-by: Al Viro --- fs/namei.c | 48 ++++++++++++++++++++++-------------------------- 1 file changed, 22 insertions(+), 26 deletions(-) diff --git a/fs/namei.c b/fs/namei.c index b9e537980ef5..4521b5ff7c93 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -1362,6 +1362,7 @@ static int link_path_walk(const char *name, struct nameidata *nd) unsigned long hash; struct qstr this; unsigned int c; + int type; nd->flags |= LOOKUP_CONTINUE; @@ -1381,6 +1382,16 @@ static int link_path_walk(const char *name, struct nameidata *nd) this.len = name - (const char *) this.name; this.hash = end_name_hash(hash); + type = LAST_NORM; + if (this.name[0] == '.') switch (this.len) { + case 2: + if (this.name[1] == '.') + type = LAST_DOTDOT; + break; + case 1: + type = LAST_DOT; + } + /* remove trailing slashes? */ if (!c) goto last_component; @@ -1393,21 +1404,17 @@ static int link_path_walk(const char *name, struct nameidata *nd) * to be able to know about the current root directory and * parent relationships. */ - if (this.name[0] == '.') switch (this.len) { - default: - break; - case 2: - if (this.name[1] != '.') - break; + if (unlikely(type != LAST_NORM)) { + if (type == LAST_DOTDOT) { if (nd->flags & LOOKUP_RCU) { if (follow_dotdot_rcu(nd)) return -ECHILD; } else follow_dotdot(nd); - /* fallthrough */ - case 1: - continue; + } + continue; } + /* This does the actual lookups.. */ err = do_lookup(nd, &this, &next, &inode); if (err) @@ -1441,20 +1448,15 @@ last_component: nd->flags &= lookup_flags | ~LOOKUP_CONTINUE; if (lookup_flags & LOOKUP_PARENT) goto lookup_parent; - if (this.name[0] == '.') switch (this.len) { - default: - break; - case 2: - if (this.name[1] != '.') - break; + if (unlikely(type != LAST_NORM)) { + if (type == LAST_DOTDOT) { if (nd->flags & LOOKUP_RCU) { if (follow_dotdot_rcu(nd)) return -ECHILD; } else follow_dotdot(nd); - /* fallthrough */ - case 1: - goto return_reval; + } + goto return_reval; } err = do_lookup(nd, &this, &next, &inode); if (err) @@ -1480,14 +1482,8 @@ last_component: goto return_base; lookup_parent: nd->last = this; - nd->last_type = LAST_NORM; - if (this.name[0] != '.') - goto return_base; - if (this.len == 1) - nd->last_type = LAST_DOT; - else if (this.len == 2 && this.name[1] == '.') - nd->last_type = LAST_DOTDOT; - else + nd->last_type = type; + if (type == LAST_NORM) goto return_base; return_reval: /* From 16c2cd7179881d5dd87779512ca5a0d657c64f62 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Tue, 22 Feb 2011 15:50:10 -0500 Subject: [PATCH 639/706] untangle the "need_reval_dot" mess instead of ad-hackery around need_reval_dot(), do the following: set a flag (LOOKUP_JUMPED) in the beginning of path, on absolute symlink traversal, on ".." and on procfs-style symlinks. Clear on normal components, leave unchanged on ".". 
Non-nested callers of link_path_walk() call handle_reval_path(), which checks that flag is set and that fs does want the final revalidate thing, then does ->d_revalidate(). In link_path_walk() all the return_reval stuff is gone. Signed-off-by: Al Viro --- fs/namei.c | 107 +++++++++++++++++------------------------- include/linux/namei.h | 2 + 2 files changed, 46 insertions(+), 63 deletions(-) diff --git a/fs/namei.c b/fs/namei.c index 4521b5ff7c93..450b686e9682 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -613,19 +613,8 @@ do_revalidate_rcu(struct dentry *dentry, struct nameidata *nd) return dentry; } -static inline int need_reval_dot(struct dentry *dentry) -{ - if (likely(!(dentry->d_flags & DCACHE_OP_REVALIDATE))) - return 0; - - if (likely(!(dentry->d_sb->s_type->fs_flags & FS_REVAL_DOT))) - return 0; - - return 1; -} - /* - * force_reval_path - force revalidation of a dentry + * handle_reval_path - force revalidation of a dentry * * In some situations the path walking code will trust dentries without * revalidating them. This causes problems for filesystems that depend on @@ -639,27 +628,28 @@ static inline int need_reval_dot(struct dentry *dentry) * invalidate the dentry. It's up to the caller to handle putting references * to the path if necessary. */ -static int -force_reval_path(struct path *path, struct nameidata *nd) +static inline int handle_reval_path(struct nameidata *nd) { + struct dentry *dentry = nd->path.dentry; int status; - struct dentry *dentry = path->dentry; - /* - * only check on filesystems where it's possible for the dentry to - * become stale. - */ - if (!need_reval_dot(dentry)) + if (likely(!(nd->flags & LOOKUP_JUMPED))) return 0; + if (likely(!(dentry->d_flags & DCACHE_OP_REVALIDATE))) + return 0; + + if (likely(!(dentry->d_sb->s_type->fs_flags & FS_REVAL_DOT))) + return 0; + + /* Note: we do not d_invalidate() */ status = d_revalidate(dentry, nd); if (status > 0) return 0; - if (!status) { - d_invalidate(dentry); + if (!status) status = -ESTALE; - } + return status; } @@ -728,6 +718,7 @@ static __always_inline int __vfs_follow_link(struct nameidata *nd, const char *l path_put(&nd->path); nd->path = nd->root; path_get(&nd->root); + nd->flags |= LOOKUP_JUMPED; } nd->inode = nd->path.dentry->d_inode; @@ -779,11 +770,8 @@ __do_follow_link(const struct path *link, struct nameidata *nd, void **p) error = 0; if (s) error = __vfs_follow_link(nd, s); - else if (nd->last_type == LAST_BIND) { - error = force_reval_path(&nd->path, nd); - if (error) - path_put(&nd->path); - } + else if (nd->last_type == LAST_BIND) + nd->flags |= LOOKUP_JUMPED; } return error; } @@ -1351,7 +1339,7 @@ static int link_path_walk(const char *name, struct nameidata *nd) while (*name=='/') name++; if (!*name) - goto return_reval; + goto return_base; if (nd->depth) lookup_flags = LOOKUP_FOLLOW | (nd->flags & LOOKUP_CONTINUE); @@ -1385,12 +1373,16 @@ static int link_path_walk(const char *name, struct nameidata *nd) type = LAST_NORM; if (this.name[0] == '.') switch (this.len) { case 2: - if (this.name[1] == '.') + if (this.name[1] == '.') { type = LAST_DOTDOT; + nd->flags |= LOOKUP_JUMPED; + } break; case 1: type = LAST_DOT; } + if (likely(type == LAST_NORM)) + nd->flags &= ~LOOKUP_JUMPED; /* remove trailing slashes? 
*/ if (!c) @@ -1456,7 +1448,7 @@ last_component: } else follow_dotdot(nd); } - goto return_reval; + goto return_base; } err = do_lookup(nd, &this, &next, &inode); if (err) @@ -1483,24 +1475,6 @@ last_component: lookup_parent: nd->last = this; nd->last_type = type; - if (type == LAST_NORM) - goto return_base; -return_reval: - /* - * We bypassed the ordinary revalidation routines. - * We may need to check the cached dentry for staleness. - */ - if (need_reval_dot(nd->path.dentry)) { - if (nameidata_drop_rcu_last_maybe(nd)) - return -ECHILD; - /* Note: we do not d_invalidate() */ - err = d_revalidate(nd->path.dentry, nd); - if (!err) - err = -ESTALE; - if (err < 0) - break; - return 0; - } return_base: if (nameidata_drop_rcu_last_maybe(nd)) return -ECHILD; @@ -1523,7 +1497,7 @@ static int path_init(int dfd, const char *name, unsigned int flags, struct namei struct file *file; nd->last_type = LAST_ROOT; /* if there are only slashes... */ - nd->flags = flags; + nd->flags = flags | LOOKUP_JUMPED; nd->depth = 0; nd->root.mnt = NULL; nd->file = NULL; @@ -1630,6 +1604,9 @@ static int path_lookupat(int dfd, const char *name, br_read_unlock(vfsmount_lock); } + if (!retval) + retval = handle_reval_path(nd); + if (nd->file) { fput(nd->file); nd->file = NULL; @@ -1690,7 +1667,7 @@ int vfs_path_lookup(struct dentry *dentry, struct vfsmount *mnt, /* same as do_path_lookup */ nd->last_type = LAST_ROOT; - nd->flags = flags; + nd->flags = flags | LOOKUP_JUMPED; nd->depth = 0; nd->path.dentry = dentry; @@ -1703,6 +1680,8 @@ int vfs_path_lookup(struct dentry *dentry, struct vfsmount *mnt, current->total_link_count = 0; result = link_path_walk(name, nd); + if (!result) + result = handle_reval_path(nd); if (result == -ESTALE) { /* nd->path had been dropped */ current->total_link_count = 0; @@ -1710,8 +1689,11 @@ int vfs_path_lookup(struct dentry *dentry, struct vfsmount *mnt, nd->path.mnt = mnt; nd->inode = dentry->d_inode; path_get(&nd->path); - nd->flags |= LOOKUP_REVAL; + nd->flags = flags | LOOKUP_JUMPED | LOOKUP_REVAL; + result = link_path_walk(name, nd); + if (!result) + result = handle_reval_path(nd); } if (unlikely(!result && !audit_dummy_context() && nd->path.dentry && nd->inode)) @@ -2198,30 +2180,29 @@ static struct file *do_last(struct nameidata *nd, struct path *path, { struct dentry *dir = nd->path.dentry; struct file *filp; - int error = -EISDIR; + int error; switch (nd->last_type) { case LAST_DOTDOT: follow_dotdot(nd); dir = nd->path.dentry; case LAST_DOT: - if (need_reval_dot(dir)) { - int status = d_revalidate(nd->path.dentry, nd); - if (!status) - status = -ESTALE; - if (status < 0) { - error = status; - goto exit; - } - } /* fallthrough */ case LAST_ROOT: + error = handle_reval_path(nd); + if (error) + goto exit; + error = -EISDIR; goto exit; case LAST_BIND: + error = handle_reval_path(nd); + if (error) + goto exit; audit_inode(pathname, dir); goto ok; } + error = -EISDIR; /* trailing slashes? */ if (nd->last.name[nd->last.len]) goto exit; @@ -2422,7 +2403,7 @@ reval: /* * We have the parent and last component. 
*/ - nd.flags = flags; + nd.flags = (nd.flags & ~LOOKUP_PARENT) | flags; filp = do_last(&nd, &path, open_flag, acc_mode, mode, pathname); while (unlikely(!filp)) { /* trailing symlink */ struct path link = path; diff --git a/include/linux/namei.h b/include/linux/namei.h index 58ce3433d4ec..265378a707bd 100644 --- a/include/linux/namei.h +++ b/include/linux/namei.h @@ -63,6 +63,8 @@ enum {LAST_NORM, LAST_ROOT, LAST_DOT, LAST_DOTDOT, LAST_BIND}; #define LOOKUP_EXCL 0x0400 #define LOOKUP_RENAME_TARGET 0x0800 +#define LOOKUP_JUMPED 0x1000 + extern int user_path_at(int, const char __user *, unsigned, struct path *); #define user_path(name, path) user_path_at(AT_FDCWD, name, LOOKUP_FOLLOW, path) From 086e183a641109033420e0b26ddecb6f4abb4c89 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Tue, 22 Feb 2011 20:56:27 -0500 Subject: [PATCH 640/706] pull dropping RCU on success of link_path_walk() into path_lookupat() Signed-off-by: Al Viro --- fs/namei.c | 30 ++++++++++++------------------ 1 file changed, 12 insertions(+), 18 deletions(-) diff --git a/fs/namei.c b/fs/namei.c index 450b686e9682..8f10a9ff9f6b 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -539,14 +539,6 @@ err_unlock: return -ECHILD; } -/* Try to drop out of rcu-walk mode if we were in it, otherwise do nothing. */ -static inline int nameidata_drop_rcu_last_maybe(struct nameidata *nd) -{ - if (likely(nd->flags & LOOKUP_RCU)) - return nameidata_drop_rcu_last(nd); - return 0; -} - /** * release_open_intent - free up open intent resources * @nd: pointer to nameidata @@ -1339,7 +1331,7 @@ static int link_path_walk(const char *name, struct nameidata *nd) while (*name=='/') name++; if (!*name) - goto return_base; + return 0; if (nd->depth) lookup_flags = LOOKUP_FOLLOW | (nd->flags & LOOKUP_CONTINUE); @@ -1448,7 +1440,7 @@ last_component: } else follow_dotdot(nd); } - goto return_base; + return 0; } err = do_lookup(nd, &this, &next, &inode); if (err) @@ -1471,13 +1463,10 @@ last_component: if (!nd->inode->i_op->lookup) break; } - goto return_base; + return 0; lookup_parent: nd->last = this; nd->last_type = type; -return_base: - if (nameidata_drop_rcu_last_maybe(nd)) - return -ECHILD; return 0; out_dput: if (!(nd->flags & LOOKUP_RCU)) @@ -1598,10 +1587,15 @@ static int path_lookupat(int dfd, const char *name, if (nd->flags & LOOKUP_RCU) { /* RCU dangling. Cancel it. 
*/ - nd->flags &= ~LOOKUP_RCU; - nd->root.mnt = NULL; - rcu_read_unlock(); - br_read_unlock(vfsmount_lock); + if (!retval) { + if (nameidata_drop_rcu_last(nd)) + retval = -ECHILD; + } else { + nd->flags &= ~LOOKUP_RCU; + nd->root.mnt = NULL; + rcu_read_unlock(); + br_read_unlock(vfsmount_lock); + } } if (!retval) From 36f3b4f69070fee7c647bab5dc4408990bb3606c Mon Sep 17 00:00:00 2001 From: Al Viro Date: Tue, 22 Feb 2011 21:24:38 -0500 Subject: [PATCH 641/706] pull security_inode_follow_link() into __do_follow_link() Signed-off-by: Al Viro --- fs/namei.c | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/fs/namei.c b/fs/namei.c index 8f10a9ff9f6b..f956567270bb 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -754,6 +754,13 @@ __do_follow_link(const struct path *link, struct nameidata *nd, void **p) if (link->mnt == nd->path.mnt) mntget(link->mnt); + error = security_inode_follow_link(link->dentry, nd); + if (error) { + *p = ERR_PTR(error); /* no ->put_link(), please */ + path_put(&nd->path); + return error; + } + nd->last_type = LAST_BIND; *p = dentry->d_inode->i_op->follow_link(dentry, nd); error = PTR_ERR(*p); @@ -791,9 +798,6 @@ static inline int do_follow_link(struct inode *inode, struct path *path, struct goto loop; BUG_ON(nd->depth >= MAX_NESTED_LINKS); cond_resched(); - err = security_inode_follow_link(path->dentry, nd); - if (err) - goto loop; current->link_count++; current->total_link_count++; nd->depth++; @@ -2420,9 +2424,6 @@ reval: * just set LAST_BIND. */ nd.flags |= LOOKUP_PARENT; - error = security_inode_follow_link(link.dentry, &nd); - if (error) - goto exit_dput; error = __do_follow_link(&link, &nd, &cookie); if (unlikely(error)) { if (!IS_ERR(cookie) && linki->i_op->put_link) From f1afe9efc84476ca42fbb7301a441021063eead7 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Tue, 22 Feb 2011 22:27:28 -0500 Subject: [PATCH 642/706] clean up the failure exits after __do_follow_link() in do_filp_open() Signed-off-by: Al Viro --- fs/namei.c | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) diff --git a/fs/namei.c b/fs/namei.c index f956567270bb..e0f59031be87 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -2426,15 +2426,12 @@ reval: nd.flags |= LOOKUP_PARENT; error = __do_follow_link(&link, &nd, &cookie); if (unlikely(error)) { - if (!IS_ERR(cookie) && linki->i_op->put_link) - linki->i_op->put_link(link.dentry, &nd, cookie); - /* nd.path had been dropped */ - nd.path = link; - goto out_path; + filp = ERR_PTR(error); + } else { + nd.flags &= ~LOOKUP_PARENT; + filp = do_last(&nd, &path, open_flag, acc_mode, mode, pathname); } - nd.flags &= ~LOOKUP_PARENT; - filp = do_last(&nd, &path, open_flag, acc_mode, mode, pathname); - if (linki->i_op->put_link) + if (!IS_ERR(cookie) && linki->i_op->put_link) linki->i_op->put_link(link.dentry, &nd, cookie); path_put(&link); } From c3e380b0b3cfa613189fb91513efd88a65e1d9d8 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Wed, 23 Feb 2011 13:39:45 -0500 Subject: [PATCH 643/706] Collect "operation mode" arguments of do_last() into a structure No point messing with passing shitloads of "operation mode" arguments to do_last() one by one, especially since they are not going to change during do_filp_open(). Collect them into a struct, fill it and pass to do_last() by reference. Make sure that lookup intent flags are correctly set and removed - we want them for do_last(), but they make no sense for __do_follow_link(). 
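[ editor's note: the shape of the change, reduced from the hunks below;
  the four "operation mode" values are computed once per do_filp_open()
  call and handed around as a single pointer (error handling omitted):

	struct open_flags {
		int open_flag;	/* O_* flags as passed to open(2) */
		int mode;	/* creation mode, used when O_CREAT is set */
		int acc_mode;	/* MAY_* permission mask */
		int intent;	/* LOOKUP_OPEN | LOOKUP_CREATE | LOOKUP_EXCL */
	};

	struct open_flags op = {
		.open_flag = open_flag,
		.mode	   = mode,
		.acc_mode  = MAY_OPEN | ACC_MODE(open_flag),
		.intent	   = LOOKUP_OPEN,
	};
	filp = do_last(&nd, &path, &op, pathname); ]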
Signed-off-by: Al Viro --- fs/namei.c | 57 +++++++++++++++++++++++++++++++++--------------------- 1 file changed, 35 insertions(+), 22 deletions(-) diff --git a/fs/namei.c b/fs/namei.c index e0f59031be87..5e4206f45371 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -2169,17 +2169,26 @@ exit: return ERR_PTR(error); } +struct open_flags { + int open_flag; + int mode; + int acc_mode; + int intent; +}; + /* * Handle O_CREAT case for do_filp_open */ static struct file *do_last(struct nameidata *nd, struct path *path, - int open_flag, int acc_mode, - int mode, const char *pathname) + const struct open_flags *op, const char *pathname) { struct dentry *dir = nd->path.dentry; struct file *filp; int error; + nd->flags &= ~LOOKUP_PARENT; + nd->flags |= op->intent; + switch (nd->last_type) { case LAST_DOTDOT: follow_dotdot(nd); @@ -2233,7 +2242,7 @@ static struct file *do_last(struct nameidata *nd, struct path *path, error = mnt_want_write(nd->path.mnt); if (error) goto exit_mutex_unlock; - error = __open_namei_create(nd, path, open_flag, mode); + error = __open_namei_create(nd, path, op->open_flag, op->mode); if (error) { mnt_drop_write(nd->path.mnt); goto exit; @@ -2242,7 +2251,7 @@ static struct file *do_last(struct nameidata *nd, struct path *path, mnt_drop_write(nd->path.mnt); path_put(&nd->path); if (!IS_ERR(filp)) { - error = ima_file_check(filp, acc_mode); + error = ima_file_check(filp, op->acc_mode); if (error) { fput(filp); filp = ERR_PTR(error); @@ -2258,7 +2267,7 @@ static struct file *do_last(struct nameidata *nd, struct path *path, audit_inode(pathname, path->dentry); error = -EEXIST; - if (open_flag & O_EXCL) + if (op->open_flag & O_EXCL) goto exit_dput; error = follow_managed(path, nd->flags); @@ -2278,7 +2287,7 @@ static struct file *do_last(struct nameidata *nd, struct path *path, if (S_ISDIR(nd->inode->i_mode)) goto exit; ok: - filp = finish_open(nd, open_flag, acc_mode); + filp = finish_open(nd, op->open_flag, op->acc_mode); return filp; exit_mutex_unlock: @@ -2304,7 +2313,8 @@ struct file *do_filp_open(int dfd, const char *pathname, struct path path; int count = 0; int flag = open_to_namei_flags(open_flag); - int flags; + int flags = 0; + struct open_flags op; if (!(open_flag & O_CREAT)) mode = 0; @@ -2321,6 +2331,8 @@ struct file *do_filp_open(int dfd, const char *pathname, if (open_flag & __O_SYNC) open_flag |= O_DSYNC; + op.open_flag = open_flag; + if (!acc_mode) acc_mode = MAY_OPEN | ACC_MODE(open_flag); @@ -2333,12 +2345,15 @@ struct file *do_filp_open(int dfd, const char *pathname, if (open_flag & O_APPEND) acc_mode |= MAY_APPEND; - flags = LOOKUP_OPEN; + op.acc_mode = acc_mode; + + op.intent = LOOKUP_OPEN; if (open_flag & O_CREAT) { - flags |= LOOKUP_CREATE; + op.intent |= LOOKUP_CREATE; if (open_flag & O_EXCL) - flags |= LOOKUP_EXCL; + op.intent |= LOOKUP_EXCL; } + if (open_flag & O_DIRECTORY) flags |= LOOKUP_DIRECTORY; if (!(open_flag & O_NOFOLLOW)) @@ -2357,7 +2372,7 @@ struct file *do_filp_open(int dfd, const char *pathname, goto creat; /* !O_CREAT, simple open */ - error = do_path_lookup(dfd, pathname, flags, &nd); + error = do_path_lookup(dfd, pathname, flags | op.intent, &nd); if (unlikely(error)) goto out_filp2; error = -ELOOP; @@ -2384,14 +2399,14 @@ out_filp2: creat: /* OK, have to create the file. Find the parent. 
*/ - error = path_lookupat(dfd, pathname, LOOKUP_PARENT | LOOKUP_RCU, &nd); + error = path_lookupat(dfd, pathname, + LOOKUP_PARENT | LOOKUP_RCU | flags, &nd); if (unlikely(error == -ECHILD)) - error = path_lookupat(dfd, pathname, LOOKUP_PARENT, &nd); + error = path_lookupat(dfd, pathname, LOOKUP_PARENT | flags, &nd); if (unlikely(error == -ESTALE)) { reval: flags |= LOOKUP_REVAL; - error = path_lookupat(dfd, pathname, - LOOKUP_PARENT | LOOKUP_REVAL, &nd); + error = path_lookupat(dfd, pathname, LOOKUP_PARENT | flags, &nd); } if (unlikely(error)) goto out_filp; @@ -2401,8 +2416,7 @@ reval: /* * We have the parent and last component. */ - nd.flags = (nd.flags & ~LOOKUP_PARENT) | flags; - filp = do_last(&nd, &path, open_flag, acc_mode, mode, pathname); + filp = do_last(&nd, &path, &op, pathname); while (unlikely(!filp)) { /* trailing symlink */ struct path link = path; struct inode *linki = link.dentry->d_inode; @@ -2424,13 +2438,12 @@ reval: * just set LAST_BIND. */ nd.flags |= LOOKUP_PARENT; + nd.flags &= ~(LOOKUP_OPEN|LOOKUP_CREATE|LOOKUP_EXCL); error = __do_follow_link(&link, &nd, &cookie); - if (unlikely(error)) { + if (unlikely(error)) filp = ERR_PTR(error); - } else { - nd.flags &= ~LOOKUP_PARENT; - filp = do_last(&nd, &path, open_flag, acc_mode, mode, pathname); - } + else + filp = do_last(&nd, &path, &op, pathname); if (!IS_ERR(cookie) && linki->i_op->put_link) linki->i_op->put_link(link.dentry, &nd, cookie); path_put(&link); From 47c805dc2d2dff686962f5f0baa6bac2d703ba19 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Wed, 23 Feb 2011 17:44:09 -0500 Subject: [PATCH 644/706] switch do_filp_open() to struct open_flags take calculation of open_flags by open(2) arguments into new helper in fs/open.c, move filp_open() over there, have it and do_sys_open() use that helper, switch exec.c callers of do_filp_open() to explicit (and constant) struct open_flags. 
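[ editor's note: a worked example of what the new helper computes,
  traced through the build_open_flags() hunk below for a common
  create-and-append open; ACC_MODE(O_RDWR) expands to MAY_READ |
  MAY_WRITE:

	/* build_open_flags(O_RDWR | O_CREAT | O_APPEND, 0600, &op) yields:
	 *
	 *	op.open_flag = O_RDWR | O_CREAT | O_APPEND
	 *	op.mode	     = 0600  (kept, since O_CREAT is set)
	 *	op.acc_mode  = MAY_OPEN | MAY_READ | MAY_WRITE | MAY_APPEND
	 *	op.intent    = LOOKUP_OPEN | LOOKUP_CREATE
	 *	return	     = LOOKUP_FOLLOW  (no O_NOFOLLOW, no O_DIRECTORY)
	 */ ]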
Signed-off-by: Al Viro --- fs/exec.c | 18 ++++++---- fs/internal.h | 8 +++++ fs/namei.c | 88 +++++----------------------------------------- fs/open.c | 73 +++++++++++++++++++++++++++++++++++++- include/linux/fs.h | 2 -- 5 files changed, 101 insertions(+), 88 deletions(-) diff --git a/fs/exec.c b/fs/exec.c index 52a447d9b6ab..ba99e1abb1aa 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -115,13 +115,16 @@ SYSCALL_DEFINE1(uselib, const char __user *, library) struct file *file; char *tmp = getname(library); int error = PTR_ERR(tmp); + static const struct open_flags uselib_flags = { + .open_flag = O_LARGEFILE | O_RDONLY | __FMODE_EXEC, + .acc_mode = MAY_READ | MAY_EXEC | MAY_OPEN, + .intent = LOOKUP_OPEN + }; if (IS_ERR(tmp)) goto out; - file = do_filp_open(AT_FDCWD, tmp, - O_LARGEFILE | O_RDONLY | __FMODE_EXEC, 0, - MAY_READ | MAY_EXEC | MAY_OPEN); + file = do_filp_open(AT_FDCWD, tmp, &uselib_flags, LOOKUP_FOLLOW); putname(tmp); error = PTR_ERR(file); if (IS_ERR(file)) @@ -721,10 +724,13 @@ struct file *open_exec(const char *name) { struct file *file; int err; + static const struct open_flags open_exec_flags = { + .open_flag = O_LARGEFILE | O_RDONLY | __FMODE_EXEC, + .acc_mode = MAY_EXEC | MAY_OPEN, + .intent = LOOKUP_OPEN + }; - file = do_filp_open(AT_FDCWD, name, - O_LARGEFILE | O_RDONLY | __FMODE_EXEC, 0, - MAY_EXEC | MAY_OPEN); + file = do_filp_open(AT_FDCWD, name, &open_exec_flags, LOOKUP_FOLLOW); if (IS_ERR(file)) goto out; diff --git a/fs/internal.h b/fs/internal.h index 9b976b57d7fe..6fdbdf2c6047 100644 --- a/fs/internal.h +++ b/fs/internal.h @@ -106,6 +106,14 @@ extern void put_super(struct super_block *sb); struct nameidata; extern struct file *nameidata_to_filp(struct nameidata *); extern void release_open_intent(struct nameidata *); +struct open_flags { + int open_flag; + int mode; + int acc_mode; + int intent; +}; +extern struct file *do_filp_open(int dfd, const char *pathname, + const struct open_flags *op, int lookup_flags); /* * inode.c diff --git a/fs/namei.c b/fs/namei.c index 5e4206f45371..9c7fa946abe1 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -2169,13 +2169,6 @@ exit: return ERR_PTR(error); } -struct open_flags { - int open_flag; - int mode; - int acc_mode; - int intent; -}; - /* * Handle O_CREAT case for do_filp_open */ @@ -2305,74 +2298,28 @@ exit: * open_to_namei_flags() for more details. */ struct file *do_filp_open(int dfd, const char *pathname, - int open_flag, int mode, int acc_mode) + const struct open_flags *op, int flags) { struct file *filp; struct nameidata nd; int error; struct path path; int count = 0; - int flag = open_to_namei_flags(open_flag); - int flags = 0; - struct open_flags op; - - if (!(open_flag & O_CREAT)) - mode = 0; - - /* Must never be set by userspace */ - open_flag &= ~FMODE_NONOTIFY; - - /* - * O_SYNC is implemented as __O_SYNC|O_DSYNC. As many places only - * check for O_DSYNC if the need any syncing at all we enforce it's - * always set instead of having to deal with possibly weird behaviour - * for malicious applications setting only __O_SYNC. - */ - if (open_flag & __O_SYNC) - open_flag |= O_DSYNC; - - op.open_flag = open_flag; - - if (!acc_mode) - acc_mode = MAY_OPEN | ACC_MODE(open_flag); - - /* O_TRUNC implies we need access checks for write permissions */ - if (open_flag & O_TRUNC) - acc_mode |= MAY_WRITE; - - /* Allow the LSM permission hook to distinguish append - access from general write access. 
*/ - if (open_flag & O_APPEND) - acc_mode |= MAY_APPEND; - - op.acc_mode = acc_mode; - - op.intent = LOOKUP_OPEN; - if (open_flag & O_CREAT) { - op.intent |= LOOKUP_CREATE; - if (open_flag & O_EXCL) - op.intent |= LOOKUP_EXCL; - } - - if (open_flag & O_DIRECTORY) - flags |= LOOKUP_DIRECTORY; - if (!(open_flag & O_NOFOLLOW)) - flags |= LOOKUP_FOLLOW; filp = get_empty_filp(); if (!filp) return ERR_PTR(-ENFILE); - filp->f_flags = open_flag; + filp->f_flags = op->open_flag; nd.intent.open.file = filp; - nd.intent.open.flags = flag; - nd.intent.open.create_mode = mode; + nd.intent.open.flags = open_to_namei_flags(op->open_flag); + nd.intent.open.create_mode = op->mode; - if (open_flag & O_CREAT) + if (op->open_flag & O_CREAT) goto creat; /* !O_CREAT, simple open */ - error = do_path_lookup(dfd, pathname, flags | op.intent, &nd); + error = do_path_lookup(dfd, pathname, flags | op->intent, &nd); if (unlikely(error)) goto out_filp2; error = -ELOOP; @@ -2386,7 +2333,7 @@ struct file *do_filp_open(int dfd, const char *pathname, goto out_path2; } audit_inode(pathname, nd.path.dentry); - filp = finish_open(&nd, open_flag, acc_mode); + filp = finish_open(&nd, op->open_flag, op->acc_mode); out2: release_open_intent(&nd); return filp; @@ -2416,7 +2363,7 @@ reval: /* * We have the parent and last component. */ - filp = do_last(&nd, &path, &op, pathname); + filp = do_last(&nd, &path, op, pathname); while (unlikely(!filp)) { /* trailing symlink */ struct path link = path; struct inode *linki = link.dentry->d_inode; @@ -2443,7 +2390,7 @@ reval: if (unlikely(error)) filp = ERR_PTR(error); else - filp = do_last(&nd, &path, &op, pathname); + filp = do_last(&nd, &path, op, pathname); if (!IS_ERR(cookie) && linki->i_op->put_link) linki->i_op->put_link(link.dentry, &nd, cookie); path_put(&link); @@ -2465,23 +2412,6 @@ out_filp: goto out; } -/** - * filp_open - open file and return file pointer - * - * @filename: path to open - * @flags: open flags as per the open(2) second argument - * @mode: mode for the new file if O_CREAT is set, else ignored - * - * This is the helper to open a file from kernelspace if you really - * have to. But in generally you should not do this, so please move - * along, nothing to see here.. - */ -struct file *filp_open(const char *filename, int flags, int mode) -{ - return do_filp_open(AT_FDCWD, filename, flags, mode, 0); -} -EXPORT_SYMBOL(filp_open); - /** * lookup_create - lookup a dentry, creating it if it doesn't exist * @nd: nameidata info diff --git a/fs/open.c b/fs/open.c index b47aab39c057..d05e18c60bae 100644 --- a/fs/open.c +++ b/fs/open.c @@ -890,15 +890,86 @@ void fd_install(unsigned int fd, struct file *file) EXPORT_SYMBOL(fd_install); +static inline int build_open_flags(int flags, int mode, struct open_flags *op) +{ + int lookup_flags = 0; + int acc_mode; + + if (!(flags & O_CREAT)) + mode = 0; + op->mode = mode; + + /* Must never be set by userspace */ + flags &= ~FMODE_NONOTIFY; + + /* + * O_SYNC is implemented as __O_SYNC|O_DSYNC. As many places only + * check for O_DSYNC if the need any syncing at all we enforce it's + * always set instead of having to deal with possibly weird behaviour + * for malicious applications setting only __O_SYNC. + */ + if (flags & __O_SYNC) + flags |= O_DSYNC; + + op->open_flag = flags; + + acc_mode = MAY_OPEN | ACC_MODE(flags); + + /* O_TRUNC implies we need access checks for write permissions */ + if (flags & O_TRUNC) + acc_mode |= MAY_WRITE; + + /* Allow the LSM permission hook to distinguish append + access from general write access. 
*/ + if (flags & O_APPEND) + acc_mode |= MAY_APPEND; + + op->acc_mode = acc_mode; + + op->intent = LOOKUP_OPEN; + if (flags & O_CREAT) { + op->intent |= LOOKUP_CREATE; + if (flags & O_EXCL) + op->intent |= LOOKUP_EXCL; + } + + if (flags & O_DIRECTORY) + lookup_flags |= LOOKUP_DIRECTORY; + if (!(flags & O_NOFOLLOW)) + lookup_flags |= LOOKUP_FOLLOW; + return lookup_flags; +} + +/** + * filp_open - open file and return file pointer + * + * @filename: path to open + * @flags: open flags as per the open(2) second argument + * @mode: mode for the new file if O_CREAT is set, else ignored + * + * This is the helper to open a file from kernelspace if you really + * have to. But in generally you should not do this, so please move + * along, nothing to see here.. + */ +struct file *filp_open(const char *filename, int flags, int mode) +{ + struct open_flags op; + int lookup = build_open_flags(flags, mode, &op); + return do_filp_open(AT_FDCWD, filename, &op, lookup); +} +EXPORT_SYMBOL(filp_open); + long do_sys_open(int dfd, const char __user *filename, int flags, int mode) { + struct open_flags op; + int lookup = build_open_flags(flags, mode, &op); char *tmp = getname(filename); int fd = PTR_ERR(tmp); if (!IS_ERR(tmp)) { fd = get_unused_fd_flags(flags); if (fd >= 0) { - struct file *f = do_filp_open(dfd, tmp, flags, mode, 0); + struct file *f = do_filp_open(dfd, tmp, &op, lookup); if (IS_ERR(f)) { put_unused_fd(fd); fd = PTR_ERR(f); diff --git a/include/linux/fs.h b/include/linux/fs.h index e38b50a4b9d2..9c75714f92c1 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2205,8 +2205,6 @@ extern struct file *create_read_pipe(struct file *f, int flags); extern struct file *create_write_pipe(int flags); extern void free_write_pipe(struct file *); -extern struct file *do_filp_open(int dfd, const char *pathname, - int open_flag, int mode, int acc_mode); extern int may_open(struct path *, int, int); extern int kernel_read(struct file *, loff_t, char *, unsigned long); From 13aab428a73d3200b9283b61b7fdf5713181ac66 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Wed, 23 Feb 2011 17:54:08 -0500 Subject: [PATCH 645/706] separate -ESTALE/-ECHILD retries in do_filp_open() from real work new helper: path_openat(). Does what do_filp_open() does, except that it tries only the walk mode (RCU/normal/force revalidation) it had been told to. Both create and non-create branches are using path_lookupat() now. Fixed the double audit_inode() in non-create branch. Signed-off-by: Al Viro --- fs/namei.c | 49 ++++++++++++++++++++----------------------------- 1 file changed, 20 insertions(+), 29 deletions(-) diff --git a/fs/namei.c b/fs/namei.c index 9c7fa946abe1..01a17dd2f151 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -2292,19 +2292,14 @@ exit: return ERR_PTR(error); } -/* - * Note that the low bits of the passed in "open_flag" - * are not the same as in the local variable "flag". See - * open_to_namei_flags() for more details. 
- */ -struct file *do_filp_open(int dfd, const char *pathname, +static struct file *path_openat(int dfd, const char *pathname, const struct open_flags *op, int flags) { struct file *filp; struct nameidata nd; - int error; struct path path; int count = 0; + int error; filp = get_empty_filp(); if (!filp) @@ -2319,42 +2314,27 @@ struct file *do_filp_open(int dfd, const char *pathname, goto creat; /* !O_CREAT, simple open */ - error = do_path_lookup(dfd, pathname, flags | op->intent, &nd); + error = path_lookupat(dfd, pathname, flags | op->intent, &nd); if (unlikely(error)) - goto out_filp2; + goto out_filp; error = -ELOOP; if (!(nd.flags & LOOKUP_FOLLOW)) { if (nd.inode->i_op->follow_link) - goto out_path2; + goto out_path; } error = -ENOTDIR; if (nd.flags & LOOKUP_DIRECTORY) { if (!nd.inode->i_op->lookup) - goto out_path2; + goto out_path; } audit_inode(pathname, nd.path.dentry); filp = finish_open(&nd, op->open_flag, op->acc_mode); -out2: release_open_intent(&nd); return filp; -out_path2: - path_put(&nd.path); -out_filp2: - filp = ERR_PTR(error); - goto out2; - creat: /* OK, have to create the file. Find the parent. */ - error = path_lookupat(dfd, pathname, - LOOKUP_PARENT | LOOKUP_RCU | flags, &nd); - if (unlikely(error == -ECHILD)) - error = path_lookupat(dfd, pathname, LOOKUP_PARENT | flags, &nd); - if (unlikely(error == -ESTALE)) { -reval: - flags |= LOOKUP_REVAL; - error = path_lookupat(dfd, pathname, LOOKUP_PARENT | flags, &nd); - } + error = path_lookupat(dfd, pathname, LOOKUP_PARENT | flags, &nd); if (unlikely(error)) goto out_filp; if (unlikely(!audit_dummy_context())) @@ -2398,8 +2378,6 @@ reval: out: if (nd.root.mnt) path_put(&nd.root); - if (filp == ERR_PTR(-ESTALE) && !(flags & LOOKUP_REVAL)) - goto reval; release_open_intent(&nd); return filp; @@ -2412,6 +2390,19 @@ out_filp: goto out; } +struct file *do_filp_open(int dfd, const char *pathname, + const struct open_flags *op, int flags) +{ + struct file *filp; + + filp = path_openat(dfd, pathname, op, flags | LOOKUP_RCU); + if (unlikely(filp == ERR_PTR(-ECHILD))) + filp = path_openat(dfd, pathname, op, flags); + if (unlikely(filp == ERR_PTR(-ESTALE))) + filp = path_openat(dfd, pathname, op, flags | LOOKUP_REVAL); + return filp; +} + /** * lookup_create - lookup a dentry, creating it if it doesn't exist * @nd: nameidata info From 7bc055d1d524f209bf49d8b9cb220712dd7df4ed Mon Sep 17 00:00:00 2001 From: Al Viro Date: Wed, 23 Feb 2011 19:41:31 -0500 Subject: [PATCH 646/706] kill out_dput: in link_path_walk() Signed-off-by: Al Viro --- fs/namei.c | 15 ++++----------- 1 file changed, 4 insertions(+), 11 deletions(-) diff --git a/fs/namei.c b/fs/namei.c index 01a17dd2f151..fea36369dc87 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -1407,22 +1407,19 @@ static int link_path_walk(const char *name, struct nameidata *nd) err = do_lookup(nd, &this, &next, &inode); if (err) break; - err = -ENOENT; - if (!inode) - goto out_dput; - if (inode->i_op->follow_link) { + if (inode && inode->i_op->follow_link) { err = do_follow_link(inode, &next, nd); if (err) goto return_err; nd->inode = nd->path.dentry->d_inode; - err = -ENOENT; - if (!nd->inode) - break; } else { path_to_nameidata(&next, nd); nd->inode = inode; } + err = -ENOENT; + if (!nd->inode) + break; err = -ENOTDIR; if (!nd->inode->i_op->lookup) break; @@ -1472,10 +1469,6 @@ lookup_parent: nd->last = this; nd->last_type = type; return 0; -out_dput: - if (!(nd->flags & LOOKUP_RCU)) - path_put_conditional(&next, nd); - break; } if (!(nd->flags & LOOKUP_RCU)) path_put(&nd->path); From 
9856fa1b281eccdc9f8d94d716e96818c675e78e Mon Sep 17 00:00:00 2001 From: Al Viro Date: Fri, 4 Mar 2011 14:22:06 -0500 Subject: [PATCH 647/706] pull handling of . and .. into inlined helper getting LOOKUP_RCU checks out of link_path_walk()... Signed-off-by: Al Viro --- fs/namei.c | 30 ++++++++++++++++-------------- 1 file changed, 16 insertions(+), 14 deletions(-) diff --git a/fs/namei.c b/fs/namei.c index fea36369dc87..d29f91e8ff3d 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -1318,6 +1318,18 @@ static inline int may_lookup(struct nameidata *nd) return exec_permission(nd->inode, 0); } +static inline int handle_dots(struct nameidata *nd, int type) +{ + if (type == LAST_DOTDOT) { + if (nd->flags & LOOKUP_RCU) { + if (follow_dotdot_rcu(nd)) + return -ECHILD; + } else + follow_dotdot(nd); + } + return 0; +} + /* * Name resolution. * This is the basic name resolution function, turning a pathname into @@ -1393,13 +1405,8 @@ static int link_path_walk(const char *name, struct nameidata *nd) * parent relationships. */ if (unlikely(type != LAST_NORM)) { - if (type == LAST_DOTDOT) { - if (nd->flags & LOOKUP_RCU) { - if (follow_dotdot_rcu(nd)) - return -ECHILD; - } else - follow_dotdot(nd); - } + if (handle_dots(nd, type)) + return -ECHILD; continue; } @@ -1434,13 +1441,8 @@ last_component: if (lookup_flags & LOOKUP_PARENT) goto lookup_parent; if (unlikely(type != LAST_NORM)) { - if (type == LAST_DOTDOT) { - if (nd->flags & LOOKUP_RCU) { - if (follow_dotdot_rcu(nd)) - return -ECHILD; - } else - follow_dotdot(nd); - } + if (handle_dots(nd, type)) + return -ECHILD; return 0; } err = do_lookup(nd, &this, &next, &inode); From 4455ca6223cc59cbc0a75f4be8bce9e84cc0d6b8 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Fri, 4 Mar 2011 14:28:10 -0500 Subject: [PATCH 648/706] clear RCU on all failure exits from link_path_walk() Signed-off-by: Al Viro --- fs/namei.c | 30 ++++++++++++++++-------------- 1 file changed, 16 insertions(+), 14 deletions(-) diff --git a/fs/namei.c b/fs/namei.c index d29f91e8ff3d..f09887a45831 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -1405,8 +1405,9 @@ static int link_path_walk(const char *name, struct nameidata *nd) * parent relationships. */ if (unlikely(type != LAST_NORM)) { - if (handle_dots(nd, type)) - return -ECHILD; + err = handle_dots(nd, type); + if (err) + goto return_err; continue; } @@ -1441,8 +1442,9 @@ last_component: if (lookup_flags & LOOKUP_PARENT) goto lookup_parent; if (unlikely(type != LAST_NORM)) { - if (handle_dots(nd, type)) - return -ECHILD; + err = handle_dots(nd, type); + if (err) + goto return_err; return 0; } err = do_lookup(nd, &this, &next, &inode); @@ -1475,6 +1477,12 @@ lookup_parent: if (!(nd->flags & LOOKUP_RCU)) path_put(&nd->path); return_err: + if (nd->flags & LOOKUP_RCU) { + nd->flags &= ~LOOKUP_RCU; + nd->root.mnt = NULL; + rcu_read_unlock(); + br_read_unlock(vfsmount_lock); + } return err; } @@ -1585,16 +1593,10 @@ static int path_lookupat(int dfd, const char *name, retval = link_path_walk(name, nd); if (nd->flags & LOOKUP_RCU) { - /* RCU dangling. Cancel it. 
*/ - if (!retval) { - if (nameidata_drop_rcu_last(nd)) - retval = -ECHILD; - } else { - nd->flags &= ~LOOKUP_RCU; - nd->root.mnt = NULL; - rcu_read_unlock(); - br_read_unlock(vfsmount_lock); - } + /* went all way through without dropping RCU */ + BUG_ON(retval); + if (nameidata_drop_rcu_last(nd)) + retval = -ECHILD; } if (!retval) From ef7562d5283a91da3ba5c14de3221f47b7f08823 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Fri, 4 Mar 2011 14:35:59 -0500 Subject: [PATCH 649/706] make handle_dots() leave RCU mode on error Signed-off-by: Al Viro --- fs/namei.c | 23 ++++++++++++----------- 1 file changed, 12 insertions(+), 11 deletions(-) diff --git a/fs/namei.c b/fs/namei.c index f09887a45831..ea14bfb04785 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -1052,7 +1052,7 @@ static int follow_dotdot_rcu(struct nameidata *nd) seq = read_seqcount_begin(&parent->d_seq); if (read_seqcount_retry(&old->d_seq, nd->seq)) - return -ECHILD; + goto failed; inode = parent->d_inode; nd->path.dentry = parent; nd->seq = seq; @@ -1065,8 +1065,14 @@ static int follow_dotdot_rcu(struct nameidata *nd) } __follow_mount_rcu(nd, &nd->path, &inode, true); nd->inode = inode; - return 0; + +failed: + nd->flags &= ~LOOKUP_RCU; + nd->root.mnt = NULL; + rcu_read_unlock(); + br_read_unlock(vfsmount_lock); + return -ECHILD; } /* @@ -1405,9 +1411,8 @@ static int link_path_walk(const char *name, struct nameidata *nd) * parent relationships. */ if (unlikely(type != LAST_NORM)) { - err = handle_dots(nd, type); - if (err) - goto return_err; + if (handle_dots(nd, type)) + return -ECHILD; continue; } @@ -1441,12 +1446,8 @@ last_component: nd->flags &= lookup_flags | ~LOOKUP_CONTINUE; if (lookup_flags & LOOKUP_PARENT) goto lookup_parent; - if (unlikely(type != LAST_NORM)) { - err = handle_dots(nd, type); - if (err) - goto return_err; - return 0; - } + if (unlikely(type != LAST_NORM)) + return handle_dots(nd, type); err = do_lookup(nd, &this, &next, &inode); if (err) break; From a7472baba22dd5d68580f528374f93421b33667e Mon Sep 17 00:00:00 2001 From: Al Viro Date: Fri, 4 Mar 2011 14:39:30 -0500 Subject: [PATCH 650/706] make nameidata_dentry_drop_rcu_maybe() always leave RCU mode Now we have do_follow_link() guaranteed to leave without dangling RCU and the next step will get LOOKUP_RCU logics completely out of link_path_walk(). Signed-off-by: Al Viro --- fs/namei.c | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/fs/namei.c b/fs/namei.c index ea14bfb04785..53bba7c1a520 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -498,8 +498,15 @@ err_root: /* Try to drop out of rcu-walk mode if we were in it, otherwise do nothing. 
*/ static inline int nameidata_dentry_drop_rcu_maybe(struct nameidata *nd, struct dentry *dentry) { - if (nd->flags & LOOKUP_RCU) - return nameidata_dentry_drop_rcu(nd, dentry); + if (nd->flags & LOOKUP_RCU) { + if (unlikely(nameidata_dentry_drop_rcu(nd, dentry))) { + nd->flags &= ~LOOKUP_RCU; + nd->root.mnt = NULL; + rcu_read_unlock(); + br_read_unlock(vfsmount_lock); + return -ECHILD; + } + } return 0; } @@ -1424,7 +1431,7 @@ static int link_path_walk(const char *name, struct nameidata *nd) if (inode && inode->i_op->follow_link) { err = do_follow_link(inode, &next, nd); if (err) - goto return_err; + return err; nd->inode = nd->path.dentry->d_inode; } else { path_to_nameidata(&next, nd); @@ -1455,7 +1462,7 @@ last_component: (lookup_flags & LOOKUP_FOLLOW)) { err = do_follow_link(inode, &next, nd); if (err) - goto return_err; + return err; nd->inode = nd->path.dentry->d_inode; } else { path_to_nameidata(&next, nd); @@ -1477,7 +1484,6 @@ lookup_parent: } if (!(nd->flags & LOOKUP_RCU)) path_put(&nd->path); -return_err: if (nd->flags & LOOKUP_RCU) { nd->flags &= ~LOOKUP_RCU; nd->root.mnt = NULL; From 951361f954596bd134d4270df834f47d151f98a6 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Fri, 4 Mar 2011 14:44:37 -0500 Subject: [PATCH 651/706] get rid of the last LOOKUP_RCU dependencies in link_path_walk() New helper: terminate_walk(). An error has happened during pathname resolution and we either drop nd->path or terminate RCU, depending the mode we had been in. After that, nd is essentially empty. Switch link_path_walk() to using that for cleanup. Now the top-level logics in link_path_walk() is back to sanity. RCU dependencies are in the lower-level functions. Signed-off-by: Al Viro --- fs/namei.c | 21 +++++++++++++-------- 1 file changed, 13 insertions(+), 8 deletions(-) diff --git a/fs/namei.c b/fs/namei.c index 53bba7c1a520..85f6e39b4034 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -1343,6 +1343,18 @@ static inline int handle_dots(struct nameidata *nd, int type) return 0; } +static void terminate_walk(struct nameidata *nd) +{ + if (!(nd->flags & LOOKUP_RCU)) { + path_put(&nd->path); + } else { + nd->flags &= ~LOOKUP_RCU; + nd->root.mnt = NULL; + rcu_read_unlock(); + br_read_unlock(vfsmount_lock); + } +} + /* * Name resolution. * This is the basic name resolution function, turning a pathname into @@ -1482,14 +1494,7 @@ lookup_parent: nd->last_type = type; return 0; } - if (!(nd->flags & LOOKUP_RCU)) - path_put(&nd->path); - if (nd->flags & LOOKUP_RCU) { - nd->flags &= ~LOOKUP_RCU; - nd->root.mnt = NULL; - rcu_read_unlock(); - br_read_unlock(vfsmount_lock); - } + terminate_walk(nd); return err; } From 70e9b3571107b88674cd55ae4bed33f76261e7d3 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sat, 5 Mar 2011 21:12:22 -0500 Subject: [PATCH 652/706] get rid of nd->file Don't stash the struct file * used as starting point of walk in nameidata; pass file ** to path_init() instead. 
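In rough outline (a sketch condensed from the diff below, error handling
elided), the caller side becomes:

        struct file *base = NULL;       /* pinned starting point, if any */

        retval = path_init(dfd, name, flags, nd, &base);
        /* ... link_path_walk(), etc. ... */
        if (base)
                fput(base);     /* dropped once, in the outermost frame */

so the reference to the starting file lives on the caller's stack instead
of in the long-lived nameidata.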
Signed-off-by: Al Viro --- fs/namei.c | 15 +++++++-------- include/linux/namei.h | 1 - 2 files changed, 7 insertions(+), 9 deletions(-) diff --git a/fs/namei.c b/fs/namei.c index 85f6e39b4034..a260a306daf5 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -1498,7 +1498,8 @@ lookup_parent: return err; } -static int path_init(int dfd, const char *name, unsigned int flags, struct nameidata *nd) +static int path_init(int dfd, const char *name, unsigned int flags, + struct nameidata *nd, struct file **fp) { int retval = 0; int fput_needed; @@ -1508,7 +1509,6 @@ static int path_init(int dfd, const char *name, unsigned int flags, struct namei nd->flags = flags | LOOKUP_JUMPED; nd->depth = 0; nd->root.mnt = NULL; - nd->file = NULL; if (*name=='/') { if (flags & LOOKUP_RCU) { @@ -1557,7 +1557,7 @@ static int path_init(int dfd, const char *name, unsigned int flags, struct namei nd->path = file->f_path; if (flags & LOOKUP_RCU) { if (fput_needed) - nd->file = file; + *fp = file; nd->seq = __read_seqcount_begin(&nd->path.dentry->d_seq); br_read_lock(vfsmount_lock); rcu_read_lock(); @@ -1580,6 +1580,7 @@ out_fail: static int path_lookupat(int dfd, const char *name, unsigned int flags, struct nameidata *nd) { + struct file *base = NULL; int retval; /* @@ -1596,7 +1597,7 @@ static int path_lookupat(int dfd, const char *name, * be handled by restarting a traditional ref-walk (which will always * be able to complete). */ - retval = path_init(dfd, name, flags, nd); + retval = path_init(dfd, name, flags, nd, &base); if (unlikely(retval)) return retval; @@ -1614,10 +1615,8 @@ static int path_lookupat(int dfd, const char *name, if (!retval) retval = handle_reval_path(nd); - if (nd->file) { - fput(nd->file); - nd->file = NULL; - } + if (base) + fput(base); if (nd->root.mnt) { path_put(&nd->root); diff --git a/include/linux/namei.h b/include/linux/namei.h index 265378a707bd..72ffd62ac736 100644 --- a/include/linux/namei.h +++ b/include/linux/namei.h @@ -19,7 +19,6 @@ struct nameidata { struct path path; struct qstr last; struct path root; - struct file *file; struct inode *inode; /* path.dentry.d_inode */ unsigned int flags; unsigned seq; From fe2d35ff0d18a2c93993b0d7d46f846ff4331b72 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sat, 5 Mar 2011 22:58:25 -0500 Subject: [PATCH 653/706] switch non-create side of open() to use of do_last() Instead of path_lookupat() doing trailing symlink resolution, use the same scheme as on the O_CREAT side. Walk with LOOKUP_PARENT, then (in do_last()) look the final component up, then either open it or return error or, if it's a symlink, give the symlink back to path_openat() to be resolved there. The really messy complication here is RCU. We don't want to drop out of RCU mode before the final lookup, since we don't want to bounce parent directory ->d_count without a good reason. Result is _not_ pretty; later in the series we'll clean it up. For now we are roughly back where we'd been before the revert done by Nick's series - top-level logics of path_openat() is cleaned up, do_last() does actual opening, symlink resolution is done uniformly. 
Signed-off-by: Al Viro --- fs/namei.c | 100 +++++++++++++++++++++++++++++++++++------------------ 1 file changed, 67 insertions(+), 33 deletions(-) diff --git a/fs/namei.c b/fs/namei.c index a260a306daf5..9595b4a55c39 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -2178,13 +2178,14 @@ exit: } /* - * Handle O_CREAT case for do_filp_open + * Handle the last step of open() */ static struct file *do_last(struct nameidata *nd, struct path *path, const struct open_flags *op, const char *pathname) { struct dentry *dir = nd->path.dentry; struct file *filp; + struct inode *inode; int error; nd->flags &= ~LOOKUP_PARENT; @@ -2192,17 +2193,27 @@ static struct file *do_last(struct nameidata *nd, struct path *path, switch (nd->last_type) { case LAST_DOTDOT: - follow_dotdot(nd); - dir = nd->path.dentry; case LAST_DOT: + error = handle_dots(nd, nd->last_type); + if (error) + return ERR_PTR(error); /* fallthrough */ case LAST_ROOT: + if (nd->flags & LOOKUP_RCU) { + if (nameidata_drop_rcu_last(nd)) + return ERR_PTR(-ECHILD); + } error = handle_reval_path(nd); if (error) goto exit; - error = -EISDIR; - goto exit; + audit_inode(pathname, nd->path.dentry); + if (op->open_flag & O_CREAT) { + error = -EISDIR; + goto exit; + } + goto ok; case LAST_BIND: + /* can't be RCU mode here */ error = handle_reval_path(nd); if (error) goto exit; @@ -2210,6 +2221,51 @@ static struct file *do_last(struct nameidata *nd, struct path *path, goto ok; } + if (!(op->open_flag & O_CREAT)) { + if (nd->last.name[nd->last.len]) + nd->flags |= LOOKUP_FOLLOW | LOOKUP_DIRECTORY; + /* we _can_ be in RCU mode here */ + error = do_lookup(nd, &nd->last, path, &inode); + if (error) { + terminate_walk(nd); + return ERR_PTR(error); + } + if (!inode) { + path_to_nameidata(path, nd); + terminate_walk(nd); + return ERR_PTR(-ENOENT); + } + if (unlikely(inode->i_op->follow_link)) { + /* We drop rcu-walk here */ + if (nameidata_dentry_drop_rcu_maybe(nd, path->dentry)) + return ERR_PTR(-ECHILD); + return NULL; + } + path_to_nameidata(path, nd); + nd->inode = inode; + /* sayonara */ + if (nd->flags & LOOKUP_RCU) { + if (nameidata_drop_rcu_last(nd)) + return ERR_PTR(-ECHILD); + } + + error = -ENOTDIR; + if (nd->flags & LOOKUP_DIRECTORY) { + if (!inode->i_op->lookup) + goto exit; + } + audit_inode(pathname, nd->path.dentry); + goto ok; + } + + /* create side of things */ + + if (nd->flags & LOOKUP_RCU) { + if (nameidata_drop_rcu_last(nd)) + return ERR_PTR(-ECHILD); + } + + audit_inode(pathname, dir); error = -EISDIR; /* trailing slashes? 
*/ if (nd->last.name[nd->last.len]) @@ -2303,6 +2359,7 @@ exit: static struct file *path_openat(int dfd, const char *pathname, const struct open_flags *op, int flags) { + struct file *base = NULL; struct file *filp; struct nameidata nd; struct path path; @@ -2318,39 +2375,15 @@ static struct file *path_openat(int dfd, const char *pathname, nd.intent.open.flags = open_to_namei_flags(op->open_flag); nd.intent.open.create_mode = op->mode; - if (op->open_flag & O_CREAT) - goto creat; - - /* !O_CREAT, simple open */ - error = path_lookupat(dfd, pathname, flags | op->intent, &nd); + error = path_init(dfd, pathname, flags | LOOKUP_PARENT, &nd, &base); if (unlikely(error)) goto out_filp; - error = -ELOOP; - if (!(nd.flags & LOOKUP_FOLLOW)) { - if (nd.inode->i_op->follow_link) - goto out_path; - } - error = -ENOTDIR; - if (nd.flags & LOOKUP_DIRECTORY) { - if (!nd.inode->i_op->lookup) - goto out_path; - } - audit_inode(pathname, nd.path.dentry); - filp = finish_open(&nd, op->open_flag, op->acc_mode); - release_open_intent(&nd); - return filp; -creat: - /* OK, have to create the file. Find the parent. */ - error = path_lookupat(dfd, pathname, LOOKUP_PARENT | flags, &nd); + current->total_link_count = 0; + error = link_path_walk(pathname, &nd); if (unlikely(error)) goto out_filp; - if (unlikely(!audit_dummy_context())) - audit_inode(pathname, nd.path.dentry); - /* - * We have the parent and last component. - */ filp = do_last(&nd, &path, op, pathname); while (unlikely(!filp)) { /* trailing symlink */ struct path link = path; @@ -2386,12 +2419,13 @@ creat: out: if (nd.root.mnt) path_put(&nd.root); + if (base) + fput(base); release_open_intent(&nd); return filp; exit_dput: path_put_conditional(&path, &nd); -out_path: path_put(&nd.path); out_filp: filp = ERR_PTR(error); From 6a96ba54418be740303765c0f52be028573cb99a Mon Sep 17 00:00:00 2001 From: Al Viro Date: Mon, 7 Mar 2011 23:49:20 -0500 Subject: [PATCH 654/706] kill __lookup_one_len() only one caller left Signed-off-by: Al Viro --- fs/namei.c | 41 +++++++++++++++-------------------------- 1 file changed, 15 insertions(+), 26 deletions(-) diff --git a/fs/namei.c b/fs/namei.c index 9595b4a55c39..f6f3ef47bc74 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -1759,28 +1759,6 @@ static struct dentry *lookup_hash(struct nameidata *nd) return __lookup_hash(&nd->last, nd->path.dentry, nd); } -static int __lookup_one_len(const char *name, struct qstr *this, - struct dentry *base, int len) -{ - unsigned long hash; - unsigned int c; - - this->name = name; - this->len = len; - if (!len) - return -EACCES; - - hash = init_name_hash(); - while (len--) { - c = *(const unsigned char *)name++; - if (c == '/' || c == '\0') - return -EACCES; - hash = partial_name_hash(c, hash); - } - this->hash = end_name_hash(hash); - return 0; -} - /** * lookup_one_len - filesystem helper to lookup single pathname component * @name: pathname component to lookup @@ -1794,14 +1772,25 @@ static int __lookup_one_len(const char *name, struct qstr *this, */ struct dentry *lookup_one_len(const char *name, struct dentry *base, int len) { - int err; struct qstr this; + unsigned long hash; + unsigned int c; WARN_ON_ONCE(!mutex_is_locked(&base->d_inode->i_mutex)); - err = __lookup_one_len(name, &this, base, len); - if (err) - return ERR_PTR(err); + this.name = name; + this.len = len; + if (!len) + return ERR_PTR(-EACCES); + + hash = init_name_hash(); + while (len--) { + c = *(const unsigned char *)name++; + if (c == '/' || c == '\0') + return ERR_PTR(-EACCES); + hash = partial_name_hash(c, hash); + } 
+ this.hash = end_name_hash(hash); return __lookup_hash(&this, base, NULL); } From 5a202bcd75bbd2397136397961babbd8463416af Mon Sep 17 00:00:00 2001 From: Al Viro Date: Tue, 8 Mar 2011 14:17:44 -0500 Subject: [PATCH 655/706] sanitize pathname component hash calculation Lift it to lookup_one_len() and link_path_walk() resp. into the same place where we calculated default hash function of the same name. Signed-off-by: Al Viro --- fs/namei.c | 42 +++++++++++++++++++----------------------- 1 file changed, 19 insertions(+), 23 deletions(-) diff --git a/fs/namei.c b/fs/namei.c index f6f3ef47bc74..d1a5dfeaf999 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -1216,16 +1216,6 @@ static int do_lookup(struct nameidata *nd, struct qstr *name, struct inode *dir; int err; - /* - * See if the low-level filesystem might want - * to use its own hash.. - */ - if (unlikely(parent->d_flags & DCACHE_OP_HASH)) { - err = parent->d_op->d_hash(parent, nd->inode, name); - if (err < 0) - return err; - } - /* * Rename seqlock is not required here because in the off chance * of a false negative due to a concurrent rename, we're going to @@ -1414,8 +1404,16 @@ static int link_path_walk(const char *name, struct nameidata *nd) case 1: type = LAST_DOT; } - if (likely(type == LAST_NORM)) + if (likely(type == LAST_NORM)) { + struct dentry *parent = nd->path.dentry; nd->flags &= ~LOOKUP_JUMPED; + if (unlikely(parent->d_flags & DCACHE_OP_HASH)) { + err = parent->d_op->d_hash(parent, nd->inode, + &this); + if (err < 0) + break; + } + } /* remove trailing slashes? */ if (!c) @@ -1722,17 +1720,6 @@ static struct dentry *__lookup_hash(struct qstr *name, if (err) return ERR_PTR(err); - /* - * See if the low-level filesystem might want - * to use its own hash.. - */ - if (base->d_flags & DCACHE_OP_HASH) { - err = base->d_op->d_hash(base, inode, name); - dentry = ERR_PTR(err); - if (err < 0) - goto out; - } - /* * Don't bother with __d_lookup: callers are for creat as * well as unlink, so a lot of the time it would cost @@ -1745,7 +1732,7 @@ static struct dentry *__lookup_hash(struct qstr *name, if (!dentry) dentry = d_alloc_and_lookup(base, name, nd); -out: + return dentry; } @@ -1791,6 +1778,15 @@ struct dentry *lookup_one_len(const char *name, struct dentry *base, int len) hash = partial_name_hash(c, hash); } this.hash = end_name_hash(hash); + /* + * See if the low-level filesystem might want + * to use its own hash.. 
+ */ + if (base->d_flags & DCACHE_OP_HASH) { + int err = base->d_op->d_hash(base, base->d_inode, &this); + if (err < 0) + return ERR_PTR(err); + } return __lookup_hash(&this, base, NULL); } From 0f9d1a10c341020617e5b1c7f9c16f6a070438ec Mon Sep 17 00:00:00 2001 From: Al Viro Date: Wed, 9 Mar 2011 00:13:14 -0500 Subject: [PATCH 656/706] expand finish_open() in its only caller Signed-off-by: Al Viro --- fs/namei.c | 90 +++++++++++++++++++++++------------------------------- 1 file changed, 38 insertions(+), 52 deletions(-) diff --git a/fs/namei.c b/fs/namei.c index d1a5dfeaf999..1f561dc495a1 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -2111,57 +2111,6 @@ static int open_will_truncate(int flag, struct inode *inode) return (flag & O_TRUNC); } -static struct file *finish_open(struct nameidata *nd, - int open_flag, int acc_mode) -{ - struct file *filp; - int will_truncate; - int error; - - will_truncate = open_will_truncate(open_flag, nd->path.dentry->d_inode); - if (will_truncate) { - error = mnt_want_write(nd->path.mnt); - if (error) - goto exit; - } - error = may_open(&nd->path, acc_mode, open_flag); - if (error) { - if (will_truncate) - mnt_drop_write(nd->path.mnt); - goto exit; - } - filp = nameidata_to_filp(nd); - if (!IS_ERR(filp)) { - error = ima_file_check(filp, acc_mode); - if (error) { - fput(filp); - filp = ERR_PTR(error); - } - } - if (!IS_ERR(filp)) { - if (will_truncate) { - error = handle_truncate(filp); - if (error) { - fput(filp); - filp = ERR_PTR(error); - } - } - } - /* - * It is now safe to drop the mnt write - * because the filp has had a write taken - * on its behalf. - */ - if (will_truncate) - mnt_drop_write(nd->path.mnt); - path_put(&nd->path); - return filp; - -exit: - path_put(&nd->path); - return ERR_PTR(error); -} - /* * Handle the last step of open() */ @@ -2169,6 +2118,7 @@ static struct file *do_last(struct nameidata *nd, struct path *path, const struct open_flags *op, const char *pathname) { struct dentry *dir = nd->path.dentry; + int will_truncate; struct file *filp; struct inode *inode; int error; @@ -2329,7 +2279,43 @@ static struct file *do_last(struct nameidata *nd, struct path *path, if (S_ISDIR(nd->inode->i_mode)) goto exit; ok: - filp = finish_open(nd, op->open_flag, op->acc_mode); + will_truncate = open_will_truncate(op->open_flag, nd->path.dentry->d_inode); + if (will_truncate) { + error = mnt_want_write(nd->path.mnt); + if (error) + goto exit; + } + error = may_open(&nd->path, op->acc_mode, op->open_flag); + if (error) { + if (will_truncate) + mnt_drop_write(nd->path.mnt); + goto exit; + } + filp = nameidata_to_filp(nd); + if (!IS_ERR(filp)) { + error = ima_file_check(filp, op->acc_mode); + if (error) { + fput(filp); + filp = ERR_PTR(error); + } + } + if (!IS_ERR(filp)) { + if (will_truncate) { + error = handle_truncate(filp); + if (error) { + fput(filp); + filp = ERR_PTR(error); + } + } + } + /* + * It is now safe to drop the mnt write + * because the filp has had a write taken + * on its behalf. 
+	 */
+	if (will_truncate)
+		mnt_drop_write(nd->path.mnt);
+	path_put(&nd->path);
 	return filp;

 exit_mutex_unlock:

From 9b44f1b3928b6f41532c9a1dc9a6fc665989ad5b Mon Sep 17 00:00:00 2001
From: Al Viro
Date: Wed, 9 Mar 2011 00:17:27 -0500
Subject: [PATCH 657/706] move may_open() from __open_namei_create() to do_last()

Signed-off-by: Al Viro
---
 fs/namei.c | 12 +++++++-----
 1 file changed, 7 insertions(+), 5 deletions(-)

diff --git a/fs/namei.c b/fs/namei.c
index 1f561dc495a1..def63e7c058d 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -2069,11 +2069,7 @@ out_unlock:
 	mutex_unlock(&dir->d_inode->i_mutex);
 	dput(nd->path.dentry);
 	nd->path.dentry = path->dentry;
-
-	if (error)
-		return error;
-	/* Don't check for write permission, don't truncate */
-	return may_open(&nd->path, 0, open_flag & ~O_TRUNC);
+	return error;
 }

 /*
@@ -2239,6 +2235,12 @@ static struct file *do_last(struct nameidata *nd, struct path *path,
 			mnt_drop_write(nd->path.mnt);
 			goto exit;
 		}
+		/* Don't check for write permission, don't truncate */
+		error = may_open(&nd->path, 0, op->open_flag & ~O_TRUNC);
+		if (error) {
+			mnt_drop_write(nd->path.mnt);
+			goto exit;
+		}
 		filp = nameidata_to_filp(nd);
 		mnt_drop_write(nd->path.mnt);
 		path_put(&nd->path);

From ca344a894b41a133dab07dfbbdf652c053f6658c Mon Sep 17 00:00:00 2001
From: Al Viro
Date: Wed, 9 Mar 2011 00:36:45 -0500
Subject: [PATCH 658/706] do_last: unify may_open() call and everything after it

We have a bunch of diverging codepaths in do_last(); some of them
converge, but the case of having to create a new file duplicates a large
part of the common tail of the rest and exits separately.  Massage them
so that they could be merged.

Signed-off-by: Al Viro
---
 fs/namei.c | 59 ++++++++++++++++++++----------------------------
 1 file changed, 22 insertions(+), 37 deletions(-)

diff --git a/fs/namei.c b/fs/namei.c
index def63e7c058d..63844776484c 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -2114,7 +2114,10 @@ static struct file *do_last(struct nameidata *nd, struct path *path,
 		     const struct open_flags *op, const char *pathname)
 {
 	struct dentry *dir = nd->path.dentry;
+	int open_flag = op->open_flag;
 	int will_truncate;
+	int want_write = 0;
+	int skip_perm = 0;
 	struct file *filp;
 	struct inode *inode;
 	int error;
@@ -2138,7 +2141,7 @@ static struct file *do_last(struct nameidata *nd, struct path *path,
 		if (error)
 			goto exit;
 		audit_inode(pathname, nd->path.dentry);
-		if (op->open_flag & O_CREAT) {
+		if (open_flag & O_CREAT) {
 			error = -EISDIR;
 			goto exit;
 		}
@@ -2152,7 +2155,7 @@ static struct file *do_last(struct nameidata *nd, struct path *path,
 		goto ok;
 	}

-	if (!(op->open_flag & O_CREAT)) {
+	if (!(open_flag & O_CREAT)) {
 		if (nd->last.name[nd->last.len])
 			nd->flags |= LOOKUP_FOLLOW | LOOKUP_DIRECTORY;
 		/* we _can_ be in RCU mode here */
@@ -2230,28 +2233,15 @@ static struct file *do_last(struct nameidata *nd, struct path *path,
 		error = mnt_want_write(nd->path.mnt);
 		if (error)
 			goto exit_mutex_unlock;
-		error = __open_namei_create(nd, path, op->open_flag, op->mode);
-		if (error) {
-			mnt_drop_write(nd->path.mnt);
+		want_write = 1;
+		will_truncate = 0;
+		error = __open_namei_create(nd, path, open_flag, op->mode);
+		if (error)
 			goto exit;
-		}
 		/* Don't check for write permission, don't truncate */
-		error = may_open(&nd->path, 0, op->open_flag & ~O_TRUNC);
-		if (error) {
-			mnt_drop_write(nd->path.mnt);
-			goto exit;
-		}
-		filp = nameidata_to_filp(nd);
-		mnt_drop_write(nd->path.mnt);
-		path_put(&nd->path);
-		if (!IS_ERR(filp)) {
-			error = ima_file_check(filp, op->acc_mode);
-			if (error) {
-				fput(filp);
-				filp =
ERR_PTR(error); - } - } - return filp; + open_flag &= ~O_TRUNC; + skip_perm = 1; + goto common; } /* @@ -2261,7 +2251,7 @@ static struct file *do_last(struct nameidata *nd, struct path *path, audit_inode(pathname, path->dentry); error = -EEXIST; - if (op->open_flag & O_EXCL) + if (open_flag & O_EXCL) goto exit_dput; error = follow_managed(path, nd->flags); @@ -2281,18 +2271,17 @@ static struct file *do_last(struct nameidata *nd, struct path *path, if (S_ISDIR(nd->inode->i_mode)) goto exit; ok: - will_truncate = open_will_truncate(op->open_flag, nd->path.dentry->d_inode); + will_truncate = open_will_truncate(open_flag, nd->path.dentry->d_inode); if (will_truncate) { error = mnt_want_write(nd->path.mnt); if (error) goto exit; + want_write = 1; } - error = may_open(&nd->path, op->acc_mode, op->open_flag); - if (error) { - if (will_truncate) - mnt_drop_write(nd->path.mnt); +common: + error = may_open(&nd->path, skip_perm ? 0 : op->acc_mode, open_flag); + if (error) goto exit; - } filp = nameidata_to_filp(nd); if (!IS_ERR(filp)) { error = ima_file_check(filp, op->acc_mode); @@ -2310,12 +2299,8 @@ ok: } } } - /* - * It is now safe to drop the mnt write - * because the filp has had a write taken - * on its behalf. - */ - if (will_truncate) +out: + if (want_write) mnt_drop_write(nd->path.mnt); path_put(&nd->path); return filp; @@ -2325,8 +2310,8 @@ exit_mutex_unlock: exit_dput: path_put_conditional(path, nd); exit: - path_put(&nd->path); - return ERR_PTR(error); + filp = ERR_PTR(error); + goto out; } static struct file *path_openat(int dfd, const char *pathname, From 6c0d46c493217cf48999b3f8808910ae534aa085 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Wed, 9 Mar 2011 00:59:59 -0500 Subject: [PATCH 659/706] fold __open_namei_create() and open_will_truncate() into do_last() ... and clean up a bit more Signed-off-by: Al Viro --- fs/namei.c | 74 +++++++++++++++++++----------------------------------- 1 file changed, 26 insertions(+), 48 deletions(-) diff --git a/fs/namei.c b/fs/namei.c index 63844776484c..441f1106de08 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -2048,30 +2048,6 @@ static int handle_truncate(struct file *filp) return error; } -/* - * Be careful about ever adding any more callers of this - * function. Its flags must be in the namei format, not - * what get passed to sys_open(). - */ -static int __open_namei_create(struct nameidata *nd, struct path *path, - int open_flag, int mode) -{ - int error; - struct dentry *dir = nd->path.dentry; - - if (!IS_POSIXACL(dir->d_inode)) - mode &= ~current_umask(); - error = security_path_mknod(&nd->path, path->dentry, mode, 0); - if (error) - goto out_unlock; - error = vfs_create(dir->d_inode, path->dentry, mode, nd); -out_unlock: - mutex_unlock(&dir->d_inode->i_mutex); - dput(nd->path.dentry); - nd->path.dentry = path->dentry; - return error; -} - /* * Note that while the flag value (low two bits) for sys_open means: * 00 - read-only @@ -2096,17 +2072,6 @@ static inline int open_to_namei_flags(int flag) return flag; } -static int open_will_truncate(int flag, struct inode *inode) -{ - /* - * We'll never write to the fs underlying - * a device file. 
- */ - if (special_file(inode->i_mode)) - return 0; - return (flag & O_TRUNC); -} - /* * Handle the last step of open() */ @@ -2114,8 +2079,9 @@ static struct file *do_last(struct nameidata *nd, struct path *path, const struct open_flags *op, const char *pathname) { struct dentry *dir = nd->path.dentry; + struct dentry *dentry; int open_flag = op->open_flag; - int will_truncate; + int will_truncate = open_flag & O_TRUNC; int want_write = 0; int skip_perm = 0; struct file *filp; @@ -2207,25 +2173,29 @@ static struct file *do_last(struct nameidata *nd, struct path *path, mutex_lock(&dir->d_inode->i_mutex); - path->dentry = lookup_hash(nd); - path->mnt = nd->path.mnt; - - error = PTR_ERR(path->dentry); - if (IS_ERR(path->dentry)) { + dentry = lookup_hash(nd); + error = PTR_ERR(dentry); + if (IS_ERR(dentry)) { mutex_unlock(&dir->d_inode->i_mutex); goto exit; } + path->dentry = dentry; + path->mnt = nd->path.mnt; + if (IS_ERR(nd->intent.open.file)) { error = PTR_ERR(nd->intent.open.file); goto exit_mutex_unlock; } /* Negative dentry, just create the file */ - if (!path->dentry->d_inode) { + if (!dentry->d_inode) { + int mode = op->mode; + if (!IS_POSIXACL(dir->d_inode)) + mode &= ~current_umask(); /* * This write is needed to ensure that a - * ro->rw transition does not occur between + * rw->ro transition does not occur between * the time when the file is created and when * a permanent write count is taken through * the 'struct file' in nameidata_to_filp(). @@ -2234,13 +2204,19 @@ static struct file *do_last(struct nameidata *nd, struct path *path, if (error) goto exit_mutex_unlock; want_write = 1; - will_truncate = 0; - error = __open_namei_create(nd, path, open_flag, op->mode); - if (error) - goto exit; /* Don't check for write permission, don't truncate */ open_flag &= ~O_TRUNC; + will_truncate = 0; skip_perm = 1; + error = security_path_mknod(&nd->path, dentry, mode, 0); + if (error) + goto exit_mutex_unlock; + error = vfs_create(dir->d_inode, dentry, mode, nd); + if (error) + goto exit_mutex_unlock; + mutex_unlock(&dir->d_inode->i_mutex); + dput(nd->path.dentry); + nd->path.dentry = dentry; goto common; } @@ -2271,7 +2247,9 @@ static struct file *do_last(struct nameidata *nd, struct path *path, if (S_ISDIR(nd->inode->i_mode)) goto exit; ok: - will_truncate = open_will_truncate(open_flag, nd->path.dentry->d_inode); + if (!S_ISREG(nd->inode->i_mode)) + will_truncate = 0; + if (will_truncate) { error = mnt_want_write(nd->path.mnt); if (error) From f374ed5fa8afed8590deaae5dc147422e0e1a6d9 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Wed, 9 Mar 2011 01:34:45 -0500 Subject: [PATCH 660/706] do_last: kill a rudiment of old ->d_revalidate() workaround There used to be time when ->d_revalidate() couldn't return an error. So intents code had lookup_instantiate_filp() stash ERR_PTR(error) in nd->intent.open.filp and had it checked after lookup_hash(), to catch the otherwise silent failures. That had been introduced by commit 4af4c52f34606bdaab6930a845550c6fb02078a4. These days ->d_revalidate() can and does propagate errors back to callers explicitly, so this check isn't needed anymore. 
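For reference, a ->d_revalidate() instance these days reports trouble
directly; schematically (hypothetical filesystem code, not part of this
series):

        static int foo_d_revalidate(struct dentry *dentry, struct nameidata *nd)
        {
                if (nd && (nd->flags & LOOKUP_RCU))
                        return -ECHILD; /* punt rcu-walk back to ref-walk */
                if (foo_is_stale(dentry))       /* foo_is_stale() is made up */
                        return 0;       /* invalid; caller will drop it */
                return 1;               /* still valid */
        }

so there is nothing left for lookup_instantiate_filp() to smuggle out
through nd->intent.open.file.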
Signed-off-by: Al Viro
---
 fs/namei.c | 5 -----
 1 file changed, 5 deletions(-)

diff --git a/fs/namei.c b/fs/namei.c
index 441f1106de08..6972e761286b 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -2183,11 +2183,6 @@ static struct file *do_last(struct nameidata *nd, struct path *path,
 	path->dentry = dentry;
 	path->mnt = nd->path.mnt;

-	if (IS_ERR(nd->intent.open.file)) {
-		error = PTR_ERR(nd->intent.open.file);
-		goto exit_mutex_unlock;
-	}
-
 	/* Negative dentry, just create the file */
 	if (!dentry->d_inode) {
 		int mode = op->mode;

From 40b39136f07279fdc868a36cba050f4e84ce0ace Mon Sep 17 00:00:00 2001
From: Al Viro
Date: Wed, 9 Mar 2011 16:22:18 -0500
Subject: [PATCH 661/706] path_openat: clean ELOOP handling a bit

Signed-off-by: Al Viro
---
 fs/namei.c | 14 ++++++--------
 1 file changed, 6 insertions(+), 8 deletions(-)

diff --git a/fs/namei.c b/fs/namei.c
index 6972e761286b..ca9a06a65704 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -2320,11 +2320,12 @@ static struct file *path_openat(int dfd, const char *pathname,
 		struct path link = path;
 		struct inode *linki = link.dentry->d_inode;
 		void *cookie;
-		error = -ELOOP;
-		if (!(nd.flags & LOOKUP_FOLLOW))
-			goto exit_dput;
-		if (count++ == 32)
-			goto exit_dput;
+		if (!(nd.flags & LOOKUP_FOLLOW) || count++ == 32) {
+			path_put_conditional(&path, &nd);
+			path_put(&nd.path);
+			filp = ERR_PTR(-ELOOP);
+			break;
+		}
 		/*
 		 * This is subtle. Instead of calling do_follow_link() we do
 		 * the thing by hands. The reason is that this way we have zero
@@ -2355,9 +2356,6 @@ out:
 	release_open_intent(&nd);
 	return filp;

-exit_dput:
-	path_put_conditional(&path, &nd);
-	path_put(&nd.path);
 out_filp:
 	filp = ERR_PTR(error);
 	goto out;

From 5a18fff2090c3af830d699c8ccb230498a1e37e5 Mon Sep 17 00:00:00 2001
From: Al Viro
Date: Fri, 11 Mar 2011 04:44:53 -0500
Subject: [PATCH 662/706] untangle do_lookup()

That thing has devolved into a rat's nest of gotos; sane use of
unlikely() gets rid of that horror and gives a much more readable
structure:

* make a fast attempt to find a dentry; false negatives are OK.
  In RCU mode if everything went fine, we are done, otherwise just drop
  out of RCU.  If we'd done (RCU) ->d_revalidate() and it had not refused
  outright (i.e. didn't give us -ECHILD), remember its result.

* now we are not in RCU mode and hopefully have a dentry.  If we do
  not, lock parent, do full d_lookup() and if that has not found anything,
  allocate and call ->lookup().  If we'd done that ->lookup(), remember
  that dentry is good and we don't need to revalidate it.

* now we have a dentry.  If it has ->d_revalidate() and we can't skip
  it, call it.

* hopefully dentry is good; if not, either fail (in case of error) or
  try to invalidate it.  If d_invalidate() has succeeded, drop it and
  retry everything as if original attempt had not found a dentry.

* now we can finish it up - deal with mountpoint crossing and automount.
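Stripped to control flow (condensed from the diff below), the new shape is:

        if (nd->flags & LOOKUP_RCU) {
                dentry = __d_lookup_rcu(parent, name, &seq, inode);
                /* ... seqcount check, optional d_revalidate() ... */
                if (likely(__follow_mount_rcu(nd, path, inode, false)))
                        return 0;       /* stayed in rcu-walk */
        unlazy:
                /* drop to ref-walk, keeping dentry if we found one */
        } else {
                dentry = __d_lookup(parent, name);
        }
        retry:
        if (unlikely(!dentry)) {
                /* lock parent; d_lookup(), else d_alloc_and_lookup() */
        }
        if (unlikely(dentry->d_flags & DCACHE_OP_REVALIDATE) && need_reval)
                status = d_revalidate(dentry, nd);
        if (unlikely(status <= 0)) {
                /* fail on hard error, or d_invalidate() and goto retry */
        }
        /* follow_managed(), set *inode, return 0 */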
Signed-off-by: Al Viro --- fs/namei.c | 141 +++++++++++++++++++++-------------------------------- 1 file changed, 56 insertions(+), 85 deletions(-) diff --git a/fs/namei.c b/fs/namei.c index ca9a06a65704..0bebd13e5cb7 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -589,29 +589,6 @@ do_revalidate(struct dentry *dentry, struct nameidata *nd) return dentry; } -static inline struct dentry * -do_revalidate_rcu(struct dentry *dentry, struct nameidata *nd) -{ - int status = d_revalidate(dentry, nd); - if (likely(status > 0)) - return dentry; - if (status == -ECHILD) { - if (nameidata_dentry_drop_rcu(nd, dentry)) - return ERR_PTR(-ECHILD); - return do_revalidate(dentry, nd); - } - if (status < 0) - return ERR_PTR(status); - /* Don't d_invalidate in rcu-walk mode */ - if (nameidata_dentry_drop_rcu(nd, dentry)) - return ERR_PTR(-ECHILD); - if (!d_invalidate(dentry)) { - dput(dentry); - dentry = NULL; - } - return dentry; -} - /* * handle_reval_path - force revalidation of a dentry * @@ -1213,7 +1190,8 @@ static int do_lookup(struct nameidata *nd, struct qstr *name, { struct vfsmount *mnt = nd->path.mnt; struct dentry *dentry, *parent = nd->path.dentry; - struct inode *dir; + int need_reval = 1; + int status = 1; int err; /* @@ -1223,48 +1201,74 @@ static int do_lookup(struct nameidata *nd, struct qstr *name, */ if (nd->flags & LOOKUP_RCU) { unsigned seq; - *inode = nd->inode; dentry = __d_lookup_rcu(parent, name, &seq, inode); - if (!dentry) { - if (nameidata_drop_rcu(nd)) - return -ECHILD; - goto need_lookup; - } + if (!dentry) + goto unlazy; + /* Memory barrier in read_seqcount_begin of child is enough */ if (__read_seqcount_retry(&parent->d_seq, nd->seq)) return -ECHILD; - nd->seq = seq; + if (unlikely(dentry->d_flags & DCACHE_OP_REVALIDATE)) { - dentry = do_revalidate_rcu(dentry, nd); - if (!dentry) - goto need_lookup; - if (IS_ERR(dentry)) - goto fail; - if (!(nd->flags & LOOKUP_RCU)) - goto done; + status = d_revalidate(dentry, nd); + if (unlikely(status <= 0)) { + if (status != -ECHILD) + need_reval = 0; + goto unlazy; + } } path->mnt = mnt; path->dentry = dentry; if (likely(__follow_mount_rcu(nd, path, inode, false))) return 0; - if (nameidata_drop_rcu(nd)) - return -ECHILD; - /* fallthru */ +unlazy: + if (dentry) { + if (nameidata_dentry_drop_rcu(nd, dentry)) + return -ECHILD; + } else { + if (nameidata_drop_rcu(nd)) + return -ECHILD; + } + } else { + dentry = __d_lookup(parent, name); } - dentry = __d_lookup(parent, name); - if (!dentry) - goto need_lookup; -found: - if (unlikely(dentry->d_flags & DCACHE_OP_REVALIDATE)) { - dentry = do_revalidate(dentry, nd); - if (!dentry) - goto need_lookup; - if (IS_ERR(dentry)) - goto fail; + +retry: + if (unlikely(!dentry)) { + struct inode *dir = parent->d_inode; + BUG_ON(nd->inode != dir); + + mutex_lock(&dir->i_mutex); + dentry = d_lookup(parent, name); + if (likely(!dentry)) { + dentry = d_alloc_and_lookup(parent, name, nd); + if (IS_ERR(dentry)) { + mutex_unlock(&dir->i_mutex); + return PTR_ERR(dentry); + } + /* known good */ + need_reval = 0; + status = 1; + } + mutex_unlock(&dir->i_mutex); } -done: + if (unlikely(dentry->d_flags & DCACHE_OP_REVALIDATE) && need_reval) + status = d_revalidate(dentry, nd); + if (unlikely(status <= 0)) { + if (status < 0) { + dput(dentry); + return status; + } + if (!d_invalidate(dentry)) { + dput(dentry); + dentry = NULL; + need_reval = 1; + goto retry; + } + } + path->mnt = mnt; path->dentry = dentry; err = follow_managed(path, nd->flags); @@ -1274,39 +1278,6 @@ done: } *inode = path->dentry->d_inode; return 0; - 
-need_lookup:
-	dir = parent->d_inode;
-	BUG_ON(nd->inode != dir);
-
-	mutex_lock(&dir->i_mutex);
-	/*
-	 * First re-do the cached lookup just in case it was created
-	 * while we waited for the directory semaphore, or the first
-	 * lookup failed due to an unrelated rename.
-	 *
-	 * This could use version numbering or similar to avoid unnecessary
-	 * cache lookups, but then we'd have to do the first lookup in the
-	 * non-racy way. However in the common case here, everything should
-	 * be hot in cache, so would it be a big win?
-	 */
-	dentry = d_lookup(parent, name);
-	if (likely(!dentry)) {
-		dentry = d_alloc_and_lookup(parent, name, nd);
-		mutex_unlock(&dir->i_mutex);
-		if (IS_ERR(dentry))
-			goto fail;
-		goto done;
-	}
-	/*
-	 * Uhhuh! Nasty case: the cache was re-populated while
-	 * we waited on the semaphore. Need to revalidate.
-	 */
-	mutex_unlock(&dir->i_mutex);
-	goto found;
-
-fail:
-	return PTR_ERR(dentry);
 }

 static inline int may_lookup(struct nameidata *nd)

From 5b6ca027d85b7438c84b78a54ccdc2e53f2909cd Mon Sep 17 00:00:00 2001
From: Al Viro
Date: Wed, 9 Mar 2011 23:04:47 -0500
Subject: [PATCH 663/706] reduce vfs_path_lookup() to do_path_lookup()

New lookup flag: LOOKUP_ROOT.  nd->root is set (and held) by caller,
path_init() starts walking from that place and all pathname resolution
machinery never drops nd->root if that flag is set.

That turns vfs_path_lookup() into a special case of do_path_lookup()
*and* gets us down to 3 callers of link_path_walk(), making it finally
feasible to rip the handling of trailing symlink out of link_path_walk().
That will not only simplify the living hell out of it, but make life
much simpler for unionfs merge.  Trailing symlink handling will become
iterative, which is a good thing for stack footprint in a lot of
situations as well.
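The caller-visible effect, schematically (the pathname here is
illustrative):

        struct nameidata nd;
        int err;

        /* resolve "etc/motd" with <mnt, dentry> acting as "/"; the walk
         * never climbs above that root */
        err = vfs_path_lookup(dentry, mnt, "etc/motd", LOOKUP_FOLLOW, &nd);

and vfs_path_lookup() itself is now just do_path_lookup() with nd.root
preset and LOOKUP_ROOT or'ed into the flags.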
Signed-off-by: Al Viro --- fs/namei.c | 95 ++++++++++++++++++++----------------------- include/linux/namei.h | 1 + 2 files changed, 44 insertions(+), 52 deletions(-) diff --git a/fs/namei.c b/fs/namei.c index 0bebd13e5cb7..8ee7785d5642 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -401,9 +401,11 @@ static int nameidata_drop_rcu(struct nameidata *nd) { struct fs_struct *fs = current->fs; struct dentry *dentry = nd->path.dentry; + int want_root = 0; BUG_ON(!(nd->flags & LOOKUP_RCU)); - if (nd->root.mnt) { + if (nd->root.mnt && !(nd->flags & LOOKUP_ROOT)) { + want_root = 1; spin_lock(&fs->lock); if (nd->root.mnt != fs->root.mnt || nd->root.dentry != fs->root.dentry) @@ -414,7 +416,7 @@ static int nameidata_drop_rcu(struct nameidata *nd) goto err; BUG_ON(nd->inode != dentry->d_inode); spin_unlock(&dentry->d_lock); - if (nd->root.mnt) { + if (want_root) { path_get(&nd->root); spin_unlock(&fs->lock); } @@ -427,7 +429,7 @@ static int nameidata_drop_rcu(struct nameidata *nd) err: spin_unlock(&dentry->d_lock); err_root: - if (nd->root.mnt) + if (want_root) spin_unlock(&fs->lock); return -ECHILD; } @@ -454,9 +456,11 @@ static int nameidata_dentry_drop_rcu(struct nameidata *nd, struct dentry *dentry { struct fs_struct *fs = current->fs; struct dentry *parent = nd->path.dentry; + int want_root = 0; BUG_ON(!(nd->flags & LOOKUP_RCU)); - if (nd->root.mnt) { + if (nd->root.mnt && !(nd->flags & LOOKUP_ROOT)) { + want_root = 1; spin_lock(&fs->lock); if (nd->root.mnt != fs->root.mnt || nd->root.dentry != fs->root.dentry) @@ -476,7 +480,7 @@ static int nameidata_dentry_drop_rcu(struct nameidata *nd, struct dentry *dentry parent->d_count++; spin_unlock(&dentry->d_lock); spin_unlock(&parent->d_lock); - if (nd->root.mnt) { + if (want_root) { path_get(&nd->root); spin_unlock(&fs->lock); } @@ -490,7 +494,7 @@ err: spin_unlock(&dentry->d_lock); spin_unlock(&parent->d_lock); err_root: - if (nd->root.mnt) + if (want_root) spin_unlock(&fs->lock); return -ECHILD; } @@ -501,7 +505,8 @@ static inline int nameidata_dentry_drop_rcu_maybe(struct nameidata *nd, struct d if (nd->flags & LOOKUP_RCU) { if (unlikely(nameidata_dentry_drop_rcu(nd, dentry))) { nd->flags &= ~LOOKUP_RCU; - nd->root.mnt = NULL; + if (!(nd->flags & LOOKUP_ROOT)) + nd->root.mnt = NULL; rcu_read_unlock(); br_read_unlock(vfsmount_lock); return -ECHILD; @@ -525,7 +530,8 @@ static int nameidata_drop_rcu_last(struct nameidata *nd) BUG_ON(!(nd->flags & LOOKUP_RCU)); nd->flags &= ~LOOKUP_RCU; - nd->root.mnt = NULL; + if (!(nd->flags & LOOKUP_ROOT)) + nd->root.mnt = NULL; spin_lock(&dentry->d_lock); if (!__d_rcu_to_refcount(dentry, nd->seq)) goto err_unlock; @@ -1053,7 +1059,8 @@ static int follow_dotdot_rcu(struct nameidata *nd) failed: nd->flags &= ~LOOKUP_RCU; - nd->root.mnt = NULL; + if (!(nd->flags & LOOKUP_ROOT)) + nd->root.mnt = NULL; rcu_read_unlock(); br_read_unlock(vfsmount_lock); return -ECHILD; @@ -1310,7 +1317,8 @@ static void terminate_walk(struct nameidata *nd) path_put(&nd->path); } else { nd->flags &= ~LOOKUP_RCU; - nd->root.mnt = NULL; + if (!(nd->flags & LOOKUP_ROOT)) + nd->root.mnt = NULL; rcu_read_unlock(); br_read_unlock(vfsmount_lock); } @@ -1477,6 +1485,25 @@ static int path_init(int dfd, const char *name, unsigned int flags, nd->last_type = LAST_ROOT; /* if there are only slashes... 
*/ nd->flags = flags | LOOKUP_JUMPED; nd->depth = 0; + if (flags & LOOKUP_ROOT) { + struct inode *inode = nd->root.dentry->d_inode; + if (!inode->i_op->lookup) + return -ENOTDIR; + retval = inode_permission(inode, MAY_EXEC); + if (retval) + return retval; + nd->path = nd->root; + nd->inode = inode; + if (flags & LOOKUP_RCU) { + br_read_lock(vfsmount_lock); + rcu_read_lock(); + nd->seq = __read_seqcount_begin(&nd->path.dentry->d_seq); + } else { + path_get(&nd->path); + } + return 0; + } + nd->root.mnt = NULL; if (*name=='/') { @@ -1587,7 +1614,7 @@ static int path_lookupat(int dfd, const char *name, if (base) fput(base); - if (nd->root.mnt) { + if (nd->root.mnt && !(nd->flags & LOOKUP_ROOT)) { path_put(&nd->root); nd->root.mnt = NULL; } @@ -1638,46 +1665,10 @@ int vfs_path_lookup(struct dentry *dentry, struct vfsmount *mnt, const char *name, unsigned int flags, struct nameidata *nd) { - int result; - - /* same as do_path_lookup */ - nd->last_type = LAST_ROOT; - nd->flags = flags | LOOKUP_JUMPED; - nd->depth = 0; - - nd->path.dentry = dentry; - nd->path.mnt = mnt; - path_get(&nd->path); - nd->root = nd->path; - path_get(&nd->root); - nd->inode = nd->path.dentry->d_inode; - - current->total_link_count = 0; - - result = link_path_walk(name, nd); - if (!result) - result = handle_reval_path(nd); - if (result == -ESTALE) { - /* nd->path had been dropped */ - current->total_link_count = 0; - nd->path.dentry = dentry; - nd->path.mnt = mnt; - nd->inode = dentry->d_inode; - path_get(&nd->path); - nd->flags = flags | LOOKUP_JUMPED | LOOKUP_REVAL; - - result = link_path_walk(name, nd); - if (!result) - result = handle_reval_path(nd); - } - if (unlikely(!result && !audit_dummy_context() && nd->path.dentry && - nd->inode)) - audit_inode(name, nd->path.dentry); - - path_put(&nd->root); - nd->root.mnt = NULL; - - return result; + nd->root.dentry = dentry; + nd->root.mnt = mnt; + /* the first argument of do_path_lookup() is ignored with LOOKUP_ROOT */ + return do_path_lookup(AT_FDCWD, name, flags | LOOKUP_ROOT, nd); } static struct dentry *__lookup_hash(struct qstr *name, @@ -2320,7 +2311,7 @@ static struct file *path_openat(int dfd, const char *pathname, path_put(&link); } out: - if (nd.root.mnt) + if (nd.root.mnt && !(nd.flags & LOOKUP_ROOT)) path_put(&nd.root); if (base) fput(base); diff --git a/include/linux/namei.h b/include/linux/namei.h index 72ffd62ac736..83cd6e5cd7dc 100644 --- a/include/linux/namei.h +++ b/include/linux/namei.h @@ -63,6 +63,7 @@ enum {LAST_NORM, LAST_ROOT, LAST_DOT, LAST_DOTDOT, LAST_BIND}; #define LOOKUP_RENAME_TARGET 0x0800 #define LOOKUP_JUMPED 0x1000 +#define LOOKUP_ROOT 0x2000 extern int user_path_at(int, const char __user *, unsigned, struct path *); From 73d049a40fc6269189c4e2ba6792cb5dd054883c Mon Sep 17 00:00:00 2001 From: Al Viro Date: Fri, 11 Mar 2011 12:08:24 -0500 Subject: [PATCH 664/706] open-style analog of vfs_path_lookup() new function: file_open_root(dentry, mnt, name, flags) opens the file vfs_path_lookup would arrive to. Note that name can be empty; in that case the usual requirement that dentry should be a directory is lifted. open-coded equivalents switched to it, may_open() got down exactly one caller and became static. 
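Typical use, mirroring the converted callers below (the mount and pathname
here are illustrative):

        struct file *file;

        file = file_open_root(mnt->mnt_root, mnt, "exports", O_RDONLY);
        if (IS_ERR(file))
                return PTR_ERR(file);

O_CREAT is rejected outright; an empty name opens the starting point
itself, with the usual "must be a directory" requirement lifted.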
Signed-off-by: Al Viro --- arch/um/drivers/mconsole_kern.c | 21 +-------- fs/internal.h | 2 + fs/namei.c | 80 +++++++++++++++++++++------------ fs/nfsctl.c | 21 ++------- fs/open.c | 14 ++++++ include/linux/fs.h | 4 +- kernel/sysctl_binary.c | 19 +------- 7 files changed, 77 insertions(+), 84 deletions(-) diff --git a/arch/um/drivers/mconsole_kern.c b/arch/um/drivers/mconsole_kern.c index 975613b23dcf..c70e047eed72 100644 --- a/arch/um/drivers/mconsole_kern.c +++ b/arch/um/drivers/mconsole_kern.c @@ -124,35 +124,18 @@ void mconsole_log(struct mc_request *req) #if 0 void mconsole_proc(struct mc_request *req) { - struct nameidata nd; struct vfsmount *mnt = current->nsproxy->pid_ns->proc_mnt; struct file *file; - int n, err; + int n; char *ptr = req->request.data, *buf; mm_segment_t old_fs = get_fs(); ptr += strlen("proc"); ptr = skip_spaces(ptr); - err = vfs_path_lookup(mnt->mnt_root, mnt, ptr, LOOKUP_FOLLOW, &nd); - if (err) { - mconsole_reply(req, "Failed to look up file", 1, 0); - goto out; - } - - err = may_open(&nd.path, MAY_READ, O_RDONLY); - if (result) { - mconsole_reply(req, "Failed to open file", 1, 0); - path_put(&nd.path); - goto out; - } - - file = dentry_open(nd.path.dentry, nd.path.mnt, O_RDONLY, - current_cred()); - err = PTR_ERR(file); + file = file_open_root(mnt->mnt_root, mnt, ptr, O_RDONLY); if (IS_ERR(file)) { mconsole_reply(req, "Failed to open file", 1, 0); - path_put(&nd.path); goto out; } diff --git a/fs/internal.h b/fs/internal.h index 6fdbdf2c6047..52abc5287f50 100644 --- a/fs/internal.h +++ b/fs/internal.h @@ -114,6 +114,8 @@ struct open_flags { }; extern struct file *do_filp_open(int dfd, const char *pathname, const struct open_flags *op, int lookup_flags); +extern struct file *do_file_open_root(struct dentry *, struct vfsmount *, + const char *, const struct open_flags *, int lookup_flags); /* * inode.c diff --git a/fs/namei.c b/fs/namei.c index 8ee7785d5642..abc8d2df121c 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -1487,11 +1487,13 @@ static int path_init(int dfd, const char *name, unsigned int flags, nd->depth = 0; if (flags & LOOKUP_ROOT) { struct inode *inode = nd->root.dentry->d_inode; - if (!inode->i_op->lookup) - return -ENOTDIR; - retval = inode_permission(inode, MAY_EXEC); - if (retval) - return retval; + if (*name) { + if (!inode->i_op->lookup) + return -ENOTDIR; + retval = inode_permission(inode, MAY_EXEC); + if (retval) + return retval; + } nd->path = nd->root; nd->inode = inode; if (flags & LOOKUP_RCU) { @@ -1937,7 +1939,7 @@ int vfs_create(struct inode *dir, struct dentry *dentry, int mode, return error; } -int may_open(struct path *path, int acc_mode, int flag) +static int may_open(struct path *path, int acc_mode, int flag) { struct dentry *dentry = path->dentry; struct inode *inode = dentry->d_inode; @@ -2250,11 +2252,10 @@ exit: } static struct file *path_openat(int dfd, const char *pathname, - const struct open_flags *op, int flags) + struct nameidata *nd, const struct open_flags *op, int flags) { struct file *base = NULL; struct file *filp; - struct nameidata nd; struct path path; int count = 0; int error; @@ -2264,27 +2265,27 @@ static struct file *path_openat(int dfd, const char *pathname, return ERR_PTR(-ENFILE); filp->f_flags = op->open_flag; - nd.intent.open.file = filp; - nd.intent.open.flags = open_to_namei_flags(op->open_flag); - nd.intent.open.create_mode = op->mode; + nd->intent.open.file = filp; + nd->intent.open.flags = open_to_namei_flags(op->open_flag); + nd->intent.open.create_mode = op->mode; - error = path_init(dfd, pathname, 
flags | LOOKUP_PARENT, &nd, &base); + error = path_init(dfd, pathname, flags | LOOKUP_PARENT, nd, &base); if (unlikely(error)) goto out_filp; current->total_link_count = 0; - error = link_path_walk(pathname, &nd); + error = link_path_walk(pathname, nd); if (unlikely(error)) goto out_filp; - filp = do_last(&nd, &path, op, pathname); + filp = do_last(nd, &path, op, pathname); while (unlikely(!filp)) { /* trailing symlink */ struct path link = path; struct inode *linki = link.dentry->d_inode; void *cookie; - if (!(nd.flags & LOOKUP_FOLLOW) || count++ == 32) { - path_put_conditional(&path, &nd); - path_put(&nd.path); + if (!(nd->flags & LOOKUP_FOLLOW) || count++ == 32) { + path_put_conditional(&path, nd); + path_put(&nd->path); filp = ERR_PTR(-ELOOP); break; } @@ -2299,23 +2300,23 @@ static struct file *path_openat(int dfd, const char *pathname, * have to putname() it when we are done. Procfs-like symlinks * just set LAST_BIND. */ - nd.flags |= LOOKUP_PARENT; - nd.flags &= ~(LOOKUP_OPEN|LOOKUP_CREATE|LOOKUP_EXCL); - error = __do_follow_link(&link, &nd, &cookie); + nd->flags |= LOOKUP_PARENT; + nd->flags &= ~(LOOKUP_OPEN|LOOKUP_CREATE|LOOKUP_EXCL); + error = __do_follow_link(&link, nd, &cookie); if (unlikely(error)) filp = ERR_PTR(error); else - filp = do_last(&nd, &path, op, pathname); + filp = do_last(nd, &path, op, pathname); if (!IS_ERR(cookie) && linki->i_op->put_link) - linki->i_op->put_link(link.dentry, &nd, cookie); + linki->i_op->put_link(link.dentry, nd, cookie); path_put(&link); } out: - if (nd.root.mnt && !(nd.flags & LOOKUP_ROOT)) - path_put(&nd.root); + if (nd->root.mnt && !(nd->flags & LOOKUP_ROOT)) + path_put(&nd->root); if (base) fput(base); - release_open_intent(&nd); + release_open_intent(nd); return filp; out_filp: @@ -2326,16 +2327,39 @@ out_filp: struct file *do_filp_open(int dfd, const char *pathname, const struct open_flags *op, int flags) { + struct nameidata nd; struct file *filp; - filp = path_openat(dfd, pathname, op, flags | LOOKUP_RCU); + filp = path_openat(dfd, pathname, &nd, op, flags | LOOKUP_RCU); if (unlikely(filp == ERR_PTR(-ECHILD))) - filp = path_openat(dfd, pathname, op, flags); + filp = path_openat(dfd, pathname, &nd, op, flags); if (unlikely(filp == ERR_PTR(-ESTALE))) - filp = path_openat(dfd, pathname, op, flags | LOOKUP_REVAL); + filp = path_openat(dfd, pathname, &nd, op, flags | LOOKUP_REVAL); return filp; } +struct file *do_file_open_root(struct dentry *dentry, struct vfsmount *mnt, + const char *name, const struct open_flags *op, int flags) +{ + struct nameidata nd; + struct file *file; + + nd.root.mnt = mnt; + nd.root.dentry = dentry; + + flags |= LOOKUP_ROOT; + + if (dentry->d_inode->i_op->follow_link) + return ERR_PTR(-ELOOP); + + file = path_openat(-1, name, &nd, op, flags | LOOKUP_RCU); + if (unlikely(file == ERR_PTR(-ECHILD))) + file = path_openat(-1, name, &nd, op, flags); + if (unlikely(file == ERR_PTR(-ESTALE))) + file = path_openat(-1, name, &nd, op, flags | LOOKUP_REVAL); + return file; +} + /** * lookup_create - lookup a dentry, creating it if it doesn't exist * @nd: nameidata info diff --git a/fs/nfsctl.c b/fs/nfsctl.c index bf9cbd242ddd..124e8fcb0dd6 100644 --- a/fs/nfsctl.c +++ b/fs/nfsctl.c @@ -22,30 +22,17 @@ static struct file *do_open(char *name, int flags) { - struct nameidata nd; struct vfsmount *mnt; - int error; + struct file *file; mnt = do_kern_mount("nfsd", 0, "nfsd", NULL); if (IS_ERR(mnt)) return (struct file *)mnt; - error = vfs_path_lookup(mnt->mnt_root, mnt, name, 0, &nd); + file = file_open_root(mnt->mnt_root, mnt, 
name, flags); + mntput(mnt); /* drop do_kern_mount reference */ - if (error) - return ERR_PTR(error); - - if (flags == O_RDWR) - error = may_open(&nd.path, MAY_READ|MAY_WRITE, flags); - else - error = may_open(&nd.path, MAY_WRITE, flags); - - if (!error) - return dentry_open(nd.path.dentry, nd.path.mnt, flags, - current_cred()); - - path_put(&nd.path); - return ERR_PTR(error); + return file; } static struct { diff --git a/fs/open.c b/fs/open.c index d05e18c60bae..48afc5c139d2 100644 --- a/fs/open.c +++ b/fs/open.c @@ -959,6 +959,20 @@ struct file *filp_open(const char *filename, int flags, int mode) } EXPORT_SYMBOL(filp_open); +struct file *file_open_root(struct dentry *dentry, struct vfsmount *mnt, + const char *filename, int flags) +{ + struct open_flags op; + int lookup = build_open_flags(flags, 0, &op); + if (flags & O_CREAT) + return ERR_PTR(-EINVAL); + if (!filename && (flags & O_DIRECTORY)) + if (!dentry->d_inode->i_op->lookup) + return ERR_PTR(-ENOTDIR); + return do_file_open_root(dentry, mnt, filename, &op, lookup); +} +EXPORT_SYMBOL(file_open_root); + long do_sys_open(int dfd, const char __user *filename, int flags, int mode) { struct open_flags op; diff --git a/include/linux/fs.h b/include/linux/fs.h index 9c75714f92c1..bf5c3c896072 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1990,6 +1990,8 @@ extern int do_fallocate(struct file *file, int mode, loff_t offset, extern long do_sys_open(int dfd, const char __user *filename, int flags, int mode); extern struct file *filp_open(const char *, int, int); +extern struct file *file_open_root(struct dentry *, struct vfsmount *, + const char *, int); extern struct file * dentry_open(struct dentry *, struct vfsmount *, int, const struct cred *); extern int filp_close(struct file *, fl_owner_t id); @@ -2205,8 +2207,6 @@ extern struct file *create_read_pipe(struct file *f, int flags); extern struct file *create_write_pipe(int flags); extern void free_write_pipe(struct file *); -extern int may_open(struct path *, int, int); - extern int kernel_read(struct file *, loff_t, char *, unsigned long); extern struct file * open_exec(const char *); diff --git a/kernel/sysctl_binary.c b/kernel/sysctl_binary.c index b875bedf7c9a..3b8e028b9601 100644 --- a/kernel/sysctl_binary.c +++ b/kernel/sysctl_binary.c @@ -1321,13 +1321,11 @@ static ssize_t binary_sysctl(const int *name, int nlen, void __user *oldval, size_t oldlen, void __user *newval, size_t newlen) { const struct bin_table *table = NULL; - struct nameidata nd; struct vfsmount *mnt; struct file *file; ssize_t result; char *pathname; int flags; - int acc_mode; pathname = sysctl_getname(name, nlen, &table); result = PTR_ERR(pathname); @@ -1337,28 +1335,17 @@ static ssize_t binary_sysctl(const int *name, int nlen, /* How should the sysctl be accessed? 
*/ if (oldval && oldlen && newval && newlen) { flags = O_RDWR; - acc_mode = MAY_READ | MAY_WRITE; } else if (newval && newlen) { flags = O_WRONLY; - acc_mode = MAY_WRITE; } else if (oldval && oldlen) { flags = O_RDONLY; - acc_mode = MAY_READ; } else { result = 0; goto out_putname; } mnt = current->nsproxy->pid_ns->proc_mnt; - result = vfs_path_lookup(mnt->mnt_root, mnt, pathname, 0, &nd); - if (result) - goto out_putname; - - result = may_open(&nd.path, acc_mode, flags); - if (result) - goto out_putpath; - - file = dentry_open(nd.path.dentry, nd.path.mnt, flags, current_cred()); + file = file_open_root(mnt->mnt_root, mnt, pathname, flags); result = PTR_ERR(file); if (IS_ERR(file)) goto out_putname; @@ -1370,10 +1357,6 @@ out_putname: putname(pathname); out: return result; - -out_putpath: - path_put(&nd.path); - goto out_putname; } From c8b91accfa1059d5565443193d89572eca2f5dd6 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sat, 12 Mar 2011 10:41:39 -0500 Subject: [PATCH 665/706] clean statfs-like syscalls up New helpers: user_statfs() and fd_statfs(), taking userland pathname and descriptor resp. and filling struct kstatfs. Syscalls of statfs family (native, compat and foreign - osf and hpux on alpha and parisc resp.) switched to those. Removes some boilerplate code, simplifies cleanup on errors... Signed-off-by: Al Viro --- arch/alpha/kernel/osf_sys.c | 36 ++------ arch/parisc/hpux/sys_hpux.c | 65 +++++-------- fs/compat.c | 68 ++++---------- fs/statfs.c | 176 +++++++++++++++++------------------- include/linux/fs.h | 2 + 5 files changed, 133 insertions(+), 214 deletions(-) diff --git a/arch/alpha/kernel/osf_sys.c b/arch/alpha/kernel/osf_sys.c index fe698b5045e9..376f22130791 100644 --- a/arch/alpha/kernel/osf_sys.c +++ b/arch/alpha/kernel/osf_sys.c @@ -230,44 +230,24 @@ linux_to_osf_statfs(struct kstatfs *linux_stat, struct osf_statfs __user *osf_st return copy_to_user(osf_stat, &tmp_stat, bufsiz) ? 
-EFAULT : 0; } -static int -do_osf_statfs(struct path *path, struct osf_statfs __user *buffer, - unsigned long bufsiz) +SYSCALL_DEFINE3(osf_statfs, const char __user *, pathname, + struct osf_statfs __user *, buffer, unsigned long, bufsiz) { struct kstatfs linux_stat; - int error = vfs_statfs(path, &linux_stat); + int error = user_statfs(pathname, &linux_stat); if (!error) error = linux_to_osf_statfs(&linux_stat, buffer, bufsiz); return error; } -SYSCALL_DEFINE3(osf_statfs, const char __user *, pathname, - struct osf_statfs __user *, buffer, unsigned long, bufsiz) -{ - struct path path; - int retval; - - retval = user_path(pathname, &path); - if (!retval) { - retval = do_osf_statfs(&path, buffer, bufsiz); - path_put(&path); - } - return retval; -} - SYSCALL_DEFINE3(osf_fstatfs, unsigned long, fd, struct osf_statfs __user *, buffer, unsigned long, bufsiz) { - struct file *file; - int retval; - - retval = -EBADF; - file = fget(fd); - if (file) { - retval = do_osf_statfs(&file->f_path, buffer, bufsiz); - fput(file); - } - return retval; + struct kstatfs linux_stat; + int error = fd_statfs(fd, &linux_stat); + if (!error) + error = linux_to_osf_statfs(&linux_stat, buffer, bufsiz); + return error; } /* diff --git a/arch/parisc/hpux/sys_hpux.c b/arch/parisc/hpux/sys_hpux.c index 30394081d9b6..6ab9580b0b00 100644 --- a/arch/parisc/hpux/sys_hpux.c +++ b/arch/parisc/hpux/sys_hpux.c @@ -185,26 +185,21 @@ struct hpux_statfs { int16_t f_pad; }; -static int do_statfs_hpux(struct path *path, struct hpux_statfs *buf) +static int do_statfs_hpux(struct kstatfs *st, struct hpux_statfs __user *p) { - struct kstatfs st; - int retval; - - retval = vfs_statfs(path, &st); - if (retval) - return retval; - - memset(buf, 0, sizeof(*buf)); - buf->f_type = st.f_type; - buf->f_bsize = st.f_bsize; - buf->f_blocks = st.f_blocks; - buf->f_bfree = st.f_bfree; - buf->f_bavail = st.f_bavail; - buf->f_files = st.f_files; - buf->f_ffree = st.f_ffree; - buf->f_fsid[0] = st.f_fsid.val[0]; - buf->f_fsid[1] = st.f_fsid.val[1]; - + struct hpux_statfs buf; + memset(&buf, 0, sizeof(buf)); + buf.f_type = st->f_type; + buf.f_bsize = st->f_bsize; + buf.f_blocks = st->f_blocks; + buf.f_bfree = st->f_bfree; + buf.f_bavail = st->f_bavail; + buf.f_files = st->f_files; + buf.f_ffree = st->f_ffree; + buf.f_fsid[0] = st->f_fsid.val[0]; + buf.f_fsid[1] = st->f_fsid.val[1]; + if (copy_to_user(p, &buf, sizeof(buf))) + return -EFAULT; return 0; } @@ -212,35 +207,19 @@ static int do_statfs_hpux(struct path *path, struct hpux_statfs *buf) asmlinkage long hpux_statfs(const char __user *pathname, struct hpux_statfs __user *buf) { - struct path path; - int error; - - error = user_path(pathname, &path); - if (!error) { - struct hpux_statfs tmp; - error = do_statfs_hpux(&path, &tmp); - if (!error && copy_to_user(buf, &tmp, sizeof(tmp))) - error = -EFAULT; - path_put(&path); - } + struct kstatfs st; + int error = user_statfs(pathname, &st); + if (!error) + error = do_statfs_hpux(&st, buf); return error; } asmlinkage long hpux_fstatfs(unsigned int fd, struct hpux_statfs __user * buf) { - struct file *file; - struct hpux_statfs tmp; - int error; - - error = -EBADF; - file = fget(fd); - if (!file) - goto out; - error = do_statfs_hpux(&file->f_path, &tmp); - if (!error && copy_to_user(buf, &tmp, sizeof(tmp))) - error = -EFAULT; - fput(file); - out: + struct kstatfs st; + int error = fd_statfs(fd, &st); + if (!error) + error = do_statfs_hpux(&st, buf); return error; } diff --git a/fs/compat.c b/fs/compat.c index 691c3fd8ce1d..a071775f3bb3 100644 --- 
a/fs/compat.c +++ b/fs/compat.c @@ -262,35 +262,19 @@ static int put_compat_statfs(struct compat_statfs __user *ubuf, struct kstatfs * */ asmlinkage long compat_sys_statfs(const char __user *pathname, struct compat_statfs __user *buf) { - struct path path; - int error; - - error = user_path(pathname, &path); - if (!error) { - struct kstatfs tmp; - error = vfs_statfs(&path, &tmp); - if (!error) - error = put_compat_statfs(buf, &tmp); - path_put(&path); - } + struct kstatfs tmp; + int error = user_statfs(pathname, &tmp); + if (!error) + error = put_compat_statfs(buf, &tmp); return error; } asmlinkage long compat_sys_fstatfs(unsigned int fd, struct compat_statfs __user *buf) { - struct file * file; struct kstatfs tmp; - int error; - - error = -EBADF; - file = fget(fd); - if (!file) - goto out; - error = vfs_statfs(&file->f_path, &tmp); + int error = fd_statfs(fd, &tmp); if (!error) error = put_compat_statfs(buf, &tmp); - fput(file); -out: return error; } @@ -329,41 +313,29 @@ static int put_compat_statfs64(struct compat_statfs64 __user *ubuf, struct kstat asmlinkage long compat_sys_statfs64(const char __user *pathname, compat_size_t sz, struct compat_statfs64 __user *buf) { - struct path path; - int error; - - if (sz != sizeof(*buf)) - return -EINVAL; - - error = user_path(pathname, &path); - if (!error) { - struct kstatfs tmp; - error = vfs_statfs(&path, &tmp); - if (!error) - error = put_compat_statfs64(buf, &tmp); - path_put(&path); - } - return error; -} - -asmlinkage long compat_sys_fstatfs64(unsigned int fd, compat_size_t sz, struct compat_statfs64 __user *buf) -{ - struct file * file; struct kstatfs tmp; int error; if (sz != sizeof(*buf)) return -EINVAL; - error = -EBADF; - file = fget(fd); - if (!file) - goto out; - error = vfs_statfs(&file->f_path, &tmp); + error = user_statfs(pathname, &tmp); + if (!error) + error = put_compat_statfs64(buf, &tmp); + return error; +} + +asmlinkage long compat_sys_fstatfs64(unsigned int fd, compat_size_t sz, struct compat_statfs64 __user *buf) +{ + struct kstatfs tmp; + int error; + + if (sz != sizeof(*buf)) + return -EINVAL; + + error = fd_statfs(fd, &tmp); if (!error) error = put_compat_statfs64(buf, &tmp); - fput(file); -out: return error; } diff --git a/fs/statfs.c b/fs/statfs.c index 30ea8c8a996b..8244924dec55 100644 --- a/fs/statfs.c +++ b/fs/statfs.c @@ -73,149 +73,135 @@ int vfs_statfs(struct path *path, struct kstatfs *buf) } EXPORT_SYMBOL(vfs_statfs); -static int do_statfs_native(struct path *path, struct statfs *buf) +int user_statfs(const char __user *pathname, struct kstatfs *st) { - struct kstatfs st; - int retval; + struct path path; + int error = user_path(pathname, &path); + if (!error) { + error = vfs_statfs(&path, st); + path_put(&path); + } + return error; +} - retval = vfs_statfs(path, &st); - if (retval) - return retval; +int fd_statfs(int fd, struct kstatfs *st) +{ + struct file *file = fget(fd); + int error = -EBADF; + if (file) { + error = vfs_statfs(&file->f_path, st); + fput(file); + } + return error; +} - if (sizeof(*buf) == sizeof(st)) - memcpy(buf, &st, sizeof(st)); +static int do_statfs_native(struct kstatfs *st, struct statfs __user *p) +{ + struct statfs buf; + + if (sizeof(buf) == sizeof(*st)) + memcpy(&buf, st, sizeof(*st)); else { - if (sizeof buf->f_blocks == 4) { - if ((st.f_blocks | st.f_bfree | st.f_bavail | - st.f_bsize | st.f_frsize) & + if (sizeof buf.f_blocks == 4) { + if ((st->f_blocks | st->f_bfree | st->f_bavail | + st->f_bsize | st->f_frsize) & 0xffffffff00000000ULL) return -EOVERFLOW; /* * f_files and 
f_ffree may be -1; it's okay to stuff * that into 32 bits */ - if (st.f_files != -1 && - (st.f_files & 0xffffffff00000000ULL)) + if (st->f_files != -1 && + (st->f_files & 0xffffffff00000000ULL)) return -EOVERFLOW; - if (st.f_ffree != -1 && - (st.f_ffree & 0xffffffff00000000ULL)) + if (st->f_ffree != -1 && + (st->f_ffree & 0xffffffff00000000ULL)) return -EOVERFLOW; } - buf->f_type = st.f_type; - buf->f_bsize = st.f_bsize; - buf->f_blocks = st.f_blocks; - buf->f_bfree = st.f_bfree; - buf->f_bavail = st.f_bavail; - buf->f_files = st.f_files; - buf->f_ffree = st.f_ffree; - buf->f_fsid = st.f_fsid; - buf->f_namelen = st.f_namelen; - buf->f_frsize = st.f_frsize; - buf->f_flags = st.f_flags; - memset(buf->f_spare, 0, sizeof(buf->f_spare)); + buf.f_type = st->f_type; + buf.f_bsize = st->f_bsize; + buf.f_blocks = st->f_blocks; + buf.f_bfree = st->f_bfree; + buf.f_bavail = st->f_bavail; + buf.f_files = st->f_files; + buf.f_ffree = st->f_ffree; + buf.f_fsid = st->f_fsid; + buf.f_namelen = st->f_namelen; + buf.f_frsize = st->f_frsize; + buf.f_flags = st->f_flags; + memset(buf.f_spare, 0, sizeof(buf.f_spare)); } + if (copy_to_user(p, &buf, sizeof(buf))) + return -EFAULT; return 0; } -static int do_statfs64(struct path *path, struct statfs64 *buf) +static int do_statfs64(struct kstatfs *st, struct statfs64 __user *p) { - struct kstatfs st; - int retval; - - retval = vfs_statfs(path, &st); - if (retval) - return retval; - - if (sizeof(*buf) == sizeof(st)) - memcpy(buf, &st, sizeof(st)); + struct statfs64 buf; + if (sizeof(buf) == sizeof(*st)) + memcpy(&buf, st, sizeof(*st)); else { - buf->f_type = st.f_type; - buf->f_bsize = st.f_bsize; - buf->f_blocks = st.f_blocks; - buf->f_bfree = st.f_bfree; - buf->f_bavail = st.f_bavail; - buf->f_files = st.f_files; - buf->f_ffree = st.f_ffree; - buf->f_fsid = st.f_fsid; - buf->f_namelen = st.f_namelen; - buf->f_frsize = st.f_frsize; - buf->f_flags = st.f_flags; - memset(buf->f_spare, 0, sizeof(buf->f_spare)); + buf.f_type = st->f_type; + buf.f_bsize = st->f_bsize; + buf.f_blocks = st->f_blocks; + buf.f_bfree = st->f_bfree; + buf.f_bavail = st->f_bavail; + buf.f_files = st->f_files; + buf.f_ffree = st->f_ffree; + buf.f_fsid = st->f_fsid; + buf.f_namelen = st->f_namelen; + buf.f_frsize = st->f_frsize; + buf.f_flags = st->f_flags; + memset(buf.f_spare, 0, sizeof(buf.f_spare)); } + if (copy_to_user(p, &buf, sizeof(buf))) + return -EFAULT; return 0; } SYSCALL_DEFINE2(statfs, const char __user *, pathname, struct statfs __user *, buf) { - struct path path; - int error; - - error = user_path(pathname, &path); - if (!error) { - struct statfs tmp; - error = do_statfs_native(&path, &tmp); - if (!error && copy_to_user(buf, &tmp, sizeof(tmp))) - error = -EFAULT; - path_put(&path); - } + struct kstatfs st; + int error = user_statfs(pathname, &st); + if (!error) + error = do_statfs_native(&st, buf); return error; } SYSCALL_DEFINE3(statfs64, const char __user *, pathname, size_t, sz, struct statfs64 __user *, buf) { - struct path path; - long error; - + struct kstatfs st; + int error; if (sz != sizeof(*buf)) return -EINVAL; - error = user_path(pathname, &path); - if (!error) { - struct statfs64 tmp; - error = do_statfs64(&path, &tmp); - if (!error && copy_to_user(buf, &tmp, sizeof(tmp))) - error = -EFAULT; - path_put(&path); - } + error = user_statfs(pathname, &st); + if (!error) + error = do_statfs64(&st, buf); return error; } SYSCALL_DEFINE2(fstatfs, unsigned int, fd, struct statfs __user *, buf) { - struct file *file; - struct statfs tmp; - int error; - - error = -EBADF; - 
file = fget(fd);
-	if (!file)
-		goto out;
-	error = do_statfs_native(&file->f_path, &tmp);
-	if (!error && copy_to_user(buf, &tmp, sizeof(tmp)))
-		error = -EFAULT;
-	fput(file);
-out:
+	struct kstatfs st;
+	int error = fd_statfs(fd, &st);
+	if (!error)
+		error = do_statfs_native(&st, buf);
 	return error;
 }
 
 SYSCALL_DEFINE3(fstatfs64, unsigned int, fd, size_t, sz, struct statfs64 __user *, buf)
 {
-	struct file *file;
-	struct statfs64 tmp;
+	struct kstatfs st;
 	int error;
 
 	if (sz != sizeof(*buf))
 		return -EINVAL;
 
-	error = -EBADF;
-	file = fget(fd);
-	if (!file)
-		goto out;
-	error = do_statfs64(&file->f_path, &tmp);
-	if (!error && copy_to_user(buf, &tmp, sizeof(tmp)))
-		error = -EFAULT;
-	fput(file);
-out:
+	error = fd_statfs(fd, &st);
+	if (!error)
+		error = do_statfs64(&st, buf);
 	return error;
 }
 
diff --git a/include/linux/fs.h b/include/linux/fs.h
index bf5c3c896072..b7178b05cf3a 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1874,6 +1874,8 @@ extern void drop_collected_mounts(struct vfsmount *);
 extern int iterate_mounts(int (*)(struct vfsmount *, void *), void *,
 			  struct vfsmount *);
 extern int vfs_statfs(struct path *, struct kstatfs *);
+extern int user_statfs(const char __user *, struct kstatfs *);
+extern int fd_statfs(int, struct kstatfs *);
 extern int statfs_by_dentry(struct dentry *, struct kstatfs *);
 extern int freeze_super(struct super_block *super);
 extern int thaw_super(struct super_block *super);

From 5fe0c2378884e68beb532f5890cc0e3539ac747b Mon Sep 17 00:00:00 2001
From: "Aneesh Kumar K.V"
Date: Sat, 29 Jan 2011 18:43:25 +0530
Subject: [PATCH 666/706] exportfs: Return the minimum required handle size

The exportfs encode handle function should return the minimum required
handle size. This helps the user find out the handle size by passing a
zero handle size on the first call and then redoing the call with the
returned handle size value.
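
[A sketch, not part of the series: with this convention a caller of
exportfs_encode_fh() can size the handle buffer in two steps. The
zero-size probe and the kmalloc() sizing below are assumptions based on
the description above, not code taken from any of these patches.]

	int handle_dwords = 0;		/* probe with no room at all */
	struct fid *fid = NULL;
	int type = exportfs_encode_fh(dentry, fid, &handle_dwords, 0);

	if (type == 255) {
		/* "too small": handle_dwords now holds the minimum
		 * required size, in 4-byte units */
		fid = kmalloc(handle_dwords * 4, GFP_KERNEL);
		if (fid)
			type = exportfs_encode_fh(dentry, fid,
						  &handle_dwords, 0);
	}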
Acked-by: Serge Hallyn Signed-off-by: Aneesh Kumar K.V Signed-off-by: Al Viro --- fs/btrfs/export.c | 8 ++++++-- fs/exportfs/expfs.c | 9 +++++++-- fs/fat/inode.c | 4 +++- fs/fuse/inode.c | 4 +++- fs/gfs2/export.c | 8 ++++++-- fs/isofs/export.c | 8 ++++++-- fs/ocfs2/export.c | 8 ++++++-- fs/reiserfs/inode.c | 7 ++++++- fs/udf/namei.c | 7 ++++++- fs/xfs/linux-2.6/xfs_export.c | 4 +++- include/linux/exportfs.h | 6 ++++-- mm/shmem.c | 4 +++- 12 files changed, 59 insertions(+), 18 deletions(-) diff --git a/fs/btrfs/export.c b/fs/btrfs/export.c index ff27d7a477b2..b4ffad859adb 100644 --- a/fs/btrfs/export.c +++ b/fs/btrfs/export.c @@ -21,9 +21,13 @@ static int btrfs_encode_fh(struct dentry *dentry, u32 *fh, int *max_len, int len = *max_len; int type; - if ((len < BTRFS_FID_SIZE_NON_CONNECTABLE) || - (connectable && len < BTRFS_FID_SIZE_CONNECTABLE)) + if (connectable && (len < BTRFS_FID_SIZE_CONNECTABLE)) { + *max_len = BTRFS_FID_SIZE_CONNECTABLE; return 255; + } else if (len < BTRFS_FID_SIZE_NON_CONNECTABLE) { + *max_len = BTRFS_FID_SIZE_NON_CONNECTABLE; + return 255; + } len = BTRFS_FID_SIZE_NON_CONNECTABLE; type = FILEID_BTRFS_WITHOUT_PARENT; diff --git a/fs/exportfs/expfs.c b/fs/exportfs/expfs.c index 4b6825740dd5..cfe55731b6dc 100644 --- a/fs/exportfs/expfs.c +++ b/fs/exportfs/expfs.c @@ -320,9 +320,14 @@ static int export_encode_fh(struct dentry *dentry, struct fid *fid, struct inode * inode = dentry->d_inode; int len = *max_len; int type = FILEID_INO32_GEN; - - if (len < 2 || (connectable && len < 4)) + + if (connectable && (len < 4)) { + *max_len = 4; return 255; + } else if (len < 2) { + *max_len = 2; + return 255; + } len = 2; fid->i32.ino = inode->i_ino; diff --git a/fs/fat/inode.c b/fs/fat/inode.c index 86753fe10bd1..0e277ec4b612 100644 --- a/fs/fat/inode.c +++ b/fs/fat/inode.c @@ -757,8 +757,10 @@ fat_encode_fh(struct dentry *de, __u32 *fh, int *lenp, int connectable) struct inode *inode = de->d_inode; u32 ipos_h, ipos_m, ipos_l; - if (len < 5) + if (len < 5) { + *lenp = 5; return 255; /* no room */ + } ipos_h = MSDOS_I(inode)->i_pos >> 8; ipos_m = (MSDOS_I(inode)->i_pos & 0xf0) << 24; diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index 9e3f68cc1bd1..051b1a084528 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -637,8 +637,10 @@ static int fuse_encode_fh(struct dentry *dentry, u32 *fh, int *max_len, u64 nodeid; u32 generation; - if (*max_len < len) + if (*max_len < len) { + *max_len = len; return 255; + } nodeid = get_fuse_inode(inode)->nodeid; generation = inode->i_generation; diff --git a/fs/gfs2/export.c b/fs/gfs2/export.c index 9023db8184f9..b5a5e60df0d5 100644 --- a/fs/gfs2/export.c +++ b/fs/gfs2/export.c @@ -36,9 +36,13 @@ static int gfs2_encode_fh(struct dentry *dentry, __u32 *p, int *len, struct super_block *sb = inode->i_sb; struct gfs2_inode *ip = GFS2_I(inode); - if (*len < GFS2_SMALL_FH_SIZE || - (connectable && *len < GFS2_LARGE_FH_SIZE)) + if (connectable && (*len < GFS2_LARGE_FH_SIZE)) { + *len = GFS2_LARGE_FH_SIZE; return 255; + } else if (*len < GFS2_SMALL_FH_SIZE) { + *len = GFS2_SMALL_FH_SIZE; + return 255; + } fh[0] = cpu_to_be32(ip->i_no_formal_ino >> 32); fh[1] = cpu_to_be32(ip->i_no_formal_ino & 0xFFFFFFFF); diff --git a/fs/isofs/export.c b/fs/isofs/export.c index ed752cb38474..dd4687ff30d0 100644 --- a/fs/isofs/export.c +++ b/fs/isofs/export.c @@ -124,9 +124,13 @@ isofs_export_encode_fh(struct dentry *dentry, * offset of the inode and the upper 16 bits of fh32[1] to * hold the offset of the parent. 
*/ - - if (len < 3 || (connectable && len < 5)) + if (connectable && (len < 5)) { + *max_len = 5; return 255; + } else if (len < 3) { + *max_len = 3; + return 255; + } len = 3; fh32[0] = ei->i_iget5_block; diff --git a/fs/ocfs2/export.c b/fs/ocfs2/export.c index 5dbc3062b4fd..254652a9b542 100644 --- a/fs/ocfs2/export.c +++ b/fs/ocfs2/export.c @@ -197,8 +197,12 @@ static int ocfs2_encode_fh(struct dentry *dentry, u32 *fh_in, int *max_len, dentry->d_name.len, dentry->d_name.name, fh, len, connectable); - if (len < 3 || (connectable && len < 6)) { - mlog(ML_ERROR, "fh buffer is too small for encoding\n"); + if (connectable && (len < 6)) { + *max_len = 6; + type = 255; + goto bail; + } else if (len < 3) { + *max_len = 3; type = 255; goto bail; } diff --git a/fs/reiserfs/inode.c b/fs/reiserfs/inode.c index 0bae036831e2..1bba24bad820 100644 --- a/fs/reiserfs/inode.c +++ b/fs/reiserfs/inode.c @@ -1593,8 +1593,13 @@ int reiserfs_encode_fh(struct dentry *dentry, __u32 * data, int *lenp, struct inode *inode = dentry->d_inode; int maxlen = *lenp; - if (maxlen < 3) + if (need_parent && (maxlen < 5)) { + *lenp = 5; return 255; + } else if (maxlen < 3) { + *lenp = 3; + return 255; + } data[0] = inode->i_ino; data[1] = le32_to_cpu(INODE_PKEY(inode)->k_dir_id); diff --git a/fs/udf/namei.c b/fs/udf/namei.c index b7c338d5e9df..f1dce848ef96 100644 --- a/fs/udf/namei.c +++ b/fs/udf/namei.c @@ -1286,8 +1286,13 @@ static int udf_encode_fh(struct dentry *de, __u32 *fh, int *lenp, struct fid *fid = (struct fid *)fh; int type = FILEID_UDF_WITHOUT_PARENT; - if (len < 3 || (connectable && len < 5)) + if (connectable && (len < 5)) { + *lenp = 5; return 255; + } else if (len < 3) { + *lenp = 3; + return 255; + } *lenp = 3; fid->udf.block = location.logicalBlockNum; diff --git a/fs/xfs/linux-2.6/xfs_export.c b/fs/xfs/linux-2.6/xfs_export.c index fc0114da7fdd..f4f878fc0083 100644 --- a/fs/xfs/linux-2.6/xfs_export.c +++ b/fs/xfs/linux-2.6/xfs_export.c @@ -89,8 +89,10 @@ xfs_fs_encode_fh( * seven combinations work. The real answer is "don't use v2". */ len = xfs_fileid_length(fileid_type); - if (*max_len < len) + if (*max_len < len) { + *max_len = len; return 255; + } *max_len = len; switch (fileid_type) { diff --git a/include/linux/exportfs.h b/include/linux/exportfs.h index 28028988c862..65afdfd31b7b 100644 --- a/include/linux/exportfs.h +++ b/include/linux/exportfs.h @@ -121,8 +121,10 @@ struct fid { * set, the encode_fh() should store sufficient information so that a good * attempt can be made to find not only the file but also it's place in the * filesystem. This typically means storing a reference to de->d_parent in - * the filehandle fragment. encode_fh() should return the number of bytes - * stored or a negative error code such as %-ENOSPC + * the filehandle fragment. encode_fh() should return the fileid_type on + * success and on error returns 255 (if the space needed to encode fh is + * greater than @max_len*4 bytes). On error @max_len contains the minimum + * size(in 4 byte unit) needed to encode the file handle. 
 *
 * fh_to_dentry:
 *    @fh_to_dentry is given a &struct super_block (@sb) and a file handle
diff --git a/mm/shmem.c b/mm/shmem.c
index 5ee67c990602..3437b65d6d6e 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -2144,8 +2144,10 @@ static int shmem_encode_fh(struct dentry *dentry, __u32 *fh, int *len,
 {
 	struct inode *inode = dentry->d_inode;
 
-	if (*len < 3)
+	if (*len < 3) {
+		*len = 3;
 		return 255;
+	}
 
 	if (inode_unhashed(inode)) {
 		/* Unfortunately insert_inode_hash is not idempotent,

From f4cec35b0d4b90d96e3770a3d1e68ea882e7a7c8 Mon Sep 17 00:00:00 2001
From: Konrad Rzeszutek Wilk
Date: Tue, 18 Jan 2011 20:15:21 -0500
Subject: [PATCH 667/706] xen/mmu: Add the notion of identity (1-1) mapping.

Our P2M tree structure is three levels deep. On the leaf nodes we set
the Machine Frame Number (MFN) of the PFN. What this means is that when
one does: pfn_to_mfn(pfn), which is used when creating PTE entries, you
get the real MFN of the hardware. When Xen sets up a guest it initially
populates an array which has descending (or ascending) MFN values, as so:

 idx: 0, 1, 2
 [0x290F, 0x290E, 0x290D, ..]

so pfn_to_mfn(2)==0x290D. If you start and restart many guests, that
list starts looking quite random.

We graft this structure onto our P2M tree structure and stick those
MFNs in the leaves. But for all other leaf entries, or for the top
root, or middle one, for which there is a void entry, we assume it is
"missing". So pfn_to_mfn(0xc0000)=INVALID_P2M_ENTRY.

We add the possibility of setting 1-1 mappings on certain regions, so
that:
 pfn_to_mfn(0xc0000)=0xc0000

The benefit of this is that for non-RAM regions (think PCI BARs, or
ACPI spaces) we can create mappings easily because we get the PFN value
to match the MFN.

For this to work efficiently we introduce one new page p2m_identity and
allocate (via reserve_brk) any other pages we need to cover the sides
(1GB or 4MB boundary violations). All entries in p2m_identity are set
to INVALID_P2M_ENTRY type (the Xen toolstack only recognizes that and
MFNs, no other fancy value).

On lookup we spot that the entry points to p2m_identity and return the
identity value instead of dereferencing and returning INVALID_P2M_ENTRY.
If the entry points to an allocated page, we just proceed as before and
return the PFN. If the PFN has IDENTITY_FRAME_BIT set we unmask that in
the appropriate functions (pfn_to_mfn).

The reason for having the IDENTITY_FRAME_BIT instead of just returning
the PFN is that we could find ourselves where pfn_to_mfn(pfn)==pfn for
a non-identity pfn. To protect ourselves against that, we elect to set
(and get) the IDENTITY_FRAME_BIT on all identity mapped PFNs.

This simplistic diagram is used to explain the more subtle piece of
code. There is also a diagram of the P2M at the end that can help.
Imagine your E820 looking as so:

                            1GB                                  2GB
 /-------------------+---------\/----\    /----------\    /---+-----\
 | System RAM        | Sys RAM ||ACPI|    | reserved |    | Sys RAM |
 \-------------------+---------/\----/    \----------/    \---+-----/
                               ^- 1029MB                  ^- 2001MB

 [1029MB = 263424 (0x40500), 2001MB = 512256 (0x7D100),
  2048MB = 524288 (0x80000)]

And dom0_mem=max:3GB,1GB is passed in to the guest, meaning memory past
1GB is actually not present (one would have to kick the balloon driver
to put it in).

When we are told to set the PFNs for identity mapping (see patch:
"xen/setup: Set identity mapping for non-RAM E820 and E820 gaps.") we
pass in the start PFN and the end PFN (263424 and 512256 respectively).
The first step is to reserve_brk a top leaf page if the p2m[1] is
missing.
The top leaf page covers 512^2 of page estate (1GB) and in case the
start or end PFN is not aligned on 512^2*PAGE_SIZE (1GB) we loop on
aligned 1GB PFNs from start pfn to end pfn. We reserve_brk top leaf
pages if they are missing (meaning they point to p2m_mid_missing).

With the E820 example above, 263424 is not 1GB aligned so we allocate
a reserve_brk page which will cover the PFN estate from 0x40000 to
0x80000. Each entry in the allocated page is "missing" (points to
p2m_missing).

The next stage is to determine if we need to do a more granular
boundary check on the 4MB (or 2MB, depending on architecture)
boundaries of the start and end PFNs. We check if the start pfn and end
pfn violate that boundary check, and if so reserve_brk a middle
(p2m[x][y]) leaf page. This way we have a much finer granularity of
setting which PFNs are missing and which ones are identity. In our
example 263424 and 512256 both fail the check, so we reserve_brk two
pages. Populate them with INVALID_P2M_ENTRY (so they both have
"missing" values) and assign them to p2m[1][2] and p2m[1][488]
respectively.

At this point we would at minimum reserve_brk one page, but it could be
up to three. Each call to set_phys_range_identity has at maximum a
three page cost. If we were to query the P2M at this stage, all those
entries from start PFN through end PFN (so 1029MB -> 2001MB) would
return INVALID_P2M_ENTRY ("missing").

The next step is to walk from the start pfn to the end pfn setting the
IDENTITY_FRAME_BIT on each PFN. This is done in
'set_phys_range_identity'. If we find that the middle leaf is pointing
to p2m_missing we can swap it over to p2m_identity - this way covering
4MB (or 2MB) of PFN space. At this point we do not need to worry about
boundary alignment (so no need to reserve_brk a middle page, or figure
out which PFNs are "missing" and which ones are identity), as that has
been done earlier. If we find that the middle leaf is not occupied by
p2m_identity or p2m_missing, we dereference that page (which covers 512
PFNs) and set the appropriate PFN with IDENTITY_FRAME_BIT. In our
example 263424 and 512256 end up there, and we set from
p2m[1][2][256->511] and p2m[1][488][0->256] with IDENTITY_FRAME_BIT
set.

All other regions that are void (or not filled) either point to
p2m_missing (considered missing) or have the default value of
INVALID_P2M_ENTRY (also considered missing). In our case,
p2m[1][2][0->255] and p2m[1][488][257->511] contain the
INVALID_P2M_ENTRY value and are considered "missing."

This is what the p2m ends up looking like (for the E820 above) with
this fabulous drawing:

    p2m         /--------------\
  /-----\       | &mfn_list[0],|                           /-----------------\
  |  0  |------>| &mfn_list[1],|    /---------------\      | ~0, ~0, ..      |
  |-----|       |  ..., ~0, ~0 |    | ~0, ~0, [x]---+----->| IDENTITY [@256] |
  |  1  |---\   \--------------/    | [p2m_identity]+\     | IDENTITY [@257] |
  |-----|    \                      | [p2m_identity]+\\    | ....            |
  |  2  |--\  \-------------------->|  ...          | \\   \----------------/
  |-----|   \                       \---------------/  \\
  |  3  |\   \                                          \\  p2m_identity
  |-----| \   \-------------------->/---------------\   /-----------------\
  | ..  +->+                        | [p2m_identity]+-->| ~0, ~0, ~0, ... |
  \-----/ /                         | [p2m_identity]+-->| ..., ~0         |
         /   /---------------\      | ....          |   \-----------------/
        /    | IDENTITY[@0]  |    /-+-[x], ~0, ~0.. |
       /     | IDENTITY[@256]|<----/ \---------------/
      /      | ~0, ~0, ....  |
     |       \---------------/
     |
     p2m_missing             p2m_missing
  /------------------\     /------------\
  | [p2m_mid_missing]+---->| ~0, ~0, ~0 |
  | [p2m_mid_missing]+---->| ..., ~0    |
  \------------------/     \------------/

where ~0 is INVALID_P2M_ENTRY.
IDENTITY is (PFN | IDENTITY_BIT) Reviewed-by: Ian Campbell [v5: Changed code to use ranges, added ASCII art] [v6: Rebased on top of xen->p2m code split] [v4: Squished patches in just this one] [v7: Added RESERVE_BRK for potentially allocated pages] [v8: Fixed alignment problem] [v9: Changed 1<<3X to 1< --- arch/x86/include/asm/xen/page.h | 8 +- arch/x86/xen/p2m.c | 236 +++++++++++++++++++++++++++++++- 2 files changed, 240 insertions(+), 4 deletions(-) diff --git a/arch/x86/include/asm/xen/page.h b/arch/x86/include/asm/xen/page.h index 8ea977277c55..65fa4f26aa34 100644 --- a/arch/x86/include/asm/xen/page.h +++ b/arch/x86/include/asm/xen/page.h @@ -29,8 +29,10 @@ typedef struct xpaddr { /**** MACHINE <-> PHYSICAL CONVERSION MACROS ****/ #define INVALID_P2M_ENTRY (~0UL) -#define FOREIGN_FRAME_BIT (1UL<<31) +#define FOREIGN_FRAME_BIT (1UL<<(BITS_PER_LONG-1)) +#define IDENTITY_FRAME_BIT (1UL<<(BITS_PER_LONG-2)) #define FOREIGN_FRAME(m) ((m) | FOREIGN_FRAME_BIT) +#define IDENTITY_FRAME(m) ((m) | IDENTITY_FRAME_BIT) /* Maximum amount of memory we can handle in a domain in pages */ #define MAX_DOMAIN_PAGES \ @@ -42,6 +44,8 @@ extern unsigned int machine_to_phys_order; extern unsigned long get_phys_to_machine(unsigned long pfn); extern bool set_phys_to_machine(unsigned long pfn, unsigned long mfn); extern bool __set_phys_to_machine(unsigned long pfn, unsigned long mfn); +extern unsigned long set_phys_range_identity(unsigned long pfn_s, + unsigned long pfn_e); extern int m2p_add_override(unsigned long mfn, struct page *page); extern int m2p_remove_override(struct page *page); @@ -58,7 +62,7 @@ static inline unsigned long pfn_to_mfn(unsigned long pfn) mfn = get_phys_to_machine(pfn); if (mfn != INVALID_P2M_ENTRY) - mfn &= ~FOREIGN_FRAME_BIT; + mfn &= ~(FOREIGN_FRAME_BIT | IDENTITY_FRAME_BIT); return mfn; } diff --git a/arch/x86/xen/p2m.c b/arch/x86/xen/p2m.c index df4e36775339..809fe3536301 100644 --- a/arch/x86/xen/p2m.c +++ b/arch/x86/xen/p2m.c @@ -23,6 +23,129 @@ * P2M_PER_PAGE depends on the architecture, as a mfn is always * unsigned long (8 bytes on 64-bit, 4 bytes on 32), leading to * 512 and 1024 entries respectively. + * + * In short, these structures contain the Machine Frame Number (MFN) of the PFN. + * + * However not all entries are filled with MFNs. Specifically for all other + * leaf entries, or for the top root, or middle one, for which there is a void + * entry, we assume it is "missing". So (for example) + * pfn_to_mfn(0x90909090)=INVALID_P2M_ENTRY. + * + * We also have the possibility of setting 1-1 mappings on certain regions, so + * that: + * pfn_to_mfn(0xc0000)=0xc0000 + * + * The benefit of this is, that we can assume for non-RAM regions (think + * PCI BARs, or ACPI spaces), we can create mappings easily b/c we + * get the PFN value to match the MFN. + * + * For this to work efficiently we have one new page p2m_identity and + * allocate (via reserved_brk) any other pages we need to cover the sides + * (1GB or 4MB boundary violations). All entries in p2m_identity are set to + * INVALID_P2M_ENTRY type (Xen toolstack only recognizes that and MFNs, + * no other fancy value). + * + * On lookup we spot that the entry points to p2m_identity and return the + * identity value instead of dereferencing and returning INVALID_P2M_ENTRY. + * If the entry points to an allocated page, we just proceed as before and + * return the PFN. If the PFN has IDENTITY_FRAME_BIT set we unmask that in + * appropriate functions (pfn_to_mfn). 
+ * + * The reason for having the IDENTITY_FRAME_BIT instead of just returning the + * PFN is that we could find ourselves where pfn_to_mfn(pfn)==pfn for a + * non-identity pfn. To protect ourselves against we elect to set (and get) the + * IDENTITY_FRAME_BIT on all identity mapped PFNs. + * + * This simplistic diagram is used to explain the more subtle piece of code. + * There is also a digram of the P2M at the end that can help. + * Imagine your E820 looking as so: + * + * 1GB 2GB + * /-------------------+---------\/----\ /----------\ /---+-----\ + * | System RAM | Sys RAM ||ACPI| | reserved | | Sys RAM | + * \-------------------+---------/\----/ \----------/ \---+-----/ + * ^- 1029MB ^- 2001MB + * + * [1029MB = 263424 (0x40500), 2001MB = 512256 (0x7D100), + * 2048MB = 524288 (0x80000)] + * + * And dom0_mem=max:3GB,1GB is passed in to the guest, meaning memory past 1GB + * is actually not present (would have to kick the balloon driver to put it in). + * + * When we are told to set the PFNs for identity mapping (see patch: "xen/setup: + * Set identity mapping for non-RAM E820 and E820 gaps.") we pass in the start + * of the PFN and the end PFN (263424 and 512256 respectively). The first step + * is to reserve_brk a top leaf page if the p2m[1] is missing. The top leaf page + * covers 512^2 of page estate (1GB) and in case the start or end PFN is not + * aligned on 512^2*PAGE_SIZE (1GB) we loop on aligned 1GB PFNs from start pfn + * to end pfn. We reserve_brk top leaf pages if they are missing (means they + * point to p2m_mid_missing). + * + * With the E820 example above, 263424 is not 1GB aligned so we allocate a + * reserve_brk page which will cover the PFNs estate from 0x40000 to 0x80000. + * Each entry in the allocate page is "missing" (points to p2m_missing). + * + * Next stage is to determine if we need to do a more granular boundary check + * on the 4MB (or 2MB depending on architecture) off the start and end pfn's. + * We check if the start pfn and end pfn violate that boundary check, and if + * so reserve_brk a middle (p2m[x][y]) leaf page. This way we have a much finer + * granularity of setting which PFNs are missing and which ones are identity. + * In our example 263424 and 512256 both fail the check so we reserve_brk two + * pages. Populate them with INVALID_P2M_ENTRY (so they both have "missing" + * values) and assign them to p2m[1][2] and p2m[1][488] respectively. + * + * At this point we would at minimum reserve_brk one page, but could be up to + * three. Each call to set_phys_range_identity has at maximum a three page + * cost. If we were to query the P2M at this stage, all those entries from + * start PFN through end PFN (so 1029MB -> 2001MB) would return + * INVALID_P2M_ENTRY ("missing"). + * + * The next step is to walk from the start pfn to the end pfn setting + * the IDENTITY_FRAME_BIT on each PFN. This is done in set_phys_range_identity. + * If we find that the middle leaf is pointing to p2m_missing we can swap it + * over to p2m_identity - this way covering 4MB (or 2MB) PFN space. At this + * point we do not need to worry about boundary aligment (so no need to + * reserve_brk a middle page, figure out which PFNs are "missing" and which + * ones are identity), as that has been done earlier. If we find that the + * middle leaf is not occupied by p2m_identity or p2m_missing, we dereference + * that page (which covers 512 PFNs) and set the appropriate PFN with + * IDENTITY_FRAME_BIT. 
In our example 263424 and 512256 end up there, and we + * set from p2m[1][2][256->511] and p2m[1][488][0->256] with + * IDENTITY_FRAME_BIT set. + * + * All other regions that are void (or not filled) either point to p2m_missing + * (considered missing) or have the default value of INVALID_P2M_ENTRY (also + * considered missing). In our case, p2m[1][2][0->255] and p2m[1][488][257->511] + * contain the INVALID_P2M_ENTRY value and are considered "missing." + * + * This is what the p2m ends up looking (for the E820 above) with this + * fabulous drawing: + * + * p2m /--------------\ + * /-----\ | &mfn_list[0],| /-----------------\ + * | 0 |------>| &mfn_list[1],| /---------------\ | ~0, ~0, .. | + * |-----| | ..., ~0, ~0 | | ~0, ~0, [x]---+----->| IDENTITY [@256] | + * | 1 |---\ \--------------/ | [p2m_identity]+\ | IDENTITY [@257] | + * |-----| \ | [p2m_identity]+\\ | .... | + * | 2 |--\ \-------------------->| ... | \\ \----------------/ + * |-----| \ \---------------/ \\ + * | 3 |\ \ \\ p2m_identity + * |-----| \ \-------------------->/---------------\ /-----------------\ + * | .. +->+ | [p2m_identity]+-->| ~0, ~0, ~0, ... | + * \-----/ / | [p2m_identity]+-->| ..., ~0 | + * / /---------------\ | .... | \-----------------/ + * / | IDENTITY[@0] | /-+-[x], ~0, ~0.. | + * / | IDENTITY[@256]|<----/ \---------------/ + * / | ~0, ~0, .... | + * | \---------------/ + * | + * p2m_missing p2m_missing + * /------------------\ /------------\ + * | [p2m_mid_missing]+---->| ~0, ~0, ~0 | + * | [p2m_mid_missing]+---->| ..., ~0 | + * \------------------/ \------------/ + * + * where ~0 is INVALID_P2M_ENTRY. IDENTITY is (PFN | IDENTITY_BIT) */ #include @@ -59,9 +182,15 @@ static RESERVE_BRK_ARRAY(unsigned long **, p2m_top, P2M_TOP_PER_PAGE); static RESERVE_BRK_ARRAY(unsigned long, p2m_top_mfn, P2M_TOP_PER_PAGE); static RESERVE_BRK_ARRAY(unsigned long *, p2m_top_mfn_p, P2M_TOP_PER_PAGE); +static RESERVE_BRK_ARRAY(unsigned long, p2m_identity, P2M_PER_PAGE); + RESERVE_BRK(p2m_mid, PAGE_SIZE * (MAX_DOMAIN_PAGES / (P2M_PER_PAGE * P2M_MID_PER_PAGE))); RESERVE_BRK(p2m_mid_mfn, PAGE_SIZE * (MAX_DOMAIN_PAGES / (P2M_PER_PAGE * P2M_MID_PER_PAGE))); +/* We might hit two boundary violations at the start and end, at max each + * boundary violation will require three middle nodes. */ +RESERVE_BRK(p2m_mid_identity, PAGE_SIZE * 2 * 3); + static inline unsigned p2m_top_index(unsigned long pfn) { BUG_ON(pfn >= MAX_P2M_PFN); @@ -221,6 +350,9 @@ void __init xen_build_dynamic_phys_to_machine(void) p2m_top = extend_brk(PAGE_SIZE, PAGE_SIZE); p2m_top_init(p2m_top); + p2m_identity = extend_brk(PAGE_SIZE, PAGE_SIZE); + p2m_init(p2m_identity); + /* * The domain builder gives us a pre-constructed p2m array in * mfn_list for all the pages initially given to us, so we just @@ -272,6 +404,14 @@ unsigned long get_phys_to_machine(unsigned long pfn) mididx = p2m_mid_index(pfn); idx = p2m_index(pfn); + /* + * The INVALID_P2M_ENTRY is filled in both p2m_*identity + * and in p2m_*missing, so returning the INVALID_P2M_ENTRY + * would be wrong. 
+ */ + if (p2m_top[topidx][mididx] == p2m_identity) + return IDENTITY_FRAME(pfn); + return p2m_top[topidx][mididx][idx]; } EXPORT_SYMBOL_GPL(get_phys_to_machine); @@ -341,9 +481,11 @@ static bool alloc_p2m(unsigned long pfn) p2m_top_mfn_p[topidx] = mid_mfn; } - if (p2m_top[topidx][mididx] == p2m_missing) { + if (p2m_top[topidx][mididx] == p2m_identity || + p2m_top[topidx][mididx] == p2m_missing) { /* p2m leaf page is missing */ unsigned long *p2m; + unsigned long *p2m_orig = p2m_top[topidx][mididx]; p2m = alloc_p2m_page(); if (!p2m) @@ -351,7 +493,7 @@ static bool alloc_p2m(unsigned long pfn) p2m_init(p2m); - if (cmpxchg(&mid[mididx], p2m_missing, p2m) != p2m_missing) + if (cmpxchg(&mid[mididx], p2m_orig, p2m) != p2m_orig) free_p2m_page(p2m); else mid_mfn[mididx] = virt_to_mfn(p2m); @@ -360,6 +502,82 @@ static bool alloc_p2m(unsigned long pfn) return true; } +bool __early_alloc_p2m(unsigned long pfn) +{ + unsigned topidx, mididx, idx; + + topidx = p2m_top_index(pfn); + mididx = p2m_mid_index(pfn); + idx = p2m_index(pfn); + + /* Pfff.. No boundary cross-over, lets get out. */ + if (!idx) + return false; + + WARN(p2m_top[topidx][mididx] == p2m_identity, + "P2M[%d][%d] == IDENTITY, should be MISSING (or alloced)!\n", + topidx, mididx); + + /* + * Could be done by xen_build_dynamic_phys_to_machine.. + */ + if (p2m_top[topidx][mididx] != p2m_missing) + return false; + + /* Boundary cross-over for the edges: */ + if (idx) { + unsigned long *p2m = extend_brk(PAGE_SIZE, PAGE_SIZE); + + p2m_init(p2m); + + p2m_top[topidx][mididx] = p2m; + + } + return idx != 0; +} +unsigned long set_phys_range_identity(unsigned long pfn_s, + unsigned long pfn_e) +{ + unsigned long pfn; + + if (unlikely(pfn_s >= MAX_P2M_PFN || pfn_e >= MAX_P2M_PFN)) + return 0; + + if (unlikely(xen_feature(XENFEAT_auto_translated_physmap))) + return pfn_e - pfn_s; + + if (pfn_s > pfn_e) + return 0; + + for (pfn = (pfn_s & ~(P2M_MID_PER_PAGE * P2M_PER_PAGE - 1)); + pfn < ALIGN(pfn_e, (P2M_MID_PER_PAGE * P2M_PER_PAGE)); + pfn += P2M_MID_PER_PAGE * P2M_PER_PAGE) + { + unsigned topidx = p2m_top_index(pfn); + if (p2m_top[topidx] == p2m_mid_missing) { + unsigned long **mid = extend_brk(PAGE_SIZE, PAGE_SIZE); + + p2m_mid_init(mid); + + p2m_top[topidx] = mid; + } + } + + __early_alloc_p2m(pfn_s); + __early_alloc_p2m(pfn_e); + + for (pfn = pfn_s; pfn < pfn_e; pfn++) + if (!__set_phys_to_machine(pfn, IDENTITY_FRAME(pfn))) + break; + + if (!WARN((pfn - pfn_s) != (pfn_e - pfn_s), + "Identity mapping failed. We are %ld short of 1-1 mappings!\n", + (pfn_e - pfn_s) - (pfn - pfn_s))) + printk(KERN_DEBUG "1-1 mapping on %lx->%lx\n", pfn_s, pfn); + + return pfn - pfn_s; +} + /* Try to install p2m mapping; fail if intermediate bits missing */ bool __set_phys_to_machine(unsigned long pfn, unsigned long mfn) { @@ -378,6 +596,20 @@ bool __set_phys_to_machine(unsigned long pfn, unsigned long mfn) mididx = p2m_mid_index(pfn); idx = p2m_index(pfn); + /* For sparse holes were the p2m leaf has real PFN along with + * PCI holes, stick in the PFN as the MFN value. + */ + if (mfn != INVALID_P2M_ENTRY && (mfn & IDENTITY_FRAME_BIT)) { + if (p2m_top[topidx][mididx] == p2m_identity) + return true; + + /* Swap over from MISSING to IDENTITY if needed. 
*/
+		if (p2m_top[topidx][mididx] == p2m_missing) {
+			p2m_top[topidx][mididx] = p2m_identity;
+			return true;
+		}
+	}
+
 	if (p2m_top[topidx][mididx] == p2m_missing)
 		return mfn == INVALID_P2M_ENTRY;

From fb38923ead10aa8a28db191548e176e8856614d7 Mon Sep 17 00:00:00 2001
From: Konrad Rzeszutek Wilk
Date: Wed, 5 Jan 2011 15:46:31 -0500
Subject: [PATCH 668/706] xen/mmu: Set _PAGE_IOMAP if PFN is an identity PFN.

If we find that the PFN is within the P2M as an identity PFN, make sure
to tack on the _PAGE_IOMAP flag.

Reviewed-by: Ian Campbell
Signed-off-by: Konrad Rzeszutek Wilk
---
 arch/x86/xen/mmu.c | 18 ++++++++++++++++--
 1 file changed, 16 insertions(+), 2 deletions(-)

diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c
index 0180ae88307b..9c9e07615139 100644
--- a/arch/x86/xen/mmu.c
+++ b/arch/x86/xen/mmu.c
@@ -416,8 +416,12 @@ static pteval_t pte_pfn_to_mfn(pteval_t val)
 	if (val & _PAGE_PRESENT) {
 		unsigned long pfn = (val & PTE_PFN_MASK) >> PAGE_SHIFT;
 		pteval_t flags = val & PTE_FLAGS_MASK;
-		unsigned long mfn = pfn_to_mfn(pfn);
+		unsigned long mfn;
+
+		if (!xen_feature(XENFEAT_auto_translated_physmap))
+			mfn = get_phys_to_machine(pfn);
+		else
+			mfn = pfn;
 		/*
 		 * If there's no mfn for the pfn, then just create an
 		 * empty non-present pte.  Unfortunately this loses
@@ -427,8 +431,18 @@ static pteval_t pte_pfn_to_mfn(pteval_t val)
 		if (unlikely(mfn == INVALID_P2M_ENTRY)) {
 			mfn = 0;
 			flags = 0;
+		} else {
+			/*
+			 * Paramount to do this test _after_ the
+			 * INVALID_P2M_ENTRY as INVALID_P2M_ENTRY &
+			 * IDENTITY_FRAME_BIT resolves to true.
+			 */
+			mfn &= ~FOREIGN_FRAME_BIT;
+			if (mfn & IDENTITY_FRAME_BIT) {
+				mfn &= ~IDENTITY_FRAME_BIT;
+				flags |= _PAGE_IOMAP;
+			}
 		}
-
 		val = ((pteval_t)mfn << PAGE_SHIFT) | flags;
 	}

From c7617798771ad588d585986d896197c04b737621 Mon Sep 17 00:00:00 2001
From: Konrad Rzeszutek Wilk
Date: Tue, 18 Jan 2011 20:17:10 -0500
Subject: [PATCH 669/706] xen/mmu: WARN_ON when racing to swap middle leaf.

The initial bootup code uses set_phys_to_machine quite a lot, and after
bootup it would be used by the balloon driver. The balloon driver does
hold a mutex, so this should not be necessary - but just in case, add a
WARN_ON if we do hit this scenario. If we do fail this, it is OK to
continue as there is a backup mechanism (VM_IO) that can bypass the P2M
and still set the _PAGE_IOMAP flags.

[v2: Change from WARN to BUG_ON]
[v3: Rebased on top of xen->p2m code split]
[v4: Change from BUG_ON to WARN]
Reviewed-by: Ian Campbell
Signed-off-by: Konrad Rzeszutek Wilk
---
 arch/x86/xen/p2m.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/arch/x86/xen/p2m.c b/arch/x86/xen/p2m.c
index 809fe3536301..4631cf99e714 100644
--- a/arch/x86/xen/p2m.c
+++ b/arch/x86/xen/p2m.c
@@ -605,7 +605,8 @@ bool __set_phys_to_machine(unsigned long pfn, unsigned long mfn)
 
 		/* Swap over from MISSING to IDENTITY if needed. */
 		if (p2m_top[topidx][mididx] == p2m_missing) {
-			p2m_top[topidx][mididx] = p2m_identity;
+			WARN_ON(cmpxchg(&p2m_top[topidx][mididx], p2m_missing,
+				p2m_identity) != p2m_missing);
 			return true;
 		}
 	}

From 68df0da7f42be6ae017fe9f48ac414c43a7b9d32 Mon Sep 17 00:00:00 2001
From: Konrad Rzeszutek Wilk
Date: Tue, 1 Feb 2011 17:15:30 -0500
Subject: [PATCH 670/706] xen/setup: Set identity mapping for non-RAM E820 and E820 gaps.

We walk the E820 regions and start at 0 (for PV guests we start at
ISA_END_ADDRESS), skipping any E820 RAM regions. All other regions, as
well as the gaps between them, are set to be identity mappings (a short
illustration of what that means for lookups follows).
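
[A small illustration, not part of this patch: once a range has been
marked identity, the P2M helpers added by the identity-mapping patch
earlier in this series behave as follows for any pfn in the range.]

	/* Sketch only; helpers come from "xen/mmu: Add the notion of
	 * identity (1-1) mapping." above. */
	unsigned long mfn = get_phys_to_machine(pfn);

	BUG_ON(mfn != IDENTITY_FRAME(pfn));	/* pfn | IDENTITY_FRAME_BIT */
	/* pfn_to_mfn() masks IDENTITY_FRAME_BIT back off, so callers
	 * simply observe pfn_to_mfn(pfn) == pfn across the range. */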
The reason we do not want to set the identity mapping from 0 to
ISA_END_ADDRESS when running as PV is that the kernel would try to read
DMI information there and fail (it has no permission to read that
region). There is a lot of gnarly code to deal with that weird region,
so we won't try to clean it up in this patch.

This code ends up calling 'set_phys_range_identity' with the start and
end PFNs of the E820 regions that are non-RAM or are gaps. On 99% of
machines that means one big region right underneath the 4GB mark; it
usually starts at 0xc0000 (or 0x80000) and goes to 0x100000.

[v2: Fix for E820 crossing 1MB region and clamp the start]
[v3: Squished in code that does this over ranges]
[v4: Moved the comment to the correct spot]
[v5: Use the "raw" E820 from the hypervisor]
[v6: Added Review-by tag]
Reviewed-by: Ian Campbell
Signed-off-by: Konrad Rzeszutek Wilk
---
 arch/x86/xen/setup.c | 52 ++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 52 insertions(+)

diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c
index 7201800e55a4..54d93791ddb9 100644
--- a/arch/x86/xen/setup.c
+++ b/arch/x86/xen/setup.c
@@ -143,12 +143,55 @@ static unsigned long __init xen_return_unused_memory(unsigned long max_pfn,
 	return released;
 }
 
+static unsigned long __init xen_set_identity(const struct e820entry *list,
+					     ssize_t map_size)
+{
+	phys_addr_t last = xen_initial_domain() ? 0 : ISA_END_ADDRESS;
+	phys_addr_t start_pci = last;
+	const struct e820entry *entry;
+	unsigned long identity = 0;
+	int i;
+
+	for (i = 0, entry = list; i < map_size; i++, entry++) {
+		phys_addr_t start = entry->addr;
+		phys_addr_t end = start + entry->size;
+
+		if (start < last)
+			start = last;
+
+		if (end <= start)
+			continue;
+
+		/* Skip over the 1MB region. */
+		if (last > end)
+			continue;
+
+		if (entry->type == E820_RAM) {
+			if (start > start_pci)
+				identity += set_phys_range_identity(
+						PFN_UP(start_pci), PFN_DOWN(start));
+
+			/* Without saving 'last' we would gobble RAM too
+			 * at the end of the loop. */
+			last = end;
+			start_pci = end;
+			continue;
+		}
+		start_pci = min(start, start_pci);
+		last = end;
+	}
+	if (last > start_pci)
+		identity += set_phys_range_identity(
+			PFN_UP(start_pci), PFN_DOWN(last));
+	return identity;
+}
+
 /**
  * machine_specific_memory_setup - Hook for machine specific memory setup.
  **/
 char * __init xen_memory_setup(void)
 {
 	static struct e820entry map[E820MAX] __initdata;
+	static struct e820entry map_raw[E820MAX] __initdata;
 
 	unsigned long max_pfn = xen_start_info->nr_pages;
 	unsigned long long mem_end;
@@ -156,6 +199,7 @@ char * __init xen_memory_setup(void)
 	struct xen_memory_map memmap;
 	unsigned long extra_pages = 0;
 	unsigned long extra_limit;
+	unsigned long identity_pages = 0;
 	int i;
 	int op;
 
@@ -181,6 +225,7 @@ char * __init xen_memory_setup(void)
 	}
 	BUG_ON(rc);
 
+	memcpy(map_raw, map, sizeof(map));
 	e820.nr_map = 0;
 	xen_extra_mem_start = mem_end;
 	for (i = 0; i < memmap.nr_entries; i++) {
@@ -251,6 +296,13 @@ char * __init xen_memory_setup(void)
 
 	xen_add_extra_mem(extra_pages);
 
+	/*
+	 * Set P2M for all non-RAM pages and E820 gaps to be identity
+	 * type PFNs. We supply it with the non-sanitized version
+	 * of the E820.
+	 */
+	identity_pages = xen_set_identity(map_raw, memmap.nr_entries);
+	printk(KERN_INFO "Set %ld page(s) to 1-1 mapping.\n", identity_pages);
 	return "Xen";
 }
 

From 2222e71bd6eff7b2ad026d4ee663b6327c5a49f5 Mon Sep 17 00:00:00 2001
From: Konrad Rzeszutek Wilk
Date: Wed, 22 Dec 2010 08:57:30 -0500
Subject: [PATCH 671/706] xen/debugfs: Add 'p2m' file for printing out the P2M layout.

We walk over the whole P2M tree and construct a simplified view of which PFN regions belong to what level and what type they are. Only enabled if CONFIG_XEN_DEBUG_FS is set. [v2: UNKN->UNKNOWN, use uninitialized_var] [v3: Rebased on top of mmu->p2m code split] [v4: Fixed the else if] Reviewed-by: Ian Campbell Signed-off-by: Konrad Rzeszutek Wilk --- arch/x86/include/asm/xen/page.h | 3 ++ arch/x86/xen/mmu.c | 14 ++++++ arch/x86/xen/p2m.c | 78 +++++++++++++++++++++++++++++++++ 3 files changed, 95 insertions(+) diff --git a/arch/x86/include/asm/xen/page.h b/arch/x86/include/asm/xen/page.h index 65fa4f26aa34..78ebbeb88d9c 100644 --- a/arch/x86/include/asm/xen/page.h +++ b/arch/x86/include/asm/xen/page.h @@ -52,6 +52,9 @@ extern int m2p_remove_override(struct page *page); extern struct page *m2p_find_override(unsigned long mfn); extern unsigned long m2p_find_override_pfn(unsigned long mfn, unsigned long pfn); +#ifdef CONFIG_XEN_DEBUG_FS +extern int p2m_dump_show(struct seq_file *m, void *v); +#endif static inline unsigned long pfn_to_mfn(unsigned long pfn) { unsigned long mfn; diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c index 9c9e07615139..b13b6ca9052a 100644 --- a/arch/x86/xen/mmu.c +++ b/arch/x86/xen/mmu.c @@ -46,6 +46,7 @@ #include #include #include +#include #include #include @@ -2367,6 +2368,18 @@ EXPORT_SYMBOL_GPL(xen_remap_domain_mfn_range); #ifdef CONFIG_XEN_DEBUG_FS +static int p2m_dump_open(struct inode *inode, struct file *filp) +{ + return single_open(filp, p2m_dump_show, NULL); +} + +static const struct file_operations p2m_dump_fops = { + .open = p2m_dump_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + static struct dentry *d_mmu_debug; static int __init xen_mmu_debugfs(void) @@ -2422,6 +2435,7 @@ static int __init xen_mmu_debugfs(void) debugfs_create_u32("prot_commit_batched", 0444, d_mmu_debug, &mmu_stats.prot_commit_batched); + debugfs_create_file("p2m", 0600, d_mmu_debug, NULL, &p2m_dump_fops); return 0; } fs_initcall(xen_mmu_debugfs); diff --git a/arch/x86/xen/p2m.c b/arch/x86/xen/p2m.c index 4631cf99e714..65f21f4b3962 100644 --- a/arch/x86/xen/p2m.c +++ b/arch/x86/xen/p2m.c @@ -153,6 +153,7 @@ #include #include #include +#include #include #include @@ -758,3 +759,80 @@ unsigned long m2p_find_override_pfn(unsigned long mfn, unsigned long pfn) return ret; } EXPORT_SYMBOL_GPL(m2p_find_override_pfn); + +#ifdef CONFIG_XEN_DEBUG_FS + +int p2m_dump_show(struct seq_file *m, void *v) +{ + static const char * const level_name[] = { "top", "middle", + "entry", "abnormal" }; + static const char * const type_name[] = { "identity", "missing", + "pfn", "abnormal"}; +#define TYPE_IDENTITY 0 +#define TYPE_MISSING 1 +#define TYPE_PFN 2 +#define TYPE_UNKNOWN 3 + unsigned long pfn, prev_pfn_type = 0, prev_pfn_level = 0; + unsigned int uninitialized_var(prev_level); + unsigned int uninitialized_var(prev_type); + + if (!p2m_top) + return 0; + + for (pfn = 0; pfn < MAX_DOMAIN_PAGES; pfn++) { + unsigned topidx = p2m_top_index(pfn); + unsigned mididx = p2m_mid_index(pfn); + unsigned idx = p2m_index(pfn); + unsigned lvl, type; + + lvl = 4; + type = TYPE_UNKNOWN; + if (p2m_top[topidx] == p2m_mid_missing) { + lvl = 0; type = TYPE_MISSING; + } else if (p2m_top[topidx] == NULL) { + lvl = 0; type = TYPE_UNKNOWN; + } else if (p2m_top[topidx][mididx] == NULL) { + lvl = 1; type = TYPE_UNKNOWN; + } else if (p2m_top[topidx][mididx] == p2m_identity) { + lvl = 1; type = TYPE_IDENTITY; + } else if (p2m_top[topidx][mididx] == p2m_missing) { + lvl = 1; type = TYPE_MISSING; 
+ } else if (p2m_top[topidx][mididx][idx] == 0) { + lvl = 2; type = TYPE_UNKNOWN; + } else if (p2m_top[topidx][mididx][idx] == IDENTITY_FRAME(pfn)) { + lvl = 2; type = TYPE_IDENTITY; + } else if (p2m_top[topidx][mididx][idx] == INVALID_P2M_ENTRY) { + lvl = 2; type = TYPE_MISSING; + } else if (p2m_top[topidx][mididx][idx] == pfn) { + lvl = 2; type = TYPE_PFN; + } else if (p2m_top[topidx][mididx][idx] != pfn) { + lvl = 2; type = TYPE_PFN; + } + if (pfn == 0) { + prev_level = lvl; + prev_type = type; + } + if (pfn == MAX_DOMAIN_PAGES-1) { + lvl = 3; + type = TYPE_UNKNOWN; + } + if (prev_type != type) { + seq_printf(m, " [0x%lx->0x%lx] %s\n", + prev_pfn_type, pfn, type_name[prev_type]); + prev_pfn_type = pfn; + prev_type = type; + } + if (prev_level != lvl) { + seq_printf(m, " [0x%lx->0x%lx] level %s\n", + prev_pfn_level, pfn, level_name[prev_level]); + prev_pfn_level = pfn; + prev_level = lvl; + } + } + return 0; +#undef TYPE_IDENTITY +#undef TYPE_MISSING +#undef TYPE_PFN +#undef TYPE_UNKNOWN +} +#endif From fc25151d9ac7d809239fe68de0a1490b504bb94a Mon Sep 17 00:00:00 2001 From: Konrad Rzeszutek Wilk Date: Thu, 23 Dec 2010 16:25:29 -0500 Subject: [PATCH 672/706] xen/debug: WARN_ON when identity PFN has no _PAGE_IOMAP flag set. Only enabled if XEN_DEBUG is enabled. We print a warning when: pfn_to_mfn(pfn) == pfn, but no VM_IO (_PAGE_IOMAP) flag set (and pfn is an identity mapped pfn) pfn_to_mfn(pfn) != pfn, and VM_IO flag is set. (ditto, pfn is an identity mapped pfn) [v2: Make it dependent on CONFIG_XEN_DEBUG instead of ..DEBUG_FS] [v3: Fix compiler warning] Reviewed-by: Ian Campbell Signed-off-by: Konrad Rzeszutek Wilk --- arch/x86/xen/Kconfig | 8 ++++++++ arch/x86/xen/mmu.c | 38 ++++++++++++++++++++++++++++++++++++++ 2 files changed, 46 insertions(+) diff --git a/arch/x86/xen/Kconfig b/arch/x86/xen/Kconfig index 5b54892e4bc3..e4343fe488ed 100644 --- a/arch/x86/xen/Kconfig +++ b/arch/x86/xen/Kconfig @@ -48,3 +48,11 @@ config XEN_DEBUG_FS help Enable statistics output and various tuning options in debugfs. Enabling this option may incur a significant performance overhead. + +config XEN_DEBUG + bool "Enable Xen debug checks" + depends on XEN + default n + help + Enable various WARN_ON checks in the Xen MMU code. + Enabling this option WILL incur a significant performance overhead. 
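For reference, the debugfs file added by the previous patch is wired up with the standard seq_file single_open() pattern; a minimal self-contained sketch of that pattern (names here are illustrative, not taken from the patch):

#include <linux/debugfs.h>
#include <linux/seq_file.h>

/* the show() callback emits the whole file contents in one pass */
static int example_show(struct seq_file *m, void *v)
{
	seq_printf(m, "example\n");
	return 0;
}

static int example_open(struct inode *inode, struct file *filp)
{
	return single_open(filp, example_show, NULL);
}

static const struct file_operations example_fops = {
	.open		= example_open,
	.read		= seq_read,
	.llseek		= seq_lseek,
	.release	= single_release,
};

/* registration under an existing debugfs directory, e.g.:
 * debugfs_create_file("example", 0400, parent, NULL, &example_fops);
 */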
diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c index b13b6ca9052a..0c376a2d9f98 100644 --- a/arch/x86/xen/mmu.c +++ b/arch/x86/xen/mmu.c @@ -547,6 +547,41 @@ pte_t xen_make_pte(pteval_t pte) } PV_CALLEE_SAVE_REGS_THUNK(xen_make_pte); +#ifdef CONFIG_XEN_DEBUG +pte_t xen_make_pte_debug(pteval_t pte) +{ + phys_addr_t addr = (pte & PTE_PFN_MASK); + phys_addr_t other_addr; + bool io_page = false; + pte_t _pte; + + if (pte & _PAGE_IOMAP) + io_page = true; + + _pte = xen_make_pte(pte); + + if (!addr) + return _pte; + + if (io_page && + (xen_initial_domain() || addr >= ISA_END_ADDRESS)) { + other_addr = pfn_to_mfn(addr >> PAGE_SHIFT) << PAGE_SHIFT; + WARN(addr != other_addr, + "0x%lx is using VM_IO, but it is 0x%lx!\n", + (unsigned long)addr, (unsigned long)other_addr); + } else { + pteval_t iomap_set = (_pte.pte & PTE_FLAGS_MASK) & _PAGE_IOMAP; + other_addr = (_pte.pte & PTE_PFN_MASK); + WARN((addr == other_addr) && (!io_page) && (!iomap_set), + "0x%lx is missing VM_IO (and wasn't fixed)!\n", + (unsigned long)addr); + } + + return _pte; +} +PV_CALLEE_SAVE_REGS_THUNK(xen_make_pte_debug); +#endif + pgd_t xen_make_pgd(pgdval_t pgd) { pgd = pte_pfn_to_mfn(pgd); @@ -1957,6 +1992,9 @@ __init void xen_ident_map_ISA(void) static __init void xen_post_allocator_init(void) { +#ifdef CONFIG_XEN_DEBUG + pv_mmu_ops.make_pte = PV_CALLEE_SAVE(xen_make_pte_debug); +#endif pv_mmu_ops.set_pte = xen_set_pte; pv_mmu_ops.set_pmd = xen_set_pmd; pv_mmu_ops.set_pud = xen_set_pud; From 146c4e511717e581065800938537b276173d8548 Mon Sep 17 00:00:00 2001 From: Konrad Rzeszutek Wilk Date: Fri, 14 Jan 2011 17:55:44 -0500 Subject: [PATCH 673/706] xen/m2p: No need to catch exceptions when we know that there is no RAM .. beyond what we think is the end of memory. However there might be more System RAM - but assigned to a guest. Hence jump to the M2P override check and consult it. [v1: Added Review-by tag] Reviewed-by: Ian Campbell Signed-off-by: Konrad Rzeszutek Wilk --- arch/x86/include/asm/xen/page.h | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/arch/x86/include/asm/xen/page.h b/arch/x86/include/asm/xen/page.h index 78ebbeb88d9c..195707060493 100644 --- a/arch/x86/include/asm/xen/page.h +++ b/arch/x86/include/asm/xen/page.h @@ -85,6 +85,10 @@ static inline unsigned long mfn_to_pfn(unsigned long mfn) if (xen_feature(XENFEAT_auto_translated_physmap)) return mfn; + if (unlikely((mfn >> machine_to_phys_order) != 0)) { + pfn = ~0; + goto try_override; + } pfn = 0; /* * The array access can fail (e.g., device space beyond end of RAM). @@ -92,7 +96,7 @@ * but we must handle the fault without crashing! */ __get_user(pfn, &machine_to_phys_mapping[mfn]); - +try_override: /* * If this appears to be a foreign mfn (because the pfn * doesn't map back to the mfn), then check the local override From 706cc9d2a4cb9b03217e15b0bb3d117f4d5109ee Mon Sep 17 00:00:00 2001 From: Stefano Stabellini Date: Wed, 2 Feb 2011 18:32:59 +0000 Subject: [PATCH 674/706] xen/m2p: Check whether the MFN has the IDENTITY_FRAME bit set.. If there is no proper PFN value in the M2P for the MFN (so we get 0xFFFFF.. or 0x55555, or 0x0), we should consult the M2P override to see if there is an entry for this. [Note: we also consult the M2P override if the MFN is past our machine_to_phys size]. We consult the P2M with the PFN.
In case the returned MFN is one of the special values: 0xFFF.., 0x5555 (which signify that the MFN can either be "missing" or belong to DOMID_IO) or the p2m(m2p(mfn)) != mfn, we check the M2P override. If we fail the M2P override check, we reset the PFN value to INVALID_P2M_ENTRY. Next we try to find the MFN in the P2M using the MFN value (not the PFN value) and if found, we know that this MFN is an identity value and return it as such. Otherwise we have exhausted all the possibilities and we return the PFN, which at this stage can either be a real PFN value found in the machine_to_phys.. array, or the INVALID_P2M_ENTRY value. [v1: Added Review-by tag] Reviewed-by: Ian Campbell Signed-off-by: Konrad Rzeszutek Wilk --- arch/x86/include/asm/xen/page.h | 29 ++++++++++++++++++++++------- 1 file changed, 22 insertions(+), 7 deletions(-) diff --git a/arch/x86/include/asm/xen/page.h b/arch/x86/include/asm/xen/page.h index 195707060493..c61934fbf22a 100644 --- a/arch/x86/include/asm/xen/page.h +++ b/arch/x86/include/asm/xen/page.h @@ -81,6 +81,7 @@ static inline int phys_to_machine_mapping_valid(unsigned long pfn) static inline unsigned long mfn_to_pfn(unsigned long mfn) { unsigned long pfn; + int ret = 0; if (xen_feature(XENFEAT_auto_translated_physmap)) return mfn; @@ -95,15 +96,29 @@ static inline unsigned long mfn_to_pfn(unsigned long mfn) * In such cases it doesn't matter what we return (we return garbage), * but we must handle the fault without crashing! */ - __get_user(pfn, &machine_to_phys_mapping[mfn]); + ret = __get_user(pfn, &machine_to_phys_mapping[mfn]); try_override: - /* - * If this appears to be a foreign mfn (because the pfn - * doesn't map back to the mfn), then check the local override - * table to see if there's a better pfn to use. + /* ret might be < 0 if there are no entries in the m2p for mfn */ + if (ret < 0) + pfn = ~0; + else if (get_phys_to_machine(pfn) != mfn) + /* + * If this appears to be a foreign mfn (because the pfn + * doesn't map back to the mfn), then check the local override + * table to see if there's a better pfn to use. + * + * m2p_find_override_pfn returns ~0 if it doesn't find anything. + */ + pfn = m2p_find_override_pfn(mfn, ~0); + + /* + * pfn is ~0 if there are no entries in the m2p for mfn or if the + * entry doesn't map back to the mfn and m2p_override doesn't have a + * valid entry for it. */ - if (get_phys_to_machine(pfn) != mfn) - pfn = m2p_find_override_pfn(mfn, pfn); + if (pfn == ~0 && + get_phys_to_machine(mfn) == IDENTITY_FRAME(mfn)) + pfn = mfn; return pfn; } From 6e0aa9f8a8190e0879a29bd67aa606b51734a122 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 14 Mar 2011 10:34:35 +0100 Subject: [PATCH 675/706] futex: Deobfuscate handle_futex_death() handle_futex_death() uses futex_atomic_cmpxchg_inatomic() without disabling page faults. That's ok, but totally non-obvious. We don't hold locks so we actually can and want to fault here, because the get_user() before futex_atomic_cmpxchg_inatomic() does not guarantee a R/W mapping. We could just add a big fat comment to explain this, but actually changing the code so that the functionality is entirely clear is better. Use the helper function which disables page faults around the futex_atomic_cmpxchg_inatomic() and handle a fault with a call to fault_in_user_writeable() as all other places in the futex code do as well.
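Schematically, such a helper brackets the atomic access with pagefault_disable()/pagefault_enable(), so a missing mapping surfaces as an error code instead of a sleeping fault; a sketch of the pattern (not the exact kernel implementation):

static int cmpxchg_futex_value_locked(u32 *curval, u32 __user *uaddr,
				      u32 uval, u32 newval)
{
	int ret;

	pagefault_disable();	/* a fault now fails fast instead of sleeping */
	ret = futex_atomic_cmpxchg_inatomic(curval, uaddr, uval, newval);
	pagefault_enable();

	return ret;
}

On failure the caller faults the page in writably via fault_in_user_writeable() and retries, as the hunk below does.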
Pointed-out-by: Linus Torvalds Signed-off-by: Thomas Gleixner Acked-by: Darren Hart Cc: Michel Lespinasse Cc: Peter Zijlstra Cc: Matt Turner Cc: Russell King Cc: David Howells Cc: Tony Luck Cc: Michal Simek Cc: Ralf Baechle Cc: "James E.J. Bottomley" Cc: Benjamin Herrenschmidt Cc: Martin Schwidefsky Cc: Paul Mundt Cc: "David S. Miller" Cc: Chris Metcalf LKML-Reference: Signed-off-by: Thomas Gleixner --- kernel/futex.c | 17 ++++++++++++++--- 1 file changed, 14 insertions(+), 3 deletions(-) diff --git a/kernel/futex.c b/kernel/futex.c index c6bef6e404fe..e9251d934f7d 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -2458,9 +2458,20 @@ retry: * userspace. */ mval = (uval & FUTEX_WAITERS) | FUTEX_OWNER_DIED; - if (futex_atomic_cmpxchg_inatomic(&nval, uaddr, uval, mval)) - return -1; - + /* + * We are not holding a lock here, but we want to have + * the pagefault_disable/enable() protection because + * we want to handle the fault gracefully. If the + * access fails we try to fault in the futex with R/W + * verification via get_user_pages. get_user() above + * does not guarantee R/W access. If that fails we + * give up and leave the futex locked. + */ + if (cmpxchg_futex_value_locked(&nval, uaddr, uval, mval)) { + if (fault_in_user_writeable(uaddr)) + return -1; + goto retry; + } if (nval != uval) goto retry; From 07d5ecae2940ddd77746e2fb597dcf57d3c2e277 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 14 Mar 2011 20:00:30 +0100 Subject: [PATCH 676/706] arm: Remove bogus comment in futex_atomic_cmpxchg_inatomic() commit 522d7dec(futex: Remove redundant pagefault_disable in futex_atomic_cmpxchg_inatomic()) added a bogus comment. /* Note that preemption is disabled by futex_atomic_cmpxchg_inatomic * call sites. */ Bogus in two aspects: 1) pagefault_disable != preempt_disable even if the mechanism we use is the same 2) we have a call site which deliberately does not disable pagefaults as it wants the possible fault to be handled - though that has been changed for consistency reasons now. Sigh. I really should have seen that when committing the above. :( Catched-by-and-rightfully-ranted-at-by: Linus Torvalds Signed-off-by: Thomas Gleixner LKML-Reference: Cc: Michel Lespinasse Cc: Darren Hart --- arch/arm/include/asm/futex.h | 3 --- 1 file changed, 3 deletions(-) diff --git a/arch/arm/include/asm/futex.h b/arch/arm/include/asm/futex.h index 0e29d8e6a5c2..199a6b6de7f4 100644 --- a/arch/arm/include/asm/futex.h +++ b/arch/arm/include/asm/futex.h @@ -97,9 +97,6 @@ futex_atomic_cmpxchg_inatomic(u32 *uval, u32 __user *uaddr, if (!access_ok(VERIFY_WRITE, uaddr, sizeof(u32))) return -EFAULT; - /* Note that preemption is disabled by futex_atomic_cmpxchg_inatomic - * call sites. */ - __asm__ __volatile__("@futex_atomic_cmpxchg_inatomic\n" "1: " T(ldr) " %1, [%4]\n" " teq %1, %2\n" From f52e0c11305aa09ed56cad97ffc8f0cdc3d78b5d Mon Sep 17 00:00:00 2001 From: Al Viro Date: Mon, 14 Mar 2011 18:56:51 -0400 Subject: [PATCH 677/706] New AT_... flag: AT_EMPTY_PATH For name_to_handle_at(2) we'll want both ...at()-style syscall that would be usable for non-directory descriptors (with empty relative pathname). Introduce new flag (AT_EMPTY_PATH) to deal with that and corresponding LOOKUP_EMPTY; teach user_path_at() and path_init() to deal with the latter. 
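From userspace this lets an ...at() call operate on the object a descriptor already refers to; a hypothetical example, relying on the fstatat() support that a later patch in this series wires up:

#include <fcntl.h>
#include <sys/stat.h>

/* stat whatever 'fd' refers to; "" is only accepted with AT_EMPTY_PATH */
static int stat_by_fd(int fd, struct stat *st)
{
	return fstatat(fd, "", st, AT_EMPTY_PATH);
}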
Signed-off-by: Al Viro --- fs/namei.c | 29 +++++++++++++++++++---------- include/linux/fcntl.h | 1 + include/linux/namei.h | 1 + 3 files changed, 21 insertions(+), 10 deletions(-) diff --git a/fs/namei.c b/fs/namei.c index abc8d2df121c..83e92bab79a6 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -136,7 +136,7 @@ static int do_getname(const char __user *filename, char *page) return retval; } -char * getname(const char __user * filename) +static char *getname_flags(const char __user * filename, int flags) { char *tmp, *result; @@ -147,14 +147,21 @@ char * getname(const char __user * filename) result = tmp; if (retval < 0) { - __putname(tmp); - result = ERR_PTR(retval); + if (retval != -ENOENT || !(flags & LOOKUP_EMPTY)) { + __putname(tmp); + result = ERR_PTR(retval); + } } } audit_getname(result); return result; } +char *getname(const char __user * filename) +{ + return getname_flags(filename, 0); +} + #ifdef CONFIG_AUDITSYSCALL void putname(const char *name) { @@ -1544,13 +1551,15 @@ static int path_init(int dfd, const char *name, unsigned int flags, dentry = file->f_path.dentry; - retval = -ENOTDIR; - if (!S_ISDIR(dentry->d_inode->i_mode)) - goto fput_fail; + if (*name) { + retval = -ENOTDIR; + if (!S_ISDIR(dentry->d_inode->i_mode)) + goto fput_fail; - retval = file_permission(file, MAY_EXEC); - if (retval) - goto fput_fail; + retval = file_permission(file, MAY_EXEC); + if (retval) + goto fput_fail; + } nd->path = file->f_path; if (flags & LOOKUP_RCU) { @@ -1759,7 +1768,7 @@ int user_path_at(int dfd, const char __user *name, unsigned flags, struct path *path) { struct nameidata nd; - char *tmp = getname(name); + char *tmp = getname_flags(name, flags); int err = PTR_ERR(tmp); if (!IS_ERR(tmp)) { diff --git a/include/linux/fcntl.h b/include/linux/fcntl.h index a562fa5fb4e3..f550f894ba15 100644 --- a/include/linux/fcntl.h +++ b/include/linux/fcntl.h @@ -46,6 +46,7 @@ unlinking file. */ #define AT_SYMLINK_FOLLOW 0x400 /* Follow symbolic links. 
*/ #define AT_NO_AUTOMOUNT 0x800 /* Suppress terminal automount traversal */ +#define AT_EMPTY_PATH 0x1000 /* Allow empty relative pathname */ #ifdef __KERNEL__ diff --git a/include/linux/namei.h b/include/linux/namei.h index 83cd6e5cd7dc..9c8603872c36 100644 --- a/include/linux/namei.h +++ b/include/linux/namei.h @@ -64,6 +64,7 @@ enum {LAST_NORM, LAST_ROOT, LAST_DOT, LAST_DOTDOT, LAST_BIND}; #define LOOKUP_JUMPED 0x1000 #define LOOKUP_ROOT 0x2000 +#define LOOKUP_EMPTY 0x4000 extern int user_path_at(int, const char __user *, unsigned, struct path *); From 990d6c2d7aee921e3bce22b2d6a750fd552262be Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Sat, 29 Jan 2011 18:43:26 +0530 Subject: [PATCH 678/706] vfs: Add name to file handle conversion support The syscall also returns a mount id, which can be used to look up file system specific information such as the uuid in /proc//mountinfo Signed-off-by: Aneesh Kumar K.V Signed-off-by: Al Viro --- fs/Kconfig | 2 +- fs/Makefile | 2 + fs/fhandle.c | 107 +++++++++++++++++++++++++++++++++++++++ include/linux/exportfs.h | 3 ++ include/linux/fs.h | 7 +++ include/linux/syscalls.h | 5 +- init/Kconfig | 12 +++++ kernel/sys_ni.c | 3 ++ 8 files changed, 139 insertions(+), 2 deletions(-) create mode 100644 fs/fhandle.c diff --git a/fs/Kconfig b/fs/Kconfig index 3db9caa57edc..7cb53aafac1e 100644 --- a/fs/Kconfig +++ b/fs/Kconfig @@ -47,7 +47,7 @@ config FS_POSIX_ACL def_bool n config EXPORTFS - tristate + bool config FILE_LOCKING bool "Enable POSIX file locking API" if EXPERT diff --git a/fs/Makefile b/fs/Makefile index a7f7cef0c0c8..ba01202844c5 100644 --- a/fs/Makefile +++ b/fs/Makefile @@ -48,6 +48,8 @@ obj-$(CONFIG_FS_POSIX_ACL) += posix_acl.o xattr_acl.o obj-$(CONFIG_NFS_COMMON) += nfs_common/ obj-$(CONFIG_GENERIC_ACL) += generic_acl.o +obj-$(CONFIG_FHANDLE) += fhandle.o + obj-y += quota/ obj-$(CONFIG_PROC_FS) += proc/ diff --git a/fs/fhandle.c b/fs/fhandle.c new file mode 100644 index 000000000000..9f79e743a840 --- /dev/null +++ b/fs/fhandle.c @@ -0,0 +1,107 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include "internal.h" + +static long do_sys_name_to_handle(struct path *path, + struct file_handle __user *ufh, + int __user *mnt_id) +{ + long retval; + struct file_handle f_handle; + int handle_dwords, handle_bytes; + struct file_handle *handle = NULL; + + /* + * We need to make sure whether the file system + * supports decoding of the file handle + */ + if (!path->mnt->mnt_sb->s_export_op || + !path->mnt->mnt_sb->s_export_op->fh_to_dentry) + return -EOPNOTSUPP; + + if (copy_from_user(&f_handle, ufh, sizeof(struct file_handle))) + return -EFAULT; + + if (f_handle.handle_bytes > MAX_HANDLE_SZ) + return -EINVAL; + + handle = kmalloc(sizeof(struct file_handle) + f_handle.handle_bytes, + GFP_KERNEL); + if (!handle) + return -ENOMEM; + + /* convert handle size to multiple of sizeof(u32) */ + handle_dwords = f_handle.handle_bytes >> 2; + + /* we ask for a non connected handle */ + retval = exportfs_encode_fh(path->dentry, + (struct fid *)handle->f_handle, + &handle_dwords, 0); + handle->handle_type = retval; + /* convert handle size to bytes */ + handle_bytes = handle_dwords * sizeof(u32); + handle->handle_bytes = handle_bytes; + if ((handle->handle_bytes > f_handle.handle_bytes) || + (retval == 255) || (retval == -ENOSPC)) { + /* As per old exportfs_encode_fh documentation + * we could return ENOSPC to indicate overflow + * But file system returned 255 always.
So handle + * both the values + */ + /* + * set the handle size to zero so we copy only + * non variable part of the file_handle + */ + handle_bytes = 0; + retval = -EOVERFLOW; + } else + retval = 0; + /* copy the mount id */ + if (copy_to_user(mnt_id, &path->mnt->mnt_id, sizeof(*mnt_id)) || + copy_to_user(ufh, handle, + sizeof(struct file_handle) + handle_bytes)) + retval = -EFAULT; + kfree(handle); + return retval; +} + +/** + * sys_name_to_handle_at: convert name to handle + * @dfd: directory relative to which name is interpreted if not absolute + * @name: name that should be converted to handle. + * @handle: resulting file handle + * @mnt_id: mount id of the file system containing the file + * @flag: flag value to indicate whether to follow symlink or not + * + * @handle->handle_size indicate the space available to store the + * variable part of the file handle in bytes. If there is not + * enough space, the field is updated to return the minimum + * value required. + */ +SYSCALL_DEFINE5(name_to_handle_at, int, dfd, const char __user *, name, + struct file_handle __user *, handle, int __user *, mnt_id, + int, flag) +{ + struct path path; + int lookup_flags; + int err; + + if ((flag & ~(AT_SYMLINK_FOLLOW | AT_EMPTY_PATH)) != 0) + return -EINVAL; + + lookup_flags = (flag & AT_SYMLINK_FOLLOW) ? LOOKUP_FOLLOW : 0; + if (flag & AT_EMPTY_PATH) + lookup_flags |= LOOKUP_EMPTY; + err = user_path_at(dfd, name, lookup_flags, &path); + if (!err) { + err = do_sys_name_to_handle(&path, handle, mnt_id); + path_put(&path); + } + return err; +} diff --git a/include/linux/exportfs.h b/include/linux/exportfs.h index 65afdfd31b7b..33a42f24b275 100644 --- a/include/linux/exportfs.h +++ b/include/linux/exportfs.h @@ -8,6 +8,9 @@ struct inode; struct super_block; struct vfsmount; +/* limit the handle size to NFSv4 handle size now */ +#define MAX_HANDLE_SZ 128 + /* * The fileid_type identifies how the file within the filesystem is encoded. * In theory this is freely set and parsed by the filesystem, but we try to diff --git a/include/linux/fs.h b/include/linux/fs.h index b7178b05cf3a..3f64630c0e10 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -978,6 +978,13 @@ struct file { #endif }; +struct file_handle { + __u32 handle_bytes; + int handle_type; + /* file identifier */ + unsigned char f_handle[0]; +}; + #define get_file(x) atomic_long_inc(&(x)->f_count) #define fput_atomic(x) atomic_long_add_unless(&(x)->f_count, -1, 1) #define file_count(x) atomic_long_read(&(x)->f_count) diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h index 98664db1be47..970112613fb4 100644 --- a/include/linux/syscalls.h +++ b/include/linux/syscalls.h @@ -62,6 +62,7 @@ struct robust_list_head; struct getcpu_cache; struct old_linux_dirent; struct perf_event_attr; +struct file_handle; #include #include @@ -832,5 +833,7 @@ asmlinkage long sys_mmap_pgoff(unsigned long addr, unsigned long len, unsigned long prot, unsigned long flags, unsigned long fd, unsigned long pgoff); asmlinkage long sys_old_mmap(struct mmap_arg_struct __user *arg); - +asmlinkage long sys_name_to_handle_at(int dfd, const char __user *name, + struct file_handle __user *handle, + int __user *mnt_id, int flag); #endif diff --git a/init/Kconfig b/init/Kconfig index be788c0957d4..e72fa17fe559 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -287,6 +287,18 @@ config BSD_PROCESS_ACCT_V3 for processing it. A preliminary version of these tools is available at . 
+config FHANDLE + bool "open by fhandle syscalls" + select EXPORTFS + help + If you say Y here, a user level program will be able to map + file names to handle and then later use the handle for + different file system operations. This is useful in implementing + userspace file servers, which now track files using handles instead + of names. The handle would remain the same even if file names + get renamed. Enables open_by_handle_at(2) and name_to_handle_at(2) + syscalls. + config TASKSTATS bool "Export task/process statistics through netlink (EXPERIMENTAL)" depends on NET diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c index c782fe9924c7..4e013439ac28 100644 --- a/kernel/sys_ni.c +++ b/kernel/sys_ni.c @@ -186,3 +186,6 @@ cond_syscall(sys_perf_event_open); /* fanotify! */ cond_syscall(sys_fanotify_init); cond_syscall(sys_fanotify_mark); + +/* open by handle */ +cond_syscall(sys_name_to_handle_at); From becfd1f37544798cbdfd788f32c827160fab98c1 Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Sat, 29 Jan 2011 18:43:26 +0530 Subject: [PATCH 679/706] vfs: Add open by file handle support [AV: duplicate of open() guts removed; file_open_root() used instead] Signed-off-by: Aneesh Kumar K.V Signed-off-by: Al Viro --- fs/compat.c | 13 ++++ fs/exportfs/expfs.c | 2 + fs/fhandle.c | 158 +++++++++++++++++++++++++++++++++++++++ fs/internal.h | 3 + include/linux/syscalls.h | 3 + kernel/sys_ni.c | 2 + 6 files changed, 181 insertions(+) diff --git a/fs/compat.c b/fs/compat.c index a071775f3bb3..c6d31a3bab88 100644 --- a/fs/compat.c +++ b/fs/compat.c @@ -2284,3 +2284,16 @@ asmlinkage long compat_sys_timerfd_gettime(int ufd, } #endif /* CONFIG_TIMERFD */ + +#ifdef CONFIG_FHANDLE +/* + * Exactly like fs/open.c:sys_open_by_handle_at(), except that it + * doesn't set the O_LARGEFILE flag. + */ +asmlinkage long +compat_sys_open_by_handle_at(int mountdirfd, + struct file_handle __user *handle, int flags) +{ + return do_handle_open(mountdirfd, handle, flags); +} +#endif diff --git a/fs/exportfs/expfs.c b/fs/exportfs/expfs.c index cfe55731b6dc..b05acb796135 100644 --- a/fs/exportfs/expfs.c +++ b/fs/exportfs/expfs.c @@ -374,6 +374,8 @@ struct dentry *exportfs_decode_fh(struct vfsmount *mnt, struct fid *fid, /* * Try to get any dentry for the given file handle from the filesystem. 
*/ + if (!nop || !nop->fh_to_dentry) + return ERR_PTR(-ESTALE); result = nop->fh_to_dentry(mnt->mnt_sb, fid, fh_len, fileid_type); if (!result) result = ERR_PTR(-ESTALE); diff --git a/fs/fhandle.c b/fs/fhandle.c index 9f79e743a840..bf93ad2bee07 100644 --- a/fs/fhandle.c +++ b/fs/fhandle.c @@ -5,6 +5,8 @@ #include #include #include +#include +#include #include #include "internal.h" @@ -105,3 +107,159 @@ SYSCALL_DEFINE5(name_to_handle_at, int, dfd, const char __user *, name, } return err; } + +static struct vfsmount *get_vfsmount_from_fd(int fd) +{ + struct path path; + + if (fd == AT_FDCWD) { + struct fs_struct *fs = current->fs; + spin_lock(&fs->lock); + path = fs->pwd; + mntget(path.mnt); + spin_unlock(&fs->lock); + } else { + int fput_needed; + struct file *file = fget_light(fd, &fput_needed); + if (!file) + return ERR_PTR(-EBADF); + path = file->f_path; + mntget(path.mnt); + fput_light(file, fput_needed); + } + return path.mnt; +} + +static int vfs_dentry_acceptable(void *context, struct dentry *dentry) +{ + return 1; +} + +static int do_handle_to_path(int mountdirfd, struct file_handle *handle, + struct path *path) +{ + int retval = 0; + int handle_dwords; + + path->mnt = get_vfsmount_from_fd(mountdirfd); + if (IS_ERR(path->mnt)) { + retval = PTR_ERR(path->mnt); + goto out_err; + } + /* change the handle size to multiple of sizeof(u32) */ + handle_dwords = handle->handle_bytes >> 2; + path->dentry = exportfs_decode_fh(path->mnt, + (struct fid *)handle->f_handle, + handle_dwords, handle->handle_type, + vfs_dentry_acceptable, NULL); + if (IS_ERR(path->dentry)) { + retval = PTR_ERR(path->dentry); + goto out_mnt; + } + return 0; +out_mnt: + mntput(path->mnt); +out_err: + return retval; +} + +static int handle_to_path(int mountdirfd, struct file_handle __user *ufh, + struct path *path) +{ + int retval = 0; + struct file_handle f_handle; + struct file_handle *handle = NULL; + + /* + * With handle we don't look at the execute bit on the + * the directory. Ideally we would like CAP_DAC_SEARCH. + * But we don't have that + */ + if (!capable(CAP_DAC_READ_SEARCH)) { + retval = -EPERM; + goto out_err; + } + if (copy_from_user(&f_handle, ufh, sizeof(struct file_handle))) { + retval = -EFAULT; + goto out_err; + } + if ((f_handle.handle_bytes > MAX_HANDLE_SZ) || + (f_handle.handle_bytes == 0)) { + retval = -EINVAL; + goto out_err; + } + handle = kmalloc(sizeof(struct file_handle) + f_handle.handle_bytes, + GFP_KERNEL); + if (!handle) { + retval = -ENOMEM; + goto out_err; + } + /* copy the full handle */ + if (copy_from_user(handle, ufh, + sizeof(struct file_handle) + + f_handle.handle_bytes)) { + retval = -EFAULT; + goto out_handle; + } + + retval = do_handle_to_path(mountdirfd, handle, path); + +out_handle: + kfree(handle); +out_err: + return retval; +} + +long do_handle_open(int mountdirfd, + struct file_handle __user *ufh, int open_flag) +{ + long retval = 0; + struct path path; + struct file *file; + int fd; + + retval = handle_to_path(mountdirfd, ufh, &path); + if (retval) + return retval; + + fd = get_unused_fd_flags(open_flag); + if (fd < 0) { + path_put(&path); + return fd; + } + file = file_open_root(path.dentry, path.mnt, "", open_flag); + if (IS_ERR(file)) { + put_unused_fd(fd); + retval = PTR_ERR(file); + } else { + retval = fd; + fsnotify_open(file); + fd_install(fd, file); + } + path_put(&path); + return retval; +} + +/** + * sys_open_by_handle_at: Open the file handle + * @mountdirfd: directory file descriptor + * @handle: file handle to be opened + * @flag: open flags. 
+ * + * @mountdirfd indicates the directory file descriptor + * of the mount point. The file handle is decoded relative + * to the vfsmount pointed to by @mountdirfd. The @flags + * value is the same as the open(2) flags. + */ +SYSCALL_DEFINE3(open_by_handle_at, int, mountdirfd, + struct file_handle __user *, handle, + int, flags) +{ + long ret; + + if (force_o_largefile()) + flags |= O_LARGEFILE; + + ret = do_handle_open(mountdirfd, handle, flags); + return ret; +} diff --git a/fs/internal.h b/fs/internal.h index 52abc5287f50..f3d15de44b15 100644 --- a/fs/internal.h +++ b/fs/internal.h @@ -117,6 +117,9 @@ extern struct file *do_filp_open(int dfd, const char *pathname, extern struct file *do_file_open_root(struct dentry *, struct vfsmount *, const char *, const struct open_flags *, int lookup_flags); +extern long do_handle_open(int mountdirfd, + struct file_handle __user *ufh, int open_flag); + /* * inode.c */ diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h index 970112613fb4..2d9b79c0f224 100644 --- a/include/linux/syscalls.h +++ b/include/linux/syscalls.h @@ -836,4 +836,7 @@ asmlinkage long sys_old_mmap(struct mmap_arg_struct __user *arg); asmlinkage long sys_name_to_handle_at(int dfd, const char __user *name, struct file_handle __user *handle, int __user *mnt_id, int flag); +asmlinkage long sys_open_by_handle_at(int mountdirfd, + struct file_handle __user *handle, + int flags); #endif diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c index 4e013439ac28..25cc41cd8f33 100644 --- a/kernel/sys_ni.c +++ b/kernel/sys_ni.c @@ -189,3 +189,5 @@ cond_syscall(sys_fanotify_mark); /* open by handle */ cond_syscall(sys_name_to_handle_at); +cond_syscall(sys_open_by_handle_at); +cond_syscall(compat_sys_open_by_handle_at); From aae8a97d3ec30788790d1720b71d76fd8eb44b73 Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Sat, 29 Jan 2011 18:43:27 +0530 Subject: [PATCH 680/706] fs: Don't allow to create hardlink for deleted file Add an inode->i_nlink == 0 check in the VFS. Some of the file systems do this internally. A followup patch will remove those instances. This is needed to ensure that with link by handle we don't allow creating a hardlink to an unlinked file.
The check also prevent a race between unlink and link Signed-off-by: Aneesh Kumar K.V Signed-off-by: Al Viro --- fs/namei.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/fs/namei.c b/fs/namei.c index 83e92bab79a6..33be51a2ddb7 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -2906,7 +2906,11 @@ int vfs_link(struct dentry *old_dentry, struct inode *dir, struct dentry *new_de return error; mutex_lock(&inode->i_mutex); - error = dir->i_op->link(old_dentry, dir, new_dentry); + /* Make sure we don't allow creating hardlink to an unlinked file */ + if (inode->i_nlink == 0) + error = -ENOENT; + else + error = dir->i_op->link(old_dentry, dir, new_dentry); mutex_unlock(&inode->i_mutex); if (!error) fsnotify_link(dir, inode, new_dentry); From f17b6042073e7000a90063f7edbca59a5bd1caa2 Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Sat, 29 Jan 2011 18:43:30 +0530 Subject: [PATCH 681/706] fs: Remove i_nlink check from file system link callback Now that VFS check for inode->i_nlink == 0 and returns proper error, remove similar check from file system Signed-off-by: Aneesh Kumar K.V Signed-off-by: Al Viro --- fs/btrfs/inode.c | 3 --- fs/ext3/namei.c | 7 ------- fs/ext4/namei.c | 7 ------- fs/jfs/namei.c | 3 --- fs/reiserfs/namei.c | 4 ---- fs/ubifs/dir.c | 18 ------------------ 6 files changed, 42 deletions(-) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 0efdb65953c5..c23f050f47c2 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -4806,9 +4806,6 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir, int err; int drop_inode = 0; - if (inode->i_nlink == 0) - return -ENOENT; - /* do not allow sys_link's with other subvols of the same device */ if (root->objectid != BTRFS_I(inode)->root->objectid) return -EPERM; diff --git a/fs/ext3/namei.c b/fs/ext3/namei.c index b27ba71810ec..561f69256266 100644 --- a/fs/ext3/namei.c +++ b/fs/ext3/namei.c @@ -2253,13 +2253,6 @@ static int ext3_link (struct dentry * old_dentry, dquot_initialize(dir); - /* - * Return -ENOENT if we've raced with unlink and i_nlink is 0. Doing - * otherwise has the potential to corrupt the orphan inode list. - */ - if (inode->i_nlink == 0) - return -ENOENT; - retry: handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS(dir->i_sb) + EXT3_INDEX_EXTRA_TRANS_BLOCKS); diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index 5485390d32c5..e781b7ea5630 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -2304,13 +2304,6 @@ static int ext4_link(struct dentry *old_dentry, dquot_initialize(dir); - /* - * Return -ENOENT if we've raced with unlink and i_nlink is 0. Doing - * otherwise has the potential to corrupt the orphan inode list. 
- */ - if (inode->i_nlink == 0) - return -ENOENT; - retry: handle = ext4_journal_start(dir, EXT4_DATA_TRANS_BLOCKS(dir->i_sb) + EXT4_INDEX_EXTRA_TRANS_BLOCKS); diff --git a/fs/jfs/namei.c b/fs/jfs/namei.c index 5a2b269428a6..3f04a1804931 100644 --- a/fs/jfs/namei.c +++ b/fs/jfs/namei.c @@ -809,9 +809,6 @@ static int jfs_link(struct dentry *old_dentry, if (ip->i_nlink == JFS_LINK_MAX) return -EMLINK; - if (ip->i_nlink == 0) - return -ENOENT; - dquot_initialize(dir); tid = txBegin(ip->i_sb, 0); diff --git a/fs/reiserfs/namei.c b/fs/reiserfs/namei.c index 68fdf45cc6c9..4b2eb564fdad 100644 --- a/fs/reiserfs/namei.c +++ b/fs/reiserfs/namei.c @@ -1122,10 +1122,6 @@ static int reiserfs_link(struct dentry *old_dentry, struct inode *dir, reiserfs_write_unlock(dir->i_sb); return -EMLINK; } - if (inode->i_nlink == 0) { - reiserfs_write_unlock(dir->i_sb); - return -ENOENT; - } /* inc before scheduling so reiserfs_unlink knows we are here */ inc_nlink(inode); diff --git a/fs/ubifs/dir.c b/fs/ubifs/dir.c index 14f64b689d7f..7217d67a80a6 100644 --- a/fs/ubifs/dir.c +++ b/fs/ubifs/dir.c @@ -522,24 +522,6 @@ static int ubifs_link(struct dentry *old_dentry, struct inode *dir, ubifs_assert(mutex_is_locked(&dir->i_mutex)); ubifs_assert(mutex_is_locked(&inode->i_mutex)); - /* - * Return -ENOENT if we've raced with unlink and i_nlink is 0. Doing - * otherwise has the potential to corrupt the orphan inode list. - * - * Indeed, consider a scenario when 'vfs_link(dirA/fileA)' and - * 'vfs_unlink(dirA/fileA, dirB/fileB)' race. 'vfs_link()' does not - * lock 'dirA->i_mutex', so this is possible. Both of the functions - * lock 'fileA->i_mutex' though. Suppose 'vfs_unlink()' wins, and takes - * 'fileA->i_mutex' mutex first. Suppose 'fileA->i_nlink' is 1. In this - * case 'ubifs_unlink()' will drop the last reference, and put 'inodeA' - * to the list of orphans. After this, 'vfs_link()' will link - * 'dirB/fileB' to 'inodeA'. This is a problem because, for example, - * the subsequent 'vfs_unlink(dirB/fileB)' will add the same inode - * to the list of orphans. 
- */ - if (inode->i_nlink == 0) - return -ENOENT; - err = dbg_check_synced_i_size(inode); if (err) return err; From 7dadb755b082c259f7dd4a95a3a6eb21646a28d5 Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Sat, 29 Jan 2011 18:43:35 +0530 Subject: [PATCH 682/706] x86: Add new syscalls for x86_32 This patch adds new syscalls to x86_32 Signed-off-by: Aneesh Kumar K.V Signed-off-by: Al Viro --- arch/x86/include/asm/unistd_32.h | 4 +++- arch/x86/kernel/syscall_table_32.S | 2 ++ 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/arch/x86/include/asm/unistd_32.h b/arch/x86/include/asm/unistd_32.h index b766a5e8ba0e..f4c4973fc2ac 100644 --- a/arch/x86/include/asm/unistd_32.h +++ b/arch/x86/include/asm/unistd_32.h @@ -346,10 +346,12 @@ #define __NR_fanotify_init 338 #define __NR_fanotify_mark 339 #define __NR_prlimit64 340 +#define __NR_name_to_handle_at 341 +#define __NR_open_by_handle_at 342 #ifdef __KERNEL__ -#define NR_syscalls 341 +#define NR_syscalls 343 #define __ARCH_WANT_IPC_PARSE_VERSION #define __ARCH_WANT_OLD_READDIR diff --git a/arch/x86/kernel/syscall_table_32.S b/arch/x86/kernel/syscall_table_32.S index b35786dc9b8f..c314b2199efd 100644 --- a/arch/x86/kernel/syscall_table_32.S +++ b/arch/x86/kernel/syscall_table_32.S @@ -340,3 +340,5 @@ ENTRY(sys_call_table) .long sys_fanotify_init .long sys_fanotify_mark .long sys_prlimit64 /* 340 */ + .long sys_name_to_handle_at + .long sys_open_by_handle_at From 6aae5f2b2085c5c90964bb78676ea8a6a336e037 Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Sat, 29 Jan 2011 18:43:37 +0530 Subject: [PATCH 683/706] x86: Add new syscalls for x86_64 This patch add new syscalls to x86_64 Signed-off-by: Aneesh Kumar K.V Signed-off-by: Al Viro --- arch/x86/ia32/ia32entry.S | 2 ++ arch/x86/include/asm/unistd_64.h | 4 ++++ 2 files changed, 6 insertions(+) diff --git a/arch/x86/ia32/ia32entry.S b/arch/x86/ia32/ia32entry.S index 518bb99c3394..98d353edfff3 100644 --- a/arch/x86/ia32/ia32entry.S +++ b/arch/x86/ia32/ia32entry.S @@ -851,4 +851,6 @@ ia32_sys_call_table: .quad sys_fanotify_init .quad sys32_fanotify_mark .quad sys_prlimit64 /* 340 */ + .quad sys_name_to_handle_at + .quad compat_sys_open_by_handle_at ia32_syscall_end: diff --git a/arch/x86/include/asm/unistd_64.h b/arch/x86/include/asm/unistd_64.h index 363e9b8a715b..81a3d5b70235 100644 --- a/arch/x86/include/asm/unistd_64.h +++ b/arch/x86/include/asm/unistd_64.h @@ -669,6 +669,10 @@ __SYSCALL(__NR_fanotify_init, sys_fanotify_init) __SYSCALL(__NR_fanotify_mark, sys_fanotify_mark) #define __NR_prlimit64 302 __SYSCALL(__NR_prlimit64, sys_prlimit64) +#define __NR_name_to_handle_at 303 +__SYSCALL(__NR_name_to_handle_at, sys_name_to_handle_at) +#define __NR_open_by_handle_at 304 +__SYSCALL(__NR_open_by_handle_at, sys_open_by_handle_at) #ifndef __NO_STUBS #define __ARCH_WANT_OLD_READDIR From a51571ccb8be1b88aea502ebba8350519682c16d Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Sat, 29 Jan 2011 18:43:38 +0530 Subject: [PATCH 684/706] unistd.h: Add new syscalls numbers to asm-generic Signed-off-by: Aneesh Kumar K.V Signed-off-by: Al Viro --- include/asm-generic/unistd.h | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/include/asm-generic/unistd.h b/include/asm-generic/unistd.h index b969770196c2..57af0338d270 100644 --- a/include/asm-generic/unistd.h +++ b/include/asm-generic/unistd.h @@ -646,9 +646,13 @@ __SYSCALL(__NR_prlimit64, sys_prlimit64) __SYSCALL(__NR_fanotify_init, sys_fanotify_init) #define __NR_fanotify_mark 263 __SYSCALL(__NR_fanotify_mark, 
sys_fanotify_mark) +#define __NR_name_to_handle_at 264 +__SYSCALL(__NR_name_to_handle_at, sys_name_to_handle_at) +#define __NR_open_by_handle_at 265 +__SYSCALL(__NR_open_by_handle_at, sys_open_by_handle_at) #undef __NR_syscalls -#define __NR_syscalls 264 +#define __NR_syscalls 266 /* * All syscalls below here should go away really, From 93f1c20bc8cdb757be50566eff88d65c3b26881f Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Sat, 29 Jan 2011 18:43:38 +0530 Subject: [PATCH 685/706] vfs: Export file system uuid via /proc//mountinfo We add a per superblock uuid field. File systems should update the uuid in the fill_super callback Signed-off-by: Aneesh Kumar K.V Signed-off-by: Al Viro --- fs/namespace.c | 16 ++++++++++++++++ include/linux/fs.h | 1 + 2 files changed, 17 insertions(+) diff --git a/fs/namespace.c b/fs/namespace.c index d1edf26025dc..dffe6f49ab93 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -1002,6 +1002,18 @@ const struct seq_operations mounts_op = { .show = show_vfsmnt }; +static int uuid_is_nil(u8 *uuid) +{ + int i; + u8 *cp = (u8 *)uuid; + + for (i = 0; i < 16; i++) { + if (*cp++) + return 0; + } + return 1; +} + static int show_mountinfo(struct seq_file *m, void *v) { struct proc_mounts *p = m->private; @@ -1040,6 +1052,10 @@ static int show_mountinfo(struct seq_file *m, void *v) if (IS_MNT_UNBINDABLE(mnt)) seq_puts(m, " unbindable"); + if (!uuid_is_nil(mnt->mnt_sb->s_uuid)) + /* print the uuid */ + seq_printf(m, " uuid:%pU", mnt->mnt_sb->s_uuid); + /* Filesystem specific data */ seq_puts(m, " - "); show_type(m, sb); diff --git a/include/linux/fs.h b/include/linux/fs.h index 3f64630c0e10..f2143e0942c2 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1408,6 +1408,7 @@ struct super_block { wait_queue_head_t s_wait_unfrozen; char s_id[32]; /* Informational name */ + u8 s_uuid[16]; /* UUID */ void *s_fs_info; /* Filesystem private info */ fmode_t s_mode; From 03cb5f03dcb26846fcad345d8c15aae91579a53d Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Sat, 29 Jan 2011 18:43:39 +0530 Subject: [PATCH 686/706] ext3: Copy fs UUID to superblock. 
File system UUID is made available to application via /proc//mountinfo Signed-off-by: Aneesh Kumar K.V Signed-off-by: Al Viro --- fs/ext3/super.c | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/ext3/super.c b/fs/ext3/super.c index 85c8cc8f2473..9cc19a1dea8e 100644 --- a/fs/ext3/super.c +++ b/fs/ext3/super.c @@ -1936,6 +1936,7 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent) sb->s_qcop = &ext3_qctl_operations; sb->dq_op = &ext3_quota_operations; #endif + memcpy(sb->s_uuid, es->s_uuid, sizeof(es->s_uuid)); INIT_LIST_HEAD(&sbi->s_orphan); /* unlinked but open files */ mutex_init(&sbi->s_orphan_lock); mutex_init(&sbi->s_resize_lock); From f2fa2ffc2046fdc35f96366d1ec8675f4d578522 Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Sat, 29 Jan 2011 18:43:40 +0530 Subject: [PATCH 687/706] ext4: Copy fs UUID to superblock File system UUID is made available to application via /proc//mountinfo Signed-off-by: Aneesh Kumar K.V Signed-off-by: Al Viro --- fs/ext4/super.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/fs/ext4/super.c b/fs/ext4/super.c index f6a318f836b2..5977b356a435 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -3415,6 +3415,8 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) sb->s_qcop = &ext4_qctl_operations; sb->dq_op = &ext4_quota_operations; #endif + memcpy(sb->s_uuid, es->s_uuid, sizeof(es->s_uuid)); + INIT_LIST_HEAD(&sbi->s_orphan); /* unlinked but open files */ mutex_init(&sbi->s_orphan_lock); mutex_init(&sbi->s_resize_lock); From 1abf0c718f15a56a0a435588d1b104c7a37dc9bd Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sun, 13 Mar 2011 03:51:11 -0400 Subject: [PATCH 688/706] New kind of open files - "location only". New flag for open(2) - O_PATH. Semantics: * pathname is resolved, but the file itself is _NOT_ opened as far as filesystem is concerned. * almost all operations on the resulting descriptors shall fail with -EBADF. Exceptions are: 1) operations on descriptors themselves (i.e. close(), dup(), dup2(), dup3(), fcntl(fd, F_DUPFD), fcntl(fd, F_DUPFD_CLOEXEC, ...), fcntl(fd, F_GETFD), fcntl(fd, F_SETFD, ...)) 2) fcntl(fd, F_GETFL), for a common non-destructive way to check if descriptor is open 3) "dfd" arguments of ...at(2) syscalls, i.e. the starting points of pathname resolution * closing such descriptor does *NOT* affect dnotify or posix locks. * permissions are checked as usual along the way to file; no permission checks are applied to the file itself. Of course, giving such thing to syscall will result in permission checks (at the moment it means checking that starting point of ....at() is a directory and caller has exec permissions on it). fget() and fget_light() return NULL on such descriptors; use of fget_raw() and fget_raw_light() is needed to get them. That protects existing code from dealing with those things. There are two things still missing (they come in the next commits): one is handling of symlinks (right now we refuse to open them that way; see the next commit for semantics related to those) and another is descriptor passing via SCM_RIGHTS datagrams. 
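Illustratively, from userspace (a sketch assuming a libc that exposes the new flag; error handling omitted):

#include <fcntl.h>
#include <unistd.h>

static void o_path_example(void)
{
	char buf[64];
	int dfd = open("/etc", O_PATH);	/* location only; not "opened" as far as the fs is concerned */
	int fd = openat(dfd, "hostname", O_RDONLY);	/* dfd works as an ...at() starting point */

	read(fd, buf, sizeof(buf));	/* works: fd is an ordinary descriptor */
	read(dfd, buf, sizeof(buf));	/* fails with EBADF: dfd is an O_PATH descriptor */
	close(fd);
	close(dfd);
}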
Signed-off-by: Al Viro --- fs/fcntl.c | 37 ++++++++++++++++++++++---- fs/file_table.c | 53 +++++++++++++++++++++++++++++++++---- fs/namei.c | 2 +- fs/open.c | 35 +++++++++++++++++++----- include/asm-generic/fcntl.h | 4 +++ include/linux/file.h | 2 ++ include/linux/fs.h | 3 +++ 7 files changed, 119 insertions(+), 17 deletions(-) diff --git a/fs/fcntl.c b/fs/fcntl.c index cb1026181bdc..6c82e5bac039 100644 --- a/fs/fcntl.c +++ b/fs/fcntl.c @@ -131,7 +131,7 @@ SYSCALL_DEFINE2(dup2, unsigned int, oldfd, unsigned int, newfd) SYSCALL_DEFINE1(dup, unsigned int, fildes) { int ret = -EBADF; - struct file *file = fget(fildes); + struct file *file = fget_raw(fildes); if (file) { ret = get_unused_fd(); @@ -426,15 +426,35 @@ static long do_fcntl(int fd, unsigned int cmd, unsigned long arg, return err; } +static int check_fcntl_cmd(unsigned cmd) +{ + switch (cmd) { + case F_DUPFD: + case F_DUPFD_CLOEXEC: + case F_GETFD: + case F_SETFD: + case F_GETFL: + return 1; + } + return 0; +} + SYSCALL_DEFINE3(fcntl, unsigned int, fd, unsigned int, cmd, unsigned long, arg) { struct file *filp; long err = -EBADF; - filp = fget(fd); + filp = fget_raw(fd); if (!filp) goto out; + if (unlikely(filp->f_mode & FMODE_PATH)) { + if (!check_fcntl_cmd(cmd)) { + fput(filp); + goto out; + } + } + err = security_file_fcntl(filp, cmd, arg); if (err) { fput(filp); @@ -456,10 +476,17 @@ SYSCALL_DEFINE3(fcntl64, unsigned int, fd, unsigned int, cmd, long err; err = -EBADF; - filp = fget(fd); + filp = fget_raw(fd); if (!filp) goto out; + if (unlikely(filp->f_mode & FMODE_PATH)) { + if (!check_fcntl_cmd(cmd)) { + fput(filp); + goto out; + } + } + err = security_file_fcntl(filp, cmd, arg); if (err) { fput(filp); @@ -808,14 +835,14 @@ static int __init fcntl_init(void) * Exceptions: O_NONBLOCK is a two bit define on parisc; O_NDELAY * is defined as O_NONBLOCK on some platforms and not on others. */ - BUILD_BUG_ON(18 - 1 /* for O_RDONLY being 0 */ != HWEIGHT32( + BUILD_BUG_ON(19 - 1 /* for O_RDONLY being 0 */ != HWEIGHT32( O_RDONLY | O_WRONLY | O_RDWR | O_CREAT | O_EXCL | O_NOCTTY | O_TRUNC | O_APPEND | /* O_NONBLOCK | */ __O_SYNC | O_DSYNC | FASYNC | O_DIRECT | O_LARGEFILE | O_DIRECTORY | O_NOFOLLOW | O_NOATIME | O_CLOEXEC | - __FMODE_EXEC + __FMODE_EXEC | O_PATH )); fasync_cache = kmem_cache_create("fasync_cache", diff --git a/fs/file_table.c b/fs/file_table.c index eb36b6b17e26..3c16e1ca163e 100644 --- a/fs/file_table.c +++ b/fs/file_table.c @@ -276,11 +276,10 @@ struct file *fget(unsigned int fd) rcu_read_lock(); file = fcheck_files(files, fd); if (file) { - if (!atomic_long_inc_not_zero(&file->f_count)) { - /* File object ref couldn't be taken */ - rcu_read_unlock(); - return NULL; - } + /* File object ref couldn't be taken */ + if (file->f_mode & FMODE_PATH || + !atomic_long_inc_not_zero(&file->f_count)) + file = NULL; } rcu_read_unlock(); @@ -289,6 +288,23 @@ struct file *fget(unsigned int fd) EXPORT_SYMBOL(fget); +struct file *fget_raw(unsigned int fd) +{ + struct file *file; + struct files_struct *files = current->files; + + rcu_read_lock(); + file = fcheck_files(files, fd); + if (file) { + /* File object ref couldn't be taken */ + if (!atomic_long_inc_not_zero(&file->f_count)) + file = NULL; + } + rcu_read_unlock(); + + return file; +} + /* * Lightweight file lookup - no refcnt increment if fd table isn't shared. 
* @@ -310,6 +326,33 @@ struct file *fget_light(unsigned int fd, int *fput_needed) struct file *file; struct files_struct *files = current->files; + *fput_needed = 0; + if (atomic_read(&files->count) == 1) { + file = fcheck_files(files, fd); + if (file && (file->f_mode & FMODE_PATH)) + file = NULL; + } else { + rcu_read_lock(); + file = fcheck_files(files, fd); + if (file) { + if (!(file->f_mode & FMODE_PATH) && + atomic_long_inc_not_zero(&file->f_count)) + *fput_needed = 1; + else + /* Didn't get the reference, someone's freed */ + file = NULL; + } + rcu_read_unlock(); + } + + return file; +} + +struct file *fget_raw_light(unsigned int fd, int *fput_needed) +{ + struct file *file; + struct files_struct *files = current->files; + *fput_needed = 0; if (atomic_read(&files->count) == 1) { file = fcheck_files(files, fd); diff --git a/fs/namei.c b/fs/namei.c index 33be51a2ddb7..e1d9f90d9776 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -1544,7 +1544,7 @@ static int path_init(int dfd, const char *name, unsigned int flags, } else { struct dentry *dentry; - file = fget_light(dfd, &fput_needed); + file = fget_raw_light(dfd, &fput_needed); retval = -EBADF; if (!file) goto out_fail; diff --git a/fs/open.c b/fs/open.c index 48afc5c139d2..14a51de01f54 100644 --- a/fs/open.c +++ b/fs/open.c @@ -669,11 +669,16 @@ static struct file *__dentry_open(struct dentry *dentry, struct vfsmount *mnt, int (*open)(struct inode *, struct file *), const struct cred *cred) { + static const struct file_operations empty_fops = {}; struct inode *inode; int error; f->f_mode = OPEN_FMODE(f->f_flags) | FMODE_LSEEK | FMODE_PREAD | FMODE_PWRITE; + + if (unlikely(f->f_flags & O_PATH)) + f->f_mode = FMODE_PATH; + inode = dentry->d_inode; if (f->f_mode & FMODE_WRITE) { error = __get_file_write_access(inode, mnt); @@ -687,9 +692,15 @@ static struct file *__dentry_open(struct dentry *dentry, struct vfsmount *mnt, f->f_path.dentry = dentry; f->f_path.mnt = mnt; f->f_pos = 0; - f->f_op = fops_get(inode->i_fop); file_sb_list_add(f, inode->i_sb); + if (unlikely(f->f_mode & FMODE_PATH)) { + f->f_op = &empty_fops; + return f; + } + + f->f_op = fops_get(inode->i_fop); + error = security_dentry_open(f, cred); if (error) goto cleanup_all; @@ -911,9 +922,18 @@ static inline int build_open_flags(int flags, int mode, struct open_flags *op) if (flags & __O_SYNC) flags |= O_DSYNC; - op->open_flag = flags; + /* + * If we have O_PATH in the open flag. Then we + * cannot have anything other than the below set of flags + */ + if (flags & O_PATH) { + flags &= O_DIRECTORY | O_NOFOLLOW | O_PATH; + acc_mode = 0; + } else { + acc_mode = MAY_OPEN | ACC_MODE(flags); + } - acc_mode = MAY_OPEN | ACC_MODE(flags); + op->open_flag = flags; /* O_TRUNC implies we need access checks for write permissions */ if (flags & O_TRUNC) @@ -926,7 +946,8 @@ static inline int build_open_flags(int flags, int mode, struct open_flags *op) op->acc_mode = acc_mode; - op->intent = LOOKUP_OPEN; + op->intent = flags & O_PATH ? 
0 : LOOKUP_OPEN; + if (flags & O_CREAT) { op->intent |= LOOKUP_CREATE; if (flags & O_EXCL) @@ -1053,8 +1074,10 @@ int filp_close(struct file *filp, fl_owner_t id) if (filp->f_op && filp->f_op->flush) retval = filp->f_op->flush(filp, id); - dnotify_flush(filp, id); - locks_remove_posix(filp, id); + if (likely(!(filp->f_mode & FMODE_PATH))) { + dnotify_flush(filp, id); + locks_remove_posix(filp, id); + } fput(filp); return retval; } diff --git a/include/asm-generic/fcntl.h b/include/asm-generic/fcntl.h index 0fc16e3f0bfc..84793c7025e2 100644 --- a/include/asm-generic/fcntl.h +++ b/include/asm-generic/fcntl.h @@ -80,6 +80,10 @@ #define O_SYNC (__O_SYNC|O_DSYNC) #endif +#ifndef O_PATH +#define O_PATH 010000000 +#endif + #ifndef O_NDELAY #define O_NDELAY O_NONBLOCK #endif diff --git a/include/linux/file.h b/include/linux/file.h index e85baebf6279..21a79958541c 100644 --- a/include/linux/file.h +++ b/include/linux/file.h @@ -29,6 +29,8 @@ static inline void fput_light(struct file *file, int fput_needed) extern struct file *fget(unsigned int fd); extern struct file *fget_light(unsigned int fd, int *fput_needed); +extern struct file *fget_raw(unsigned int fd); +extern struct file *fget_raw_light(unsigned int fd, int *fput_needed); extern void set_close_on_exec(unsigned int fd, int flag); extern void put_filp(struct file *); extern int alloc_fd(unsigned start, unsigned flags); diff --git a/include/linux/fs.h b/include/linux/fs.h index f2143e0942c2..13df14e2c42e 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -102,6 +102,9 @@ struct inodes_stat_t { /* File is huge (eg. /dev/kmem): treat loff_t as unsigned */ #define FMODE_UNSIGNED_OFFSET ((__force fmode_t)0x2000) +/* File is opened with O_PATH; almost nothing can be done with it */ +#define FMODE_PATH ((__force fmode_t)0x4000) + /* File was opened by fanotify and shouldn't generate fanotify events */ #define FMODE_NONOTIFY ((__force fmode_t)0x1000000) From bcda76524cd1fa32af748536f27f674a13e56700 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sun, 13 Mar 2011 16:42:14 -0400 Subject: [PATCH 689/706] Allow O_PATH for symlinks At that point we can do almost nothing with them. They can be opened with O_PATH, we can manipulate such descriptors with dup(), etc. and we can see them in /proc/*/{fd,fdinfo}/*. We can't (and won't be able to) follow /proc/*/fd/* symlinks for those; there's simply not enough information for pathname resolution to go on from such a point - to resolve a symlink we need to know which directory it lives in. We will be able to do useful things with them after the next commit, though - readlinkat() and fchownat() will be possible to use with dfd being an O_PATH-opened symlink and empty relative pathname. Combined with open_by_handle() it'll give us a way to do readlink-by-handle and lchown-by-handle without messing with more redundant syscalls.
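Once the next commit lands, readlink-by-handle can be sketched like this (hypothetical userspace code; assumes struct file_handle and the *_by_handle_at() wrappers are visible, and that 'handle' came from an earlier name_to_handle_at() call):

#include <fcntl.h>
#include <unistd.h>

static ssize_t readlink_by_handle(int mntfd, struct file_handle *handle,
				  char *buf, size_t size)
{
	ssize_t n;
	int fd = open_by_handle_at(mntfd, handle, O_PATH | O_NOFOLLOW);

	if (fd < 0)
		return fd;
	n = readlinkat(fd, "", buf, size);	/* empty path: the symlink itself */
	close(fd);
	return n;
}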
Signed-off-by: Al Viro --- fs/namei.c | 25 +++++++++++++++++++------ 1 file changed, 19 insertions(+), 6 deletions(-) diff --git a/fs/namei.c b/fs/namei.c index e1d9f90d9776..9d4f32700179 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -766,8 +766,14 @@ __do_follow_link(const struct path *link, struct nameidata *nd, void **p) error = 0; if (s) error = __vfs_follow_link(nd, s); - else if (nd->last_type == LAST_BIND) + else if (nd->last_type == LAST_BIND) { nd->flags |= LOOKUP_JUMPED; + if (nd->path.dentry->d_inode->i_op->follow_link) { + /* stepped on a _really_ weird one */ + path_put(&nd->path); + error = -ELOOP; + } + } } return error; } @@ -1954,6 +1960,10 @@ static int may_open(struct path *path, int acc_mode, int flag) struct inode *inode = dentry->d_inode; int error; + /* O_PATH? */ + if (!acc_mode) + return 0; + if (!inode) return -ENOENT; @@ -2056,7 +2066,7 @@ static struct file *do_last(struct nameidata *nd, struct path *path, int open_flag = op->open_flag; int will_truncate = open_flag & O_TRUNC; int want_write = 0; - int skip_perm = 0; + int acc_mode = op->acc_mode; struct file *filp; struct inode *inode; int error; @@ -2095,8 +2105,11 @@ static struct file *do_last(struct nameidata *nd, struct path *path, } if (!(open_flag & O_CREAT)) { + int symlink_ok = 0; if (nd->last.name[nd->last.len]) nd->flags |= LOOKUP_FOLLOW | LOOKUP_DIRECTORY; + if (open_flag & O_PATH && !(nd->flags & LOOKUP_FOLLOW)) + symlink_ok = 1; /* we _can_ be in RCU mode here */ error = do_lookup(nd, &nd->last, path, &inode); if (error) { @@ -2108,7 +2121,7 @@ static struct file *do_last(struct nameidata *nd, struct path *path, terminate_walk(nd); return ERR_PTR(-ENOENT); } - if (unlikely(inode->i_op->follow_link)) { + if (unlikely(inode->i_op->follow_link && !symlink_ok)) { /* We drop rcu-walk here */ if (nameidata_dentry_drop_rcu_maybe(nd, path->dentry)) return ERR_PTR(-ECHILD); @@ -2175,7 +2188,7 @@ static struct file *do_last(struct nameidata *nd, struct path *path, /* Don't check for write permission, don't truncate */ open_flag &= ~O_TRUNC; will_truncate = 0; - skip_perm = 1; + acc_mode = MAY_OPEN; error = security_path_mknod(&nd->path, dentry, mode, 0); if (error) goto exit_mutex_unlock; @@ -2225,7 +2238,7 @@ ok: want_write = 1; } common: - error = may_open(&nd->path, skip_perm ? 0 : op->acc_mode, open_flag); + error = may_open(&nd->path, acc_mode, open_flag); if (error) goto exit; filp = nameidata_to_filp(nd); @@ -2358,7 +2371,7 @@ struct file *do_file_open_root(struct dentry *dentry, struct vfsmount *mnt, flags |= LOOKUP_ROOT; - if (dentry->d_inode->i_op->follow_link) + if (dentry->d_inode->i_op->follow_link && op->intent & LOOKUP_OPEN) return ERR_PTR(-ELOOP); file = path_openat(-1, name, &nd, op, flags | LOOKUP_RCU); From 65cfc6722361570bfe255698d9cd4dccaf47570d Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sun, 13 Mar 2011 15:56:26 -0400 Subject: [PATCH 690/706] readlinkat(), fchownat() and fstatat() with empty relative pathnames For readlinkat() we simply allow empty pathname; it will fail unless we have dfd equal to O_PATH-opened symlink, so we are outside of POSIX scope here. For fchownat() and fstatat() we allow AT_EMPTY_PATH; let the caller explicitly ask for such behaviour. 
Signed-off-by: Al Viro --- fs/open.c | 10 ++++++---- fs/stat.c | 7 +++++-- 2 files changed, 11 insertions(+), 6 deletions(-) diff --git a/fs/open.c b/fs/open.c index 14a51de01f54..3cac0bda46df 100644 --- a/fs/open.c +++ b/fs/open.c @@ -573,13 +573,15 @@ SYSCALL_DEFINE5(fchownat, int, dfd, const char __user *, filename, uid_t, user, { struct path path; int error = -EINVAL; - int follow; + int lookup_flags; - if ((flag & ~AT_SYMLINK_NOFOLLOW) != 0) + if ((flag & ~(AT_SYMLINK_NOFOLLOW | AT_EMPTY_PATH)) != 0) goto out; - follow = (flag & AT_SYMLINK_NOFOLLOW) ? 0 : LOOKUP_FOLLOW; - error = user_path_at(dfd, filename, follow, &path); + lookup_flags = (flag & AT_SYMLINK_NOFOLLOW) ? 0 : LOOKUP_FOLLOW; + if (flag & AT_EMPTY_PATH) + lookup_flags |= LOOKUP_EMPTY; + error = user_path_at(dfd, filename, lookup_flags, &path); if (error) goto out; error = mnt_want_write(path.mnt); diff --git a/fs/stat.c b/fs/stat.c index d5c61cf2b703..961039121cb8 100644 --- a/fs/stat.c +++ b/fs/stat.c @@ -75,13 +75,16 @@ int vfs_fstatat(int dfd, const char __user *filename, struct kstat *stat, int error = -EINVAL; int lookup_flags = 0; - if ((flag & ~(AT_SYMLINK_NOFOLLOW | AT_NO_AUTOMOUNT)) != 0) + if ((flag & ~(AT_SYMLINK_NOFOLLOW | AT_NO_AUTOMOUNT | + AT_EMPTY_PATH)) != 0) goto out; if (!(flag & AT_SYMLINK_NOFOLLOW)) lookup_flags |= LOOKUP_FOLLOW; if (flag & AT_NO_AUTOMOUNT) lookup_flags |= LOOKUP_NO_AUTOMOUNT; + if (flag & AT_EMPTY_PATH) + lookup_flags |= LOOKUP_EMPTY; error = user_path_at(dfd, filename, lookup_flags, &path); if (error) @@ -297,7 +300,7 @@ SYSCALL_DEFINE4(readlinkat, int, dfd, const char __user *, pathname, if (bufsiz <= 0) return -EINVAL; - error = user_path_at(dfd, pathname, 0, &path); + error = user_path_at(dfd, pathname, LOOKUP_EMPTY, &path); if (!error) { struct inode *inode = path.dentry->d_inode; From 326be7b484843988afe57566b627fb7a70beac56 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sun, 13 Mar 2011 17:08:22 -0400 Subject: [PATCH 691/706] Allow passing O_PATH descriptors via SCM_RIGHTS datagrams Just need to make sure that AF_UNIX garbage collector won't confuse O_PATHed socket on filesystem for real AF_UNIX opened socket. Signed-off-by: Al Viro --- fs/file_table.c | 2 ++ net/core/scm.c | 2 +- net/unix/garbage.c | 2 +- 3 files changed, 4 insertions(+), 2 deletions(-) diff --git a/fs/file_table.c b/fs/file_table.c index 3c16e1ca163e..74a9544ac770 100644 --- a/fs/file_table.c +++ b/fs/file_table.c @@ -305,6 +305,8 @@ struct file *fget_raw(unsigned int fd) return file; } +EXPORT_SYMBOL(fget_raw); + /* * Lightweight file lookup - no refcnt increment if fd table isn't shared. * diff --git a/net/core/scm.c b/net/core/scm.c index bbe454450801..4c1ef026d695 100644 --- a/net/core/scm.c +++ b/net/core/scm.c @@ -95,7 +95,7 @@ static int scm_fp_copy(struct cmsghdr *cmsg, struct scm_fp_list **fplp) int fd = fdp[i]; struct file *file; - if (fd < 0 || !(file = fget(fd))) + if (fd < 0 || !(file = fget_raw(fd))) return -EBADF; *fpp++ = file; fpl->count++; diff --git a/net/unix/garbage.c b/net/unix/garbage.c index f89f83bf828e..b6f4b994eb35 100644 --- a/net/unix/garbage.c +++ b/net/unix/garbage.c @@ -104,7 +104,7 @@ struct sock *unix_get_socket(struct file *filp) /* * Socket ? 
*/ - if (S_ISSOCK(inode->i_mode)) { + if (S_ISSOCK(inode->i_mode) && !(filp->f_mode & FMODE_PATH)) { struct socket *sock = SOCKET_I(inode); struct sock *s = sock->sk; From 25542c646afbf14c43fa7d2b443055cadb73b07a Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Tue, 15 Mar 2011 09:57:37 +0800 Subject: [PATCH 692/706] x86, tlb, UV: Do small micro-optimization for native_flush_tlb_others() native_flush_tlb_others() is called from: flush_tlb_current_task() flush_tlb_mm() flush_tlb_page() All these functions disable preemption explicitly, so we can use smp_processor_id() instead of get_cpu() and put_cpu(). Signed-off-by: Xiao Guangrong Cc: Cliff Wickman LKML-Reference: <4D7EC791.4040003@cn.fujitsu.com> Signed-off-by: Ingo Molnar --- arch/x86/mm/tlb.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c index 55272d7c3b0b..d6c0418c3e47 100644 --- a/arch/x86/mm/tlb.c +++ b/arch/x86/mm/tlb.c @@ -208,11 +208,10 @@ void native_flush_tlb_others(const struct cpumask *cpumask, if (is_uv_system()) { unsigned int cpu; - cpu = get_cpu(); + cpu = smp_processor_id(); cpumask = uv_flush_tlb_others(cpumask, mm, va, cpu); if (cpumask) flush_tlb_others_ipi(cpumask, mm, va); - put_cpu(); return; } flush_tlb_others_ipi(cpumask, mm, va); From 5e814dd597c42daeb8d2a276e64a6ec986ad0e2a Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Tue, 15 Mar 2011 20:51:09 +0100 Subject: [PATCH 693/706] perf probe: Clean up probe_point_lazy_walker() return value MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Newer compilers (gcc 4.6) complain about: return ret < 0 ?: 0; For the following reason: util/probe-finder.c: In function ‘probe_point_lazy_walker’: util/probe-finder.c:1331:18: error: the omitted middle operand in ?: will always be ‘true’, suggest explicit middle operand [-Werror=parentheses] And indeed the return value is a somewhat obscure (but correct) value of 'true', so return 'ret' instead - this is cleaner and unconfuses GCC as well. Cc: Arnaldo Carvalho de Melo Cc: Masami Hiramatsu Cc: Frederic Weisbecker Cc: Masami Hiramatsu Cc: Peter Zijlstra LKML-Reference: Signed-off-by: Ingo Molnar --- tools/perf/util/probe-finder.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/perf/util/probe-finder.c b/tools/perf/util/probe-finder.c index 17f9c4a66ddd..194f9e2a3285 100644 --- a/tools/perf/util/probe-finder.c +++ b/tools/perf/util/probe-finder.c @@ -1328,7 +1328,7 @@ static int probe_point_lazy_walker(const char *fname, int lineno, * Continue if no error, because the lazy pattern will match * to other lines */ - return ret < 0 ?: 0; + return ret < 0 ? ret : 0; } /* Find probe points from lazy pattern */ From 11a7b371b64ef39fc5fb1b6f2218eef7c4d035e3 Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Sat, 29 Jan 2011 18:43:42 +0530 Subject: [PATCH 694/706] fs: allow AT_EMPTY_PATH in linkat(), limit that to CAP_DAC_READ_SEARCH We don't want to allow creation of private hardlinks by a different application using the fd passed to it via SCM_RIGHTS.
So limit the null relative name usage in linkat syscall to CAP_DAC_READ_SEARCH Signed-off-by: Aneesh Kumar K.V --- fs/namei.c | 20 ++++++++++++++++---- 1 file changed, 16 insertions(+), 4 deletions(-) diff --git a/fs/namei.c b/fs/namei.c index 9d4f32700179..c9b7f5b7e92a 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -2945,15 +2945,27 @@ SYSCALL_DEFINE5(linkat, int, olddfd, const char __user *, oldname, struct dentry *new_dentry; struct nameidata nd; struct path old_path; + int how = 0; int error; char *to; - if ((flags & ~AT_SYMLINK_FOLLOW) != 0) + if ((flags & ~(AT_SYMLINK_FOLLOW | AT_EMPTY_PATH)) != 0) return -EINVAL; + /* + * To use null names we require CAP_DAC_READ_SEARCH + * This ensures that not everyone will be able to create + * handlink using the passed filedescriptor. + */ + if (flags & AT_EMPTY_PATH) { + if (!capable(CAP_DAC_READ_SEARCH)) + return -ENOENT; + how = LOOKUP_EMPTY; + } - error = user_path_at(olddfd, oldname, - flags & AT_SYMLINK_FOLLOW ? LOOKUP_FOLLOW : 0, - &old_path); + if (flags & AT_SYMLINK_FOLLOW) + how |= LOOKUP_FOLLOW; + + error = user_path_at(olddfd, oldname, how, &old_path); if (error) return error; From ce57dfc1791221ef58b6d6b8f5437fccefc4e187 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sun, 13 Mar 2011 19:58:58 -0400 Subject: [PATCH 695/706] pull handling of one pathname component into a helper new helper: walk_component(). Handles everything except symlinks; returns negative on error, 0 on success and 1 on symlinks we decided to follow. Drops out of RCU mode on such symlinks. link_path_walk() and do_last() switched to using that. Signed-off-by: Al Viro --- fs/namei.c | 123 ++++++++++++++++++++++++----------------------------- 1 file changed, 55 insertions(+), 68 deletions(-) diff --git a/fs/namei.c b/fs/namei.c index c9b7f5b7e92a..549bbe2f25c6 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -785,16 +785,11 @@ __do_follow_link(const struct path *link, struct nameidata *nd, void **p) * Without that kind of total limit, nasty chains of consecutive * symlinks can cause almost arbitrarily long lookups. */ -static inline int do_follow_link(struct inode *inode, struct path *path, struct nameidata *nd) +static inline int do_follow_link(struct path *path, struct nameidata *nd) { void *cookie; int err = -ELOOP; - /* We drop rcu-walk here */ - if (nameidata_dentry_drop_rcu_maybe(nd, path->dentry)) - return -ECHILD; - BUG_ON(inode != path->dentry->d_inode); - if (current->link_count >= MAX_NESTED_LINKS) goto loop; if (current->total_link_count >= 40) @@ -1337,6 +1332,39 @@ static void terminate_walk(struct nameidata *nd) } } +static inline int walk_component(struct nameidata *nd, struct path *path, + struct qstr *name, int type, int follow) +{ + struct inode *inode; + int err; + /* + * "." and ".." are special - ".." especially so because it has + * to be able to know about the current root directory and + * parent relationships. + */ + if (unlikely(type != LAST_NORM)) + return handle_dots(nd, type); + err = do_lookup(nd, name, path, &inode); + if (unlikely(err)) { + terminate_walk(nd); + return err; + } + if (!inode) { + path_to_nameidata(path, nd); + terminate_walk(nd); + return -ENOENT; + } + if (unlikely(inode->i_op->follow_link) && follow) { + if (nameidata_dentry_drop_rcu_maybe(nd, path->dentry)) + return -ECHILD; + BUG_ON(inode != path->dentry->d_inode); + return 1; + } + path_to_nameidata(path, nd); + nd->inode = inode; + return 0; +} + /* * Name resolution. 
* This is the basic name resolution function, turning a pathname into @@ -1361,7 +1389,6 @@ static int link_path_walk(const char *name, struct nameidata *nd) /* At this point we know we have a real path component. */ for(;;) { - struct inode *inode; unsigned long hash; struct qstr this; unsigned int c; @@ -1414,34 +1441,16 @@ static int link_path_walk(const char *name, struct nameidata *nd) if (!*name) goto last_with_slashes; - /* - * "." and ".." are special - ".." especially so because it has - * to be able to know about the current root directory and - * parent relationships. - */ - if (unlikely(type != LAST_NORM)) { - if (handle_dots(nd, type)) - return -ECHILD; - continue; - } + err = walk_component(nd, &next, &this, type, LOOKUP_FOLLOW); + if (err < 0) + return err; - /* This does the actual lookups.. */ - err = do_lookup(nd, &this, &next, &inode); - if (err) - break; - - if (inode && inode->i_op->follow_link) { - err = do_follow_link(inode, &next, nd); + if (err) { + err = do_follow_link(&next, nd); if (err) return err; nd->inode = nd->path.dentry->d_inode; - } else { - path_to_nameidata(&next, nd); - nd->inode = inode; } - err = -ENOENT; - if (!nd->inode) - break; err = -ENOTDIR; if (!nd->inode->i_op->lookup) break; @@ -1453,36 +1462,27 @@ last_with_slashes: last_component: /* Clear LOOKUP_CONTINUE iff it was previously unset */ nd->flags &= lookup_flags | ~LOOKUP_CONTINUE; - if (lookup_flags & LOOKUP_PARENT) - goto lookup_parent; - if (unlikely(type != LAST_NORM)) - return handle_dots(nd, type); - err = do_lookup(nd, &this, &next, &inode); - if (err) - break; - if (inode && unlikely(inode->i_op->follow_link) && - (lookup_flags & LOOKUP_FOLLOW)) { - err = do_follow_link(inode, &next, nd); + if (lookup_flags & LOOKUP_PARENT) { + nd->last = this; + nd->last_type = type; + return 0; + } + err = walk_component(nd, &next, &this, type, + lookup_flags & LOOKUP_FOLLOW); + if (err < 0) + return err; + if (err) { + err = do_follow_link(&next, nd); if (err) return err; nd->inode = nd->path.dentry->d_inode; - } else { - path_to_nameidata(&next, nd); - nd->inode = inode; } - err = -ENOENT; - if (!nd->inode) - break; if (lookup_flags & LOOKUP_DIRECTORY) { err = -ENOTDIR; if (!nd->inode->i_op->lookup) break; } return 0; -lookup_parent: - nd->last = this; - nd->last_type = type; - return 0; } terminate_walk(nd); return err; @@ -2068,7 +2068,6 @@ static struct file *do_last(struct nameidata *nd, struct path *path, int want_write = 0; int acc_mode = op->acc_mode; struct file *filp; - struct inode *inode; int error; nd->flags &= ~LOOKUP_PARENT; @@ -2111,24 +2110,12 @@ static struct file *do_last(struct nameidata *nd, struct path *path, if (open_flag & O_PATH && !(nd->flags & LOOKUP_FOLLOW)) symlink_ok = 1; /* we _can_ be in RCU mode here */ - error = do_lookup(nd, &nd->last, path, &inode); - if (error) { - terminate_walk(nd); + error = walk_component(nd, path, &nd->last, LAST_NORM, + !symlink_ok); + if (error < 0) return ERR_PTR(error); - } - if (!inode) { - path_to_nameidata(path, nd); - terminate_walk(nd); - return ERR_PTR(-ENOENT); - } - if (unlikely(inode->i_op->follow_link && !symlink_ok)) { - /* We drop rcu-walk here */ - if (nameidata_dentry_drop_rcu_maybe(nd, path->dentry)) - return ERR_PTR(-ECHILD); + if (error) /* symlink */ return NULL; - } - path_to_nameidata(path, nd); - nd->inode = inode; /* sayonara */ if (nd->flags & LOOKUP_RCU) { if (nameidata_drop_rcu_last(nd)) @@ -2137,7 +2124,7 @@ static struct file *do_last(struct nameidata *nd, struct path *path, error = -ENOTDIR; if (nd->flags 
& LOOKUP_DIRECTORY) { - if (!inode->i_op->lookup) + if (!nd->inode->i_op->lookup) goto exit; } audit_inode(pathname, nd->path.dentry); From b21041d0f72899ed815bd2cbf7275339c74737b6 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Mon, 14 Mar 2011 20:01:51 -0400 Subject: [PATCH 696/706] update nd->inode in __do_follow_link() instead of after do_follow_link() ... and note that we only need to do it for LAST_BIND symlinks Signed-off-by: Al Viro --- fs/namei.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/fs/namei.c b/fs/namei.c index 549bbe2f25c6..9e7b18a8be66 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -768,7 +768,8 @@ __do_follow_link(const struct path *link, struct nameidata *nd, void **p) error = __vfs_follow_link(nd, s); else if (nd->last_type == LAST_BIND) { nd->flags |= LOOKUP_JUMPED; - if (nd->path.dentry->d_inode->i_op->follow_link) { + nd->inode = nd->path.dentry->d_inode; + if (nd->inode->i_op->follow_link) { /* stepped on a _really_ weird one */ path_put(&nd->path); error = -ELOOP; @@ -1449,7 +1450,6 @@ static int link_path_walk(const char *name, struct nameidata *nd) err = do_follow_link(&next, nd); if (err) return err; - nd->inode = nd->path.dentry->d_inode; } err = -ENOTDIR; if (!nd->inode->i_op->lookup) @@ -1475,7 +1475,6 @@ last_component: err = do_follow_link(&next, nd); if (err) return err; - nd->inode = nd->path.dentry->d_inode; } if (lookup_flags & LOOKUP_DIRECTORY) { err = -ENOTDIR; From bd92d7fed877ed1e6997e4f3f13dbcd872947653 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Mon, 14 Mar 2011 19:54:59 -0400 Subject: [PATCH 697/706] Make trailing symlink resolution in path_lookupat() iterative Now the only caller of link_path_walk() that does *not* pass LOOKUP_PARENT is do_follow_link() Signed-off-by: Al Viro --- fs/namei.c | 63 +++++++++++++++++++++++++++++++++++++++++++++--------- 1 file changed, 53 insertions(+), 10 deletions(-) diff --git a/fs/namei.c b/fs/namei.c index 9e7b18a8be66..a3431639e166 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -1588,12 +1588,23 @@ out_fail: return retval; } +static inline int lookup_last(struct nameidata *nd, struct path *path) +{ + if (nd->last_type == LAST_NORM && nd->last.name[nd->last.len]) + nd->flags |= LOOKUP_FOLLOW | LOOKUP_DIRECTORY; + + nd->flags &= ~LOOKUP_PARENT; + return walk_component(nd, path, &nd->last, nd->last_type, + nd->flags & LOOKUP_FOLLOW); +} + /* Returns 0 and nd will be valid on success; Retuns error, otherwise. */ static int path_lookupat(int dfd, const char *name, unsigned int flags, struct nameidata *nd) { struct file *base = NULL; - int retval; + struct path path; + int err; /* * Path walking is largely split up into 2 different synchronisation @@ -1609,23 +1620,55 @@ static int path_lookupat(int dfd, const char *name, * be handled by restarting a traditional ref-walk (which will always * be able to complete). 
*/ - retval = path_init(dfd, name, flags, nd, &base); + err = path_init(dfd, name, flags | LOOKUP_PARENT, nd, &base); - if (unlikely(retval)) - return retval; + if (unlikely(err)) + return err; current->total_link_count = 0; - retval = link_path_walk(name, nd); + err = link_path_walk(name, nd); + + if (!err && !(flags & LOOKUP_PARENT)) { + int count = 0; + err = lookup_last(nd, &path); + while (err > 0) { + void *cookie; + struct path link = path; + struct inode *inode = link.dentry->d_inode; + + if (count++ > 32) { + path_put_conditional(&path, nd); + path_put(&nd->path); + err = -ELOOP; + break; + } + cond_resched(); + nd->flags |= LOOKUP_PARENT; + err = __do_follow_link(&link, nd, &cookie); + if (!err) + err = lookup_last(nd, &path); + if (!IS_ERR(cookie) && inode->i_op->put_link) + inode->i_op->put_link(link.dentry, nd, cookie); + path_put(&link); + } + } if (nd->flags & LOOKUP_RCU) { /* went all way through without dropping RCU */ - BUG_ON(retval); + BUG_ON(err); if (nameidata_drop_rcu_last(nd)) - retval = -ECHILD; + err = -ECHILD; } - if (!retval) - retval = handle_reval_path(nd); + if (!err) + err = handle_reval_path(nd); + + if (!err && nd->flags & LOOKUP_DIRECTORY) { + if (!nd->inode->i_op->lookup) { + path_put(&nd->path); + return -ENOTDIR; + } + } if (base) fput(base); @@ -1634,7 +1677,7 @@ static int path_lookupat(int dfd, const char *name, path_put(&nd->root); nd->root.mnt = NULL; } - return retval; + return err; } static int do_path_lookup(int dfd, const char *name, From ce0525449da56444948c368f52e10f3db0465338 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Mon, 14 Mar 2011 21:28:04 -0400 Subject: [PATCH 698/706] simplify link_path_walk() tail Now that link_path_walk() is called without LOOKUP_PARENT only from do_follow_link(), we can simplify the checks in last component handling. First of all, checking if we'd arrived to a directory is not needed - the caller will check it anyway. And LOOKUP_FOLLOW is guaranteed to be there, since we only get to that place with nd->depth > 0. Signed-off-by: Al Viro --- fs/namei.c | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/fs/namei.c b/fs/namei.c index a3431639e166..9575d0039699 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -1467,8 +1467,7 @@ last_component: nd->last_type = type; return 0; } - err = walk_component(nd, &next, &this, type, - lookup_flags & LOOKUP_FOLLOW); + err = walk_component(nd, &next, &this, type, LOOKUP_FOLLOW); if (err < 0) return err; if (err) { @@ -1476,11 +1475,6 @@ last_component: if (err) return err; } - if (lookup_flags & LOOKUP_DIRECTORY) { - err = -ENOTDIR; - if (!nd->inode->i_op->lookup) - break; - } return 0; } terminate_walk(nd); From b356379a020bb7197603118bb1cbc903963aa198 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Mon, 14 Mar 2011 21:54:55 -0400 Subject: [PATCH 699/706] Turn resolution of trailing symlinks iterative everywhere The last remaining place (resolution of nested symlink) converted to the loop of the same kind we have in path_lookupat() and path_openat(). Note that we still *do* have a recursion in pathname resolution; can't avoid it, really. However, it's strictly for nested symlinks now - i.e. ones in the middle of a pathname. link_path_walk() has lost the tail now - it always walks everything except the last component. do_follow_link() renamed to nested_symlink() and moved down. 
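The control flow this series converges on is easy to show in miniature. The following is a hedged userspace sketch, not kernel code: chase_trailing() is an invented name and rebasing of relative link targets is deliberately omitted; only the 40-link cap intentionally mirrors current->total_link_count. The point is the shape: resolve the tail, and while it keeps turning out to be a symlink, loop instead of recursing, so stack depth stays flat however long the chain is.

#include <errno.h>
#include <limits.h>
#include <stdio.h>
#include <unistd.h>

/*
 * Chase trailing symlinks iteratively. A single counter bounds the
 * walk; nothing grows with the length of the chain. Relative targets
 * are not rebased against the link's directory, to keep this short.
 */
static int chase_trailing(const char *name, char *out, size_t outlen)
{
        char cur[PATH_MAX], target[PATH_MAX];
        int total_link_count = 0;
        ssize_t n;

        snprintf(cur, sizeof(cur), "%s", name);
        while ((n = readlink(cur, target, sizeof(target) - 1)) >= 0) {
                if (++total_link_count > 40)
                        return -ELOOP;  /* consecutive-symlink limit */
                target[n] = '\0';
                snprintf(cur, sizeof(cur), "%s", target);
        }
        if (errno != EINVAL)    /* EINVAL means "not a symlink": done */
                return -errno;
        snprintf(out, outlen, "%s", cur);
        return 0;
}

int main(int argc, char **argv)
{
        char buf[PATH_MAX];
        if (argc > 1 && chase_trailing(argv[1], buf, sizeof(buf)) == 0)
                printf("%s\n", buf);
        return 0;
}

A chain of 41 links trips the cap and fails with -ELOOP, just as the kernel path now does.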
Signed-off-by: Al Viro --- fs/namei.c | 104 ++++++++++++++++++++++++++--------------------------- 1 file changed, 50 insertions(+), 54 deletions(-) diff --git a/fs/namei.c b/fs/namei.c index 9575d0039699..017c3fa3a08e 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -779,40 +779,6 @@ __do_follow_link(const struct path *link, struct nameidata *nd, void **p) return error; } -/* - * This limits recursive symlink follows to 8, while - * limiting consecutive symlinks to 40. - * - * Without that kind of total limit, nasty chains of consecutive - * symlinks can cause almost arbitrarily long lookups. - */ -static inline int do_follow_link(struct path *path, struct nameidata *nd) -{ - void *cookie; - int err = -ELOOP; - - if (current->link_count >= MAX_NESTED_LINKS) - goto loop; - if (current->total_link_count >= 40) - goto loop; - BUG_ON(nd->depth >= MAX_NESTED_LINKS); - cond_resched(); - current->link_count++; - current->total_link_count++; - nd->depth++; - err = __do_follow_link(path, nd, &cookie); - if (!IS_ERR(cookie) && path->dentry->d_inode->i_op->put_link) - path->dentry->d_inode->i_op->put_link(path->dentry, nd, cookie); - path_put(path); - current->link_count--; - nd->depth--; - return err; -loop: - path_put_conditional(path, nd); - path_put(&nd->path); - return err; -} - static int follow_up_rcu(struct path *path) { struct vfsmount *parent; @@ -1366,6 +1332,52 @@ static inline int walk_component(struct nameidata *nd, struct path *path, return 0; } +/* + * This limits recursive symlink follows to 8, while + * limiting consecutive symlinks to 40. + * + * Without that kind of total limit, nasty chains of consecutive + * symlinks can cause almost arbitrarily long lookups. + */ +static inline int nested_symlink(struct path *path, struct nameidata *nd) +{ + int res; + + BUG_ON(nd->depth >= MAX_NESTED_LINKS); + if (unlikely(current->link_count >= MAX_NESTED_LINKS)) { + path_put_conditional(path, nd); + path_put(&nd->path); + return -ELOOP; + } + + nd->depth++; + current->link_count++; + + do { + struct path link = *path; + void *cookie; + if (unlikely(current->total_link_count >= 40)) { + path_put_conditional(path, nd); + path_put(&nd->path); + res = -ELOOP; + break; + } + cond_resched(); + current->total_link_count++; + res = __do_follow_link(&link, nd, &cookie); + if (!res) + res = walk_component(nd, path, &nd->last, + nd->last_type, LOOKUP_FOLLOW); + if (!IS_ERR(cookie) && link.dentry->d_inode->i_op->put_link) + link.dentry->d_inode->i_op->put_link(link.dentry, nd, cookie); + path_put(&link); + } while (res > 0); + + current->link_count--; + nd->depth--; + return res; +} + /* * Name resolution. * This is the basic name resolution function, turning a pathname into @@ -1385,9 +1397,6 @@ static int link_path_walk(const char *name, struct nameidata *nd) if (!*name) return 0; - if (nd->depth) - lookup_flags = LOOKUP_FOLLOW | (nd->flags & LOOKUP_CONTINUE); - /* At this point we know we have a real path component. 
*/ for(;;) { unsigned long hash; @@ -1440,14 +1449,14 @@ static int link_path_walk(const char *name, struct nameidata *nd) goto last_component; while (*++name == '/'); if (!*name) - goto last_with_slashes; + goto last_component; err = walk_component(nd, &next, &this, type, LOOKUP_FOLLOW); if (err < 0) return err; if (err) { - err = do_follow_link(&next, nd); + err = nested_symlink(&next, nd); if (err) return err; } @@ -1457,24 +1466,11 @@ static int link_path_walk(const char *name, struct nameidata *nd) continue; /* here ends the main loop */ -last_with_slashes: - lookup_flags |= LOOKUP_FOLLOW | LOOKUP_DIRECTORY; last_component: /* Clear LOOKUP_CONTINUE iff it was previously unset */ nd->flags &= lookup_flags | ~LOOKUP_CONTINUE; - if (lookup_flags & LOOKUP_PARENT) { - nd->last = this; - nd->last_type = type; - return 0; - } - err = walk_component(nd, &next, &this, type, LOOKUP_FOLLOW); - if (err < 0) - return err; - if (err) { - err = do_follow_link(&next, nd); - if (err) - return err; - } + nd->last = this; + nd->last_type = type; return 0; } terminate_walk(nd); From 574197e0de46a8a4db5c54ef7b65e43ffa8873a7 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Mon, 14 Mar 2011 22:20:34 -0400 Subject: [PATCH 700/706] tidy the trailing symlinks traversal up * pull the handling of current->total_link_count into __do_follow_link() * put the common "do ->put_link() if needed and path_put() the link" stuff into a helper (put_link(nd, link, cookie)) * rename __do_follow_link() to follow_link(), while we are at it Signed-off-by: Al Viro --- fs/namei.c | 71 ++++++++++++++++++++---------------------------------- 1 file changed, 26 insertions(+), 45 deletions(-) diff --git a/fs/namei.c b/fs/namei.c index 017c3fa3a08e..0a601cae23de 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -737,14 +737,31 @@ static inline void path_to_nameidata(const struct path *path, nd->path.dentry = path->dentry; } +static inline void put_link(struct nameidata *nd, struct path *link, void *cookie) +{ + struct inode *inode = link->dentry->d_inode; + if (!IS_ERR(cookie) && inode->i_op->put_link) + inode->i_op->put_link(link->dentry, nd, cookie); + path_put(link); +} + static __always_inline int -__do_follow_link(const struct path *link, struct nameidata *nd, void **p) +follow_link(struct path *link, struct nameidata *nd, void **p) { int error; struct dentry *dentry = link->dentry; BUG_ON(nd->flags & LOOKUP_RCU); + if (unlikely(current->total_link_count >= 40)) { + *p = ERR_PTR(-ELOOP); /* no ->put_link(), please */ + path_put_conditional(link, nd); + path_put(&nd->path); + return -ELOOP; + } + cond_resched(); + current->total_link_count++; + touch_atime(link->mnt, dentry); nd_set_link(nd, NULL); @@ -1356,21 +1373,12 @@ static inline int nested_symlink(struct path *path, struct nameidata *nd) do { struct path link = *path; void *cookie; - if (unlikely(current->total_link_count >= 40)) { - path_put_conditional(path, nd); - path_put(&nd->path); - res = -ELOOP; - break; - } - cond_resched(); - current->total_link_count++; - res = __do_follow_link(&link, nd, &cookie); + + res = follow_link(&link, nd, &cookie); if (!res) res = walk_component(nd, path, &nd->last, nd->last_type, LOOKUP_FOLLOW); - if (!IS_ERR(cookie) && link.dentry->d_inode->i_op->put_link) - link.dentry->d_inode->i_op->put_link(link.dentry, nd, cookie); - path_put(&link); + put_link(nd, &link, cookie); } while (res > 0); current->link_count--; @@ -1619,27 +1627,15 @@ static int path_lookupat(int dfd, const char *name, err = link_path_walk(name, nd); if (!err && !(flags & 
LOOKUP_PARENT)) { - int count = 0; err = lookup_last(nd, &path); while (err > 0) { void *cookie; struct path link = path; - struct inode *inode = link.dentry->d_inode; - - if (count++ > 32) { - path_put_conditional(&path, nd); - path_put(&nd->path); - err = -ELOOP; - break; - } - cond_resched(); nd->flags |= LOOKUP_PARENT; - err = __do_follow_link(&link, nd, &cookie); + err = follow_link(&link, nd, &cookie); if (!err) err = lookup_last(nd, &path); - if (!IS_ERR(cookie) && inode->i_op->put_link) - inode->i_op->put_link(link.dentry, nd, cookie); - path_put(&link); + put_link(nd, &link, cookie); } } @@ -2298,7 +2294,6 @@ static struct file *path_openat(int dfd, const char *pathname, struct file *base = NULL; struct file *filp; struct path path; - int count = 0; int error; filp = get_empty_filp(); @@ -2322,35 +2317,21 @@ static struct file *path_openat(int dfd, const char *pathname, filp = do_last(nd, &path, op, pathname); while (unlikely(!filp)) { /* trailing symlink */ struct path link = path; - struct inode *linki = link.dentry->d_inode; void *cookie; - if (!(nd->flags & LOOKUP_FOLLOW) || count++ == 32) { + if (!(nd->flags & LOOKUP_FOLLOW)) { path_put_conditional(&path, nd); path_put(&nd->path); filp = ERR_PTR(-ELOOP); break; } - /* - * This is subtle. Instead of calling do_follow_link() we do - * the thing by hands. The reason is that this way we have zero - * link_count and path_walk() (called from ->follow_link) - * honoring LOOKUP_PARENT. After that we have the parent and - * last component, i.e. we are in the same situation as after - * the first path_walk(). Well, almost - if the last component - * is normal we get its copy stored in nd->last.name and we will - * have to putname() it when we are done. Procfs-like symlinks - * just set LAST_BIND. - */ nd->flags |= LOOKUP_PARENT; nd->flags &= ~(LOOKUP_OPEN|LOOKUP_CREATE|LOOKUP_EXCL); - error = __do_follow_link(&link, nd, &cookie); + error = follow_link(&link, nd, &cookie); if (unlikely(error)) filp = ERR_PTR(error); else filp = do_last(nd, &path, op, pathname); - if (!IS_ERR(cookie) && linki->i_op->put_link) - linki->i_op->put_link(link.dentry, nd, cookie); - path_put(&link); + put_link(nd, &link, cookie); } out: if (nd->root.mnt && !(nd->flags & LOOKUP_ROOT)) From c826cb7dfce80512c26c984350077a25046bd215 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Tue, 15 Mar 2011 15:29:21 -0700 Subject: [PATCH 701/706] dcache.c: create helper function for duplicated functionality This creates a helper function for the "try to ascend into the parent directory" case, which was written out in triplicate before. With all the locking and subtle sequence number stuff, we really don't want to duplicate that kind of code. Signed-off-by: Linus Torvalds --- fs/dcache.c | 88 ++++++++++++++++++++++------------------------------- 1 file changed, 37 insertions(+), 51 deletions(-) diff --git a/fs/dcache.c b/fs/dcache.c index 611ffe928c03..361882a14ccb 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -1011,6 +1011,34 @@ void shrink_dcache_for_umount(struct super_block *sb) } } +/* + * This tries to ascend one level of parenthood, but + * we can race with renaming, so we need to re-check + * the parenthood after dropping the lock and check + * that the sequence number still matches.
+ */ +static struct dentry *try_to_ascend(struct dentry *old, int locked, unsigned seq) +{ + struct dentry *new = old->d_parent; + + rcu_read_lock(); + spin_unlock(&old->d_lock); + spin_lock(&new->d_lock); + + /* + * might go back up the wrong parent if we have had a rename + * or deletion + */ + if (new != old->d_parent || + (!locked && read_seqretry(&rename_lock, seq))) { + spin_unlock(&new->d_lock); + new = NULL; + } + rcu_read_unlock(); + return new; +} + + /* * Search for at least 1 mount point in the dentry's subdirs. * We descend to the next level whenever the d_subdirs @@ -1066,24 +1094,10 @@ resume: * All done at this level ... ascend and resume the search. */ if (this_parent != parent) { - struct dentry *tmp; - struct dentry *child; - - tmp = this_parent->d_parent; - rcu_read_lock(); - spin_unlock(&this_parent->d_lock); - child = this_parent; - this_parent = tmp; - spin_lock(&this_parent->d_lock); - /* might go back up the wrong parent if we have had a rename - * or deletion */ - if (this_parent != child->d_parent || - (!locked && read_seqretry(&rename_lock, seq))) { - spin_unlock(&this_parent->d_lock); - rcu_read_unlock(); + struct dentry *child = this_parent; + this_parent = try_to_ascend(this_parent, locked, seq); + if (!this_parent) goto rename_retry; - } - rcu_read_unlock(); next = child->d_u.d_child.next; goto resume; } @@ -1181,24 +1195,10 @@ resume: * All done at this level ... ascend and resume the search. */ if (this_parent != parent) { - struct dentry *tmp; - struct dentry *child; - - tmp = this_parent->d_parent; - rcu_read_lock(); - spin_unlock(&this_parent->d_lock); - child = this_parent; - this_parent = tmp; - spin_lock(&this_parent->d_lock); - /* might go back up the wrong parent if we have had a rename - * or deletion */ - if (this_parent != child->d_parent || - (!locked && read_seqretry(&rename_lock, seq))) { - spin_unlock(&this_parent->d_lock); - rcu_read_unlock(); + struct dentry *child = this_parent; + this_parent = try_to_ascend(this_parent, locked, seq); + if (!this_parent) goto rename_retry; - } - rcu_read_unlock(); next = child->d_u.d_child.next; goto resume; } @@ -2942,28 +2942,14 @@ resume: spin_unlock(&dentry->d_lock); } if (this_parent != root) { - struct dentry *tmp; - struct dentry *child; - - tmp = this_parent->d_parent; + struct dentry *child = this_parent; if (!(this_parent->d_flags & DCACHE_GENOCIDE)) { this_parent->d_flags |= DCACHE_GENOCIDE; this_parent->d_count--; } - rcu_read_lock(); - spin_unlock(&this_parent->d_lock); - child = this_parent; - this_parent = tmp; - spin_lock(&this_parent->d_lock); - /* might go back up the wrong parent if we have had a rename - * or deletion */ - if (this_parent != child->d_parent || - (!locked && read_seqretry(&rename_lock, seq))) { - spin_unlock(&this_parent->d_lock); - rcu_read_unlock(); + this_parent = try_to_ascend(this_parent, locked, seq); + if (!this_parent) goto rename_retry; - } - rcu_read_unlock(); next = child->d_u.d_child.next; goto resume; } From c83ce989cb5ff86575821992ea82c4df5c388ebc Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Tue, 15 Mar 2011 13:36:43 -0400 Subject: [PATCH 702/706] VFS: Fix the nfs sillyrename regression in kernel 2.6.38 The new vfs locking scheme introduced in 2.6.38 breaks NFS sillyrename because the latter relies on being able to determine the parent directory of the dentry in the ->iput() callback in order to send the appropriate unlink rpc call. 
Looking at the code that cares about races with dput(), there doesn't seem to be anything that specifically uses d_parent as a test for whether or not there is a race: - __d_lookup_rcu(), __d_lookup() all test for d_hashed() after d_parent - shrink_dcache_for_umount() is safe since nothing else can rearrange the dentries in that super block. - have_submount(), select_parent() and d_genocide() can test for a deletion if we set the DCACHE_DISCONNECTED flag when the dentry is removed from the parent's d_subdirs list. Signed-off-by: Trond Myklebust Cc: stable@kernel.org (2.6.38, needs commit c826cb7dfce8 "dcache.c: create helper function for duplicated functionality" ) Signed-off-by: Linus Torvalds --- fs/dcache.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/fs/dcache.c b/fs/dcache.c index 361882a14ccb..a39fe47c466f 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -296,8 +296,12 @@ static struct dentry *d_kill(struct dentry *dentry, struct dentry *parent) __releases(parent->d_lock) __releases(dentry->d_inode->i_lock) { - dentry->d_parent = NULL; list_del(&dentry->d_u.d_child); + /* + * Inform try_to_ascend() that we are no longer attached to the + * dentry tree + */ + dentry->d_flags |= DCACHE_DISCONNECTED; if (parent) spin_unlock(&parent->d_lock); dentry_iput(dentry); @@ -1030,6 +1034,7 @@ static struct dentry *try_to_ascend(struct dentry *old, int locked, unsigned seq * or deletion */ if (new != old->d_parent || + (old->d_flags & DCACHE_DISCONNECTED) || (!locked && read_seqretry(&rename_lock, seq))) { spin_unlock(&new->d_lock); new = NULL; From 0e794589e588a88d34e339feee50c72606fb21a7 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Wed, 16 Mar 2011 02:45:02 -0400 Subject: [PATCH 703/706] fix follow_link() breakage commit 574197e0de46a8a4db5c54ef7b65e43ffa8873a7 had a missing piece, breaking the loop detection ;-/ Signed-off-by: Al Viro --- fs/namei.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/fs/namei.c b/fs/namei.c index 0a601cae23de..b912b7abe747 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -753,9 +753,11 @@ follow_link(struct path *link, struct nameidata *nd, void **p) BUG_ON(nd->flags & LOOKUP_RCU); + if (link->mnt == nd->path.mnt) + mntget(link->mnt); + if (unlikely(current->total_link_count >= 40)) { *p = ERR_PTR(-ELOOP); /* no ->put_link(), please */ - path_put_conditional(link, nd); path_put(&nd->path); return -ELOOP; } @@ -765,9 +767,6 @@ follow_link(struct path *link, struct nameidata *nd, void **p) touch_atime(link->mnt, dentry); nd_set_link(nd, NULL); - if (link->mnt == nd->path.mnt) - mntget(link->mnt); - error = security_inode_follow_link(link->dentry, nd); if (error) { *p = ERR_PTR(error); /* no ->put_link(), please */ From 5229645bdc35f1cc43eb8b25b6993c8fa58b4b43 Mon Sep 17 00:00:00 2001 From: Stephen Rothwell Date: Wed, 16 Mar 2011 18:09:27 +1100 Subject: [PATCH 704/706] vfs: add nonconflicting values for O_PATH [AV: on architectures where default conflicts with existing flags, that is] Signed-off-by: Stephen Rothwell Signed-off-by: Al Viro --- arch/alpha/include/asm/fcntl.h | 2 ++ arch/parisc/include/asm/fcntl.h | 2 ++ arch/sparc/include/asm/fcntl.h | 2 ++ 3 files changed, 6 insertions(+) diff --git a/arch/alpha/include/asm/fcntl.h b/arch/alpha/include/asm/fcntl.h index 70145cbb21cb..1b71ca70c9f6 100644 --- a/arch/alpha/include/asm/fcntl.h +++ b/arch/alpha/include/asm/fcntl.h @@ -31,6 +31,8 @@ #define __O_SYNC 020000000 #define O_SYNC (__O_SYNC|O_DSYNC) +#define O_PATH 040000000 + #define F_GETLK 7 #define F_SETLK 8 #define 
F_SETLKW 9 diff --git a/arch/parisc/include/asm/fcntl.h b/arch/parisc/include/asm/fcntl.h index f357fc693c89..0304b92ccfea 100644 --- a/arch/parisc/include/asm/fcntl.h +++ b/arch/parisc/include/asm/fcntl.h @@ -19,6 +19,8 @@ #define O_NOFOLLOW 000000200 /* don't follow links */ #define O_INVISIBLE 004000000 /* invisible I/O, for DMAPI/XDSM */ +#define O_PATH 020000000 + #define F_GETLK64 8 #define F_SETLK64 9 #define F_SETLKW64 10 diff --git a/arch/sparc/include/asm/fcntl.h b/arch/sparc/include/asm/fcntl.h index 38f37b333cc7..d0b83f66f356 100644 --- a/arch/sparc/include/asm/fcntl.h +++ b/arch/sparc/include/asm/fcntl.h @@ -34,6 +34,8 @@ #define __O_SYNC 0x800000 #define O_SYNC (__O_SYNC|O_DSYNC) +#define O_PATH 0x1000000 + #define F_GETOWN 5 /* for sockets. */ #define F_SETOWN 6 /* for sockets. */ #define F_GETLK 7 From bab1d9444d9a147f1dc3478dd06c16f490227f3e Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 15 Mar 2011 21:51:24 +0100 Subject: [PATCH 705/706] prune back iprune_sem iprune_sem is continuously giving us lockdep warnings because we do take it in read mode in the reclaim path, but we're also doing non-NOFS allocations under it taken in write mode. Taking a bit deeper look at it, I think it's fixable quite trivially: - for invalidate_inodes we do not need iprune_sem at all. We have an active reference on the superblock, so the filesystem is not going away until it has finished. - for evict_inodes we do need it, to make sure prune_icache has done its work before we tear down the superblock. But there is no reason to hold it over the actual reclaim operation - it's enough to cycle through it after the actual reclaim to make sure we wait for any pending prune_icache to complete. We just have to remove the WARN_ON for otherwise busy inodes as they can actually happen now. Signed-off-by: Christoph Hellwig Signed-off-by: Al Viro --- fs/inode.c | 32 ++++++++++++++------------------ 1 file changed, 14 insertions(+), 18 deletions(-) diff --git a/fs/inode.c b/fs/inode.c index 0647d80accf6..9910c039f026 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -84,16 +84,13 @@ static struct hlist_head *inode_hashtable __read_mostly; DEFINE_SPINLOCK(inode_lock); /* - * iprune_sem provides exclusion between the kswapd or try_to_free_pages - * icache shrinking path, and the umount path. Without this exclusion, - * by the time prune_icache calls iput for the inode whose pages it has - * been invalidating, or by the time it calls clear_inode & destroy_inode - * from its final dispose_list, the struct super_block they refer to - * (for inode->i_sb->s_op) may already have been freed and reused. + * iprune_sem provides exclusion between the icache shrinking and the + * umount path. * - * We make this an rwsem because the fastpath is icache shrinking. In - * some cases a filesystem may be doing a significant amount of work in - * its inode reclaim code, so this should improve parallelism. + * We don't actually need it to protect anything in the umount path, + * but only need to cycle through it to make sure any inode that + * prune_icache took off the LRU list has been fully torn down by the + * time we are past evict_inodes.
*/ static DECLARE_RWSEM(iprune_sem); @@ -516,17 +513,12 @@ void evict_inodes(struct super_block *sb) struct inode *inode, *next; LIST_HEAD(dispose); - down_write(&iprune_sem); - spin_lock(&inode_lock); list_for_each_entry_safe(inode, next, &sb->s_inodes, i_sb_list) { if (atomic_read(&inode->i_count)) continue; - - if (inode->i_state & (I_NEW | I_FREEING | I_WILL_FREE)) { - WARN_ON(1); + if (inode->i_state & (I_NEW | I_FREEING | I_WILL_FREE)) continue; - } inode->i_state |= I_FREEING; @@ -542,6 +534,13 @@ void evict_inodes(struct super_block *sb) spin_unlock(&inode_lock); dispose_list(&dispose); + + /* + * Cycle through iprune_sem to make sure any inode that prune_icache + * moved off the list before we took the lock has been fully torn + * down. + */ + down_write(&iprune_sem); up_write(&iprune_sem); } @@ -561,8 +560,6 @@ int invalidate_inodes(struct super_block *sb, bool kill_dirty) struct inode *inode, *next; LIST_HEAD(dispose); - down_write(&iprune_sem); - spin_lock(&inode_lock); list_for_each_entry_safe(inode, next, &sb->s_inodes, i_sb_list) { if (inode->i_state & (I_NEW | I_FREEING | I_WILL_FREE)) @@ -590,7 +587,6 @@ int invalidate_inodes(struct super_block *sb, bool kill_dirty) spin_unlock(&inode_lock); dispose_list(&dispose); - up_write(&iprune_sem); return busy; } From 34d211a2d5df4984a35b18d8ccacbe1d10abb067 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Wed, 16 Mar 2011 08:04:07 -0700 Subject: [PATCH 706/706] Increase OSF partition limit from 8 to 18 It turns out that while a maximum of 8 partitions may be what people "should" have had, you can actually fit up to 18 entries(*) in a sector. And some people clearly were taking advantage of that, like Michael Cree, who had ten partitions on one of his OSF disks. (*) The OSF partition data starts at byte offset 64 in the first sector, and the array of 16-byte partition entries start at offset 148 in the on-disk partition structure. Reported-by: Michael Cree Cc: stable@kernel.org (v2.6.38) Signed-off-by: Linus Torvalds --- fs/partitions/osf.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/partitions/osf.c b/fs/partitions/osf.c index be03a0b08b47..764b86a01965 100644 --- a/fs/partitions/osf.c +++ b/fs/partitions/osf.c @@ -10,7 +10,7 @@ #include "check.h" #include "osf.h" -#define MAX_OSF_PARTITIONS 8 +#define MAX_OSF_PARTITIONS 18 int osf_partition(struct parsed_partitions *state) {
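The new limit checks out against the geometry quoted in the log. As a back-of-the-envelope check (plain C, not code from the patch): with the disklabel at byte offset 64 of a 512-byte sector and the 16-byte partition entries starting at offset 148 inside the label, exactly 18 whole entries fit before the sector runs out; a 19th would end at byte 516.

#include <stdio.h>

int main(void)
{
        const int sector    = 512;      /* sector holding the OSF disklabel */
        const int label_off =  64;      /* disklabel offset in the sector   */
        const int array_off = 148;      /* partition array offset in label  */
        const int entry_sz  =  16;      /* bytes per partition entry        */

        /* whole entries that fit: (512 - 64 - 148) / 16 = 18 */
        printf("%d\n", (sector - label_off - array_off) / entry_sz);
        return 0;
}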