From 3f6316b4378e64e96acfe55b499648dc0e6b9603 Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Fri, 29 Aug 2014 15:18:26 -0700 Subject: [PATCH 01/22] checkpatch: relax check for length of git commit IDs Checkpatch currently warns if a git commit ID (in the changelog, usually) is less than 12 characters or more than 16. The "more than 16" is excessive. Change the check so we accept IDs from 12 to 40 chars in length. Cc: Geert Uytterhoeven Signed-off-by: Linus Torvalds --- scripts/checkpatch.pl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/scripts/checkpatch.pl b/scripts/checkpatch.pl index 31a731e06f50..b385bcbbf2f5 100755 --- a/scripts/checkpatch.pl +++ b/scripts/checkpatch.pl @@ -2133,7 +2133,7 @@ sub process { # Check for improperly formed commit descriptions if ($in_commit_log && $line =~ /\bcommit\s+[0-9a-f]{5,}/i && - $line !~ /\b[Cc]ommit [0-9a-f]{12,16} \("/) { + $line !~ /\b[Cc]ommit [0-9a-f]{12,40} \("/) { $line =~ /\b(c)ommit\s+([0-9a-f]{5,})/i; my $init_char = $1; my $orig_commit = lc($2); @@ -2141,7 +2141,7 @@ sub process { my $desc = 'commit description'; ($id, $desc) = git_commit_info($orig_commit, $id, $desc); ERROR("GIT_COMMIT_ID", - "Please use 12 to 16 chars for the git commit ID like: '${init_char}ommit $id (\"$desc\")'\n" . $herecurr); + "Please use 12 or more chars for the git commit ID like: '${init_char}ommit $id (\"$desc\")'\n" . $herecurr); } # Check for added, moved or deleted files From 800df627e2eabaf4a921d342a1d5162c843b7fc2 Mon Sep 17 00:00:00 2001 From: Vivek Goyal Date: Fri, 29 Aug 2014 15:18:29 -0700 Subject: [PATCH 02/22] resource: fix the case of null pointer access MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Richard and Daniel reported that UML is broken due to changes to resource traversal functions. Problem is that iomem_resource.child can be null and new code does not consider that possibility. Old code used a for loop and that loop will not even execute if p was null. Revert back to for() loop logic and bail out if p is null. I also moved sibling_only check out of resource_lock. There is no reason to keep it inside the lock. Following is backtrace of the UML crash. RIP: 0033:[<0000000060039b9f>] RSP: 0000000081459da0 EFLAGS: 00010202 RAX: 0000000000000000 RBX: 00000000219b3fff RCX: 000000006010d1d9 RDX: 0000000000000001 RSI: 00000000602dfb94 RDI: 0000000081459df8 RBP: 0000000081459de0 R08: 00000000601b59f4 R09: ffffffff0000ff00 R10: ffffffff0000ff00 R11: 0000000081459e88 R12: 0000000081459df8 R13: 00000000219b3fff R14: 00000000602dfb94 R15: 0000000000000000 Kernel panic - not syncing: Segfault with no mm CPU: 0 PID: 1 Comm: swapper Not tainted 3.16.0-10454-g58d08e3 #13 Stack: 00000000 000080d0 81459df0 219b3fff 81459e70 6010d1d9 ffffffff 6033e010 81459e50 6003a269 81459e30 00000000 Call Trace: [<6010d1d9>] ? kclist_add_private+0x0/0xe7 [<6003a269>] walk_system_ram_range+0x61/0xb7 [<6000e859>] ? proc_kcore_init+0x0/0xf1 [<6010d574>] kcore_update_ram+0x4c/0x168 [<6010d72e>] ? kclist_add+0x0/0x2e [<6000e943>] proc_kcore_init+0xea/0xf1 [<6000e859>] ? proc_kcore_init+0x0/0xf1 [<6000e859>] ? proc_kcore_init+0x0/0xf1 [<600189f0>] do_one_initcall+0x13c/0x204 [<6004ca46>] ? parse_args+0x1df/0x2e0 [<6004c82d>] ? parameq+0x0/0x3a [<601b5990>] ? strcpy+0x0/0x18 [<60001e1a>] kernel_init_freeable+0x240/0x31e [<6026f1c0>] kernel_init+0x12/0x148 [<60019fad>] new_thread_handler+0x81/0xa3 Fixes 8c86e70acead629aacb4a ("resource: provide new functions to walk through resources"). Reported-by: Daniel Walter Tested-by: Richard Weinberger Tested-by: Toralf Förster Tested-by: Daniel Walter Signed-off-by: Vivek Goyal Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/resource.c | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/kernel/resource.c b/kernel/resource.c index da14b8d09296..60c5a3856ab7 100644 --- a/kernel/resource.c +++ b/kernel/resource.c @@ -351,15 +351,12 @@ static int find_next_iomem_res(struct resource *res, char *name, end = res->end; BUG_ON(start >= end); + if (first_level_children_only) + sibling_only = true; + read_lock(&resource_lock); - if (first_level_children_only) { - p = iomem_resource.child; - sibling_only = true; - } else - p = &iomem_resource; - - while ((p = next_resource(p, sibling_only))) { + for (p = iomem_resource.child; p; p = next_resource(p, sibling_only)) { if (p->flags != res->flags) continue; if (name && strcmp(p->name, name)) From 0cfb8f0c3e21e36d4a6e472e4c419d58ba848698 Mon Sep 17 00:00:00 2001 From: Tang Chen Date: Fri, 29 Aug 2014 15:18:31 -0700 Subject: [PATCH 03/22] memblock, memhotplug: fix wrong type in memblock_find_in_range_node(). In memblock_find_in_range_node(), we defined ret as int. But it should be phys_addr_t because it is used to store the return value from __memblock_find_range_bottom_up(). The bug has not been triggered because when allocating low memory near the kernel end, the "int ret" won't turn out to be negative. When we started to allocate memory on other nodes, and the "int ret" could be minus. Then the kernel will panic. A simple way to reproduce this: comment out the following code in numa_init(), memblock_set_bottom_up(false); and the kernel won't boot. Reported-by: Xishi Qiu Signed-off-by: Tang Chen Tested-by: Xishi Qiu Cc: [3.13+] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memblock.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/mm/memblock.c b/mm/memblock.c index 6d2f219a48b0..70fad0c0dafb 100644 --- a/mm/memblock.c +++ b/mm/memblock.c @@ -192,8 +192,7 @@ phys_addr_t __init_memblock memblock_find_in_range_node(phys_addr_t size, phys_addr_t align, phys_addr_t start, phys_addr_t end, int nid) { - int ret; - phys_addr_t kernel_end; + phys_addr_t kernel_end, ret; /* pump up @end */ if (end == MEMBLOCK_ALLOC_ACCESSIBLE) From ce8369bcbeeea6dfe24a6c8f60d2fcfce0432830 Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Fri, 29 Aug 2014 15:18:33 -0700 Subject: [PATCH 04/22] mm: actually clear pmd_numa before invalidating Commit 67f87463d3a3 ("mm: clear pmd_numa before invalidating") cleared the NUMA bit in a copy of the PMD entry, but then wrote back the original Signed-off-by: Matthew Wilcox Acked-by: Mel Gorman Reviewed-by: Rik van Riel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/pgtable-generic.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/pgtable-generic.c b/mm/pgtable-generic.c index a8b919925934..dfb79e028ecb 100644 --- a/mm/pgtable-generic.c +++ b/mm/pgtable-generic.c @@ -195,7 +195,7 @@ void pmdp_invalidate(struct vm_area_struct *vma, unsigned long address, pmd_t entry = *pmdp; if (pmd_numa(entry)) entry = pmd_mknonnuma(entry); - set_pmd_at(vma->vm_mm, address, pmdp, pmd_mknotpresent(*pmdp)); + set_pmd_at(vma->vm_mm, address, pmdp, pmd_mknotpresent(entry)); flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE); } #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ From 0c38e1fe0fced6aa06dbf444f7203dd7f325e369 Mon Sep 17 00:00:00 2001 From: Dave Jones Date: Fri, 29 Aug 2014 15:18:35 -0700 Subject: [PATCH 05/22] lib: turn CONFIG_STACKTRACE into an actual option. I was puzzled why /proc/$$/stack had disappeared, until I figured out I had disabled the last debug option that did a 'select STACKTRACE'. This patch makes the option show up at config time, so it can be enabled without enabling any of the more heavyweight debug options. Signed-off-by: Dave Jones Acked-by: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- lib/Kconfig.debug | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index 1b233fc67466..a28590083622 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -1036,8 +1036,13 @@ config TRACE_IRQFLAGS either tracing or lock debugging. config STACKTRACE - bool + bool "Stack backtrace support" depends on STACKTRACE_SUPPORT + help + This option causes the kernel to create a /proc/pid/stack for + every process, showing its current stack trace. + It is also used by various kernel debugging features that require + stack trace generation. config DEBUG_KOBJECT bool "kobject debugging" From 0cf1e9d6c34d4c82ac3af8015594849814843d36 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Fri, 29 Aug 2014 15:18:37 -0700 Subject: [PATCH 06/22] zram: fix incorrect stat with failed_reads Since we allocate a temporary buffer in zram_bvec_read to handle partial page operations in commit 924bd88d703e ("Staging: zram: allow partial page operations"), our ->failed_reads value may be incorrect as we do not increase its value when failing to allocate the temporary buffer. Let's fix this issue and correct the annotation of failed_reads. Signed-off-by: Chao Yu Acked-by: Minchan Kim Cc: Nitin Gupta Acked-by: Jerome Marchand Acked-by: Sergey Senozhatsky Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/block/zram/zram_drv.c | 10 +++++++--- drivers/block/zram/zram_drv.h | 2 +- 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c index dfa4024c448a..d00831c3d731 100644 --- a/drivers/block/zram/zram_drv.c +++ b/drivers/block/zram/zram_drv.c @@ -378,7 +378,6 @@ static int zram_decompress_page(struct zram *zram, char *mem, u32 index) /* Should NEVER happen. Return bio error if it does. */ if (unlikely(ret)) { pr_err("Decompression failed! err=%d, page=%u\n", ret, index); - atomic64_inc(&zram->stats.failed_reads); return ret; } @@ -547,8 +546,6 @@ out: zcomp_strm_release(zram->comp, zstrm); if (is_partial_io(bvec)) kfree(uncmem); - if (ret) - atomic64_inc(&zram->stats.failed_writes); return ret; } @@ -566,6 +563,13 @@ static int zram_bvec_rw(struct zram *zram, struct bio_vec *bvec, u32 index, ret = zram_bvec_write(zram, bvec, index, offset); } + if (unlikely(ret)) { + if (rw == READ) + atomic64_inc(&zram->stats.failed_reads); + else + atomic64_inc(&zram->stats.failed_writes); + } + return ret; } diff --git a/drivers/block/zram/zram_drv.h b/drivers/block/zram/zram_drv.h index 5b0afde729cd..e0f725c87cc6 100644 --- a/drivers/block/zram/zram_drv.h +++ b/drivers/block/zram/zram_drv.h @@ -84,7 +84,7 @@ struct zram_stats { atomic64_t compr_data_size; /* compressed size of pages stored */ atomic64_t num_reads; /* failed + successful */ atomic64_t num_writes; /* --do-- */ - atomic64_t failed_reads; /* should NEVER! happen */ + atomic64_t failed_reads; /* can happen when memory is too low */ atomic64_t failed_writes; /* can happen when memory is too low */ atomic64_t invalid_io; /* non-page-aligned I/O requests */ atomic64_t notify_free; /* no. of swap slot free notifications */ From 137f8cff505ace6251dc442c7aa973d60c801a79 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Fri, 29 Aug 2014 15:18:40 -0700 Subject: [PATCH 07/22] mm/zpool: use prefixed module loading To avoid potential format string expansion via module parameters, do not use the zpool type directly in request_module() without a format string. Additionally, to avoid arbitrary modules being loaded via zpool API (e.g. via the zswap_zpool_type module parameter) add a "zpool-" prefix to the requested module, as well as module aliases for the existing zpool types (zbud and zsmalloc). Signed-off-by: Kees Cook Cc: Seth Jennings Cc: Minchan Kim Cc: Nitin Gupta Acked-by: Dan Streetman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/zbud.c | 1 + mm/zpool.c | 2 +- mm/zsmalloc.c | 1 + 3 files changed, 3 insertions(+), 1 deletion(-) diff --git a/mm/zbud.c b/mm/zbud.c index a05790b1915e..f26e7fcc7fa2 100644 --- a/mm/zbud.c +++ b/mm/zbud.c @@ -195,6 +195,7 @@ static struct zpool_driver zbud_zpool_driver = { .total_size = zbud_zpool_total_size, }; +MODULE_ALIAS("zpool-zbud"); #endif /* CONFIG_ZPOOL */ /***************** diff --git a/mm/zpool.c b/mm/zpool.c index e40612a1df00..739cdf0d183a 100644 --- a/mm/zpool.c +++ b/mm/zpool.c @@ -150,7 +150,7 @@ struct zpool *zpool_create_pool(char *type, gfp_t gfp, struct zpool_ops *ops) driver = zpool_get_driver(type); if (!driver) { - request_module(type); + request_module("zpool-%s", type); driver = zpool_get_driver(type); } diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c index 4e2fc83cb394..94f38fac5e81 100644 --- a/mm/zsmalloc.c +++ b/mm/zsmalloc.c @@ -315,6 +315,7 @@ static struct zpool_driver zs_zpool_driver = { .total_size = zs_zpool_total_size, }; +MODULE_ALIAS("zpool-zsmalloc"); #endif /* CONFIG_ZPOOL */ /* per-cpu VM mapping areas for zspage accesses that cross page boundaries */ From 7ea8574e5fa31f43d8098a028f12ba6a9c9f3530 Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Fri, 29 Aug 2014 15:18:42 -0700 Subject: [PATCH 08/22] hugetlb_cgroup: use lockdep_assert_held rather than spin_is_locked spin_lock may be an empty struct for !SMP configurations and so arch_spin_is_locked may return unconditional 0 and trigger the VM_BUG_ON even when the lock is held. Replace spin_is_locked by lockdep_assert_held. We will not BUG anymore but it is questionable whether crashing makes a lot of sense in the uncharge path. Uncharge happens after the last page reference was released so nobody should touch the page and the function doesn't update any shared state except for res counter which uses synchronization of its own. Signed-off-by: Michal Hocko Reviewed-by: Aneesh Kumar K.V Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/hugetlb_cgroup.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/hugetlb_cgroup.c b/mm/hugetlb_cgroup.c index 9eebfadeeee1..a67c26e0f360 100644 --- a/mm/hugetlb_cgroup.c +++ b/mm/hugetlb_cgroup.c @@ -217,7 +217,7 @@ void hugetlb_cgroup_uncharge_page(int idx, unsigned long nr_pages, if (hugetlb_cgroup_disabled()) return; - VM_BUG_ON(!spin_is_locked(&hugetlb_lock)); + lockdep_assert_held(&hugetlb_lock); h_cg = hugetlb_cgroup_from_page(page); if (unlikely(!h_cg)) return; From b38af4721f59d0b564468f623b3e52a638195015 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Fri, 29 Aug 2014 15:18:44 -0700 Subject: [PATCH 09/22] x86,mm: fix pte_special versus pte_numa Sasha Levin has shown oopses on ffffea0003480048 and ffffea0003480008 at mm/memory.c:1132, running Trinity on different 3.16-rc-next kernels: where zap_pte_range() checks page->mapping to see if PageAnon(page). Those addresses fit struct pages for pfns d2001 and d2000, and in each dump a register or a stack slot showed d2001730 or d2000730: pte flags 0x730 are PCD ACCESSED PROTNONE SPECIAL IOMAP; and Sasha's e820 map has a hole between cfffffff and 100000000, which would need special access. Commit c46a7c817e66 ("x86: define _PAGE_NUMA by reusing software bits on the PMD and PTE levels") has broken vm_normal_page(): a PROTNONE SPECIAL pte no longer passes the pte_special() test, so zap_pte_range() goes on to try to access a non-existent struct page. Fix this by refining pte_special() (SPECIAL with PRESENT or PROTNONE) to complement pte_numa() (SPECIAL with neither PRESENT nor PROTNONE). A hint that this was a problem was that c46a7c817e66 added pte_numa() test to vm_normal_page(), and moved its is_zero_pfn() test from slow to fast path: This was papering over a pte_special() snag when the zero page was encountered during zap. This patch reverts vm_normal_page() to how it was before, relying on pte_special(). It still appears that this patch may be incomplete: aren't there other places which need to be handling PROTNONE along with PRESENT? For example, pte_mknuma() clears _PAGE_PRESENT and sets _PAGE_NUMA, but on a PROT_NONE area, that would make it pte_special(). This is side-stepped by the fact that NUMA hinting faults skipped PROT_NONE VMAs and there are no grounds where a NUMA hinting fault on a PROT_NONE VMA would be interesting. Fixes: c46a7c817e66 ("x86: define _PAGE_NUMA by reusing software bits on the PMD and PTE levels") Reported-by: Sasha Levin Tested-by: Sasha Levin Signed-off-by: Hugh Dickins Signed-off-by: Mel Gorman Cc: "Kirill A. Shutemov" Cc: Peter Zijlstra Cc: Rik van Riel Cc: Johannes Weiner Cc: Cyrill Gorcunov Cc: Matthew Wilcox Cc: [3.16] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/include/asm/pgtable.h | 9 +++++++-- mm/memory.c | 7 +++---- 2 files changed, 10 insertions(+), 6 deletions(-) diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h index 0ec056012618..aa97a070f09f 100644 --- a/arch/x86/include/asm/pgtable.h +++ b/arch/x86/include/asm/pgtable.h @@ -131,8 +131,13 @@ static inline int pte_exec(pte_t pte) static inline int pte_special(pte_t pte) { - return (pte_flags(pte) & (_PAGE_PRESENT|_PAGE_SPECIAL)) == - (_PAGE_PRESENT|_PAGE_SPECIAL); + /* + * See CONFIG_NUMA_BALANCING pte_numa in include/asm-generic/pgtable.h. + * On x86 we have _PAGE_BIT_NUMA == _PAGE_BIT_GLOBAL+1 == + * __PAGE_BIT_SOFTW1 == _PAGE_BIT_SPECIAL. + */ + return (pte_flags(pte) & _PAGE_SPECIAL) && + (pte_flags(pte) & (_PAGE_PRESENT|_PAGE_PROTNONE)); } static inline unsigned long pte_pfn(pte_t pte) diff --git a/mm/memory.c b/mm/memory.c index ab3537bcfed2..adeac306610f 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -751,7 +751,7 @@ struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr, unsigned long pfn = pte_pfn(pte); if (HAVE_PTE_SPECIAL) { - if (likely(!pte_special(pte) || pte_numa(pte))) + if (likely(!pte_special(pte))) goto check_pfn; if (vma->vm_flags & (VM_PFNMAP | VM_MIXEDMAP)) return NULL; @@ -777,15 +777,14 @@ struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr, } } + if (is_zero_pfn(pfn)) + return NULL; check_pfn: if (unlikely(pfn > highest_memmap_pfn)) { print_bad_pte(vma, addr, pte, NULL); return NULL; } - if (is_zero_pfn(pfn)) - return NULL; - /* * NOTE! We still have PageReserved() pages in the page tables. * eg. VDSO mappings can cause them to exist. From 74ca317c26a3f8543203b61d262c0ab2e30c384e Mon Sep 17 00:00:00 2001 From: Vivek Goyal Date: Fri, 29 Aug 2014 15:18:46 -0700 Subject: [PATCH 10/22] kexec: create a new config option CONFIG_KEXEC_FILE for new syscall Currently new system call kexec_file_load() and all the associated code compiles if CONFIG_KEXEC=y. But new syscall also compiles purgatory code which currently uses gcc option -mcmodel=large. This option seems to be available only gcc 4.4 onwards. Hiding new functionality behind a new config option will not break existing users of old gcc. Those who wish to enable new functionality will require new gcc. Having said that, I am trying to figure out how can I move away from using -mcmodel=large but that can take a while. I think there are other advantages of introducing this new config option. As this option will be enabled only on x86_64, other arches don't have to compile generic kexec code which will never be used. This new code selects CRYPTO=y and CRYPTO_SHA256=y. And all other arches had to do this for CONFIG_KEXEC. Now with introduction of new config option, we can remove crypto dependency from other arches. Now CONFIG_KEXEC_FILE is available only on x86_64. So whereever I had CONFIG_X86_64 defined, I got rid of that. For CONFIG_KEXEC_FILE, instead of doing select CRYPTO=y, I changed it to "depends on CRYPTO=y". This should be safer as "select" is not recursive. Signed-off-by: Vivek Goyal Cc: Eric Biederman Cc: H. Peter Anvin Tested-by: Shaun Ruffell Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/Kbuild | 4 +--- arch/x86/Kconfig | 18 ++++++++++++++---- arch/x86/Makefile | 5 +---- arch/x86/kernel/Makefile | 2 +- arch/x86/kernel/crash.c | 6 ++---- arch/x86/kernel/machine_kexec_64.c | 11 +++++++++++ arch/x86/purgatory/Makefile | 5 +---- kernel/kexec.c | 11 +++++++++++ 8 files changed, 42 insertions(+), 20 deletions(-) diff --git a/arch/x86/Kbuild b/arch/x86/Kbuild index 61b6d51866f8..3942f74c92d7 100644 --- a/arch/x86/Kbuild +++ b/arch/x86/Kbuild @@ -17,6 +17,4 @@ obj-$(CONFIG_IA32_EMULATION) += ia32/ obj-y += platform/ obj-y += net/ -ifeq ($(CONFIG_X86_64),y) -obj-$(CONFIG_KEXEC) += purgatory/ -endif +obj-$(CONFIG_KEXEC_FILE) += purgatory/ diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 5d0bf1aa9dcb..778178f4c7d1 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -1585,9 +1585,6 @@ source kernel/Kconfig.hz config KEXEC bool "kexec system call" - select BUILD_BIN2C - select CRYPTO - select CRYPTO_SHA256 ---help--- kexec is a system call that implements the ability to shutdown your current kernel, and to start another kernel. It is like a reboot @@ -1602,9 +1599,22 @@ config KEXEC interface is strongly in flux, so no good recommendation can be made. +config KEXEC_FILE + bool "kexec file based system call" + select BUILD_BIN2C + depends on KEXEC + depends on X86_64 + depends on CRYPTO=y + depends on CRYPTO_SHA256=y + ---help--- + This is new version of kexec system call. This system call is + file based and takes file descriptors as system call argument + for kernel and initramfs as opposed to list of segments as + accepted by previous system call. + config KEXEC_VERIFY_SIG bool "Verify kernel signature during kexec_file_load() syscall" - depends on KEXEC + depends on KEXEC_FILE ---help--- This option makes kernel signature verification mandatory for kexec_file_load() syscall. If kernel is signature can not be diff --git a/arch/x86/Makefile b/arch/x86/Makefile index c1aa36887843..c96bcec544fc 100644 --- a/arch/x86/Makefile +++ b/arch/x86/Makefile @@ -184,11 +184,8 @@ archheaders: $(Q)$(MAKE) $(build)=arch/x86/syscalls all archprepare: -ifeq ($(CONFIG_KEXEC),y) -# Build only for 64bit. No loaders for 32bit yet. - ifeq ($(CONFIG_X86_64),y) +ifeq ($(CONFIG_KEXEC_FILE),y) $(Q)$(MAKE) $(build)=arch/x86/purgatory arch/x86/purgatory/kexec-purgatory.c - endif endif ### diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index b5ea75c4a4b4..ada2e2d6be3e 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -71,6 +71,7 @@ obj-$(CONFIG_FTRACE_SYSCALLS) += ftrace.o obj-$(CONFIG_X86_TSC) += trace_clock.o obj-$(CONFIG_KEXEC) += machine_kexec_$(BITS).o obj-$(CONFIG_KEXEC) += relocate_kernel_$(BITS).o crash.o +obj-$(CONFIG_KEXEC_FILE) += kexec-bzimage64.o obj-$(CONFIG_CRASH_DUMP) += crash_dump_$(BITS).o obj-y += kprobes/ obj-$(CONFIG_MODULES) += module.o @@ -118,5 +119,4 @@ ifeq ($(CONFIG_X86_64),y) obj-$(CONFIG_PCI_MMCONFIG) += mmconf-fam10h_64.o obj-y += vsmp_64.o - obj-$(CONFIG_KEXEC) += kexec-bzimage64.o endif diff --git a/arch/x86/kernel/crash.c b/arch/x86/kernel/crash.c index 0553a34fa0df..a618fcd2c07d 100644 --- a/arch/x86/kernel/crash.c +++ b/arch/x86/kernel/crash.c @@ -182,8 +182,7 @@ void native_machine_crash_shutdown(struct pt_regs *regs) crash_save_cpu(regs, safe_smp_processor_id()); } -#ifdef CONFIG_X86_64 - +#ifdef CONFIG_KEXEC_FILE static int get_nr_ram_ranges_callback(unsigned long start_pfn, unsigned long nr_pfn, void *arg) { @@ -696,5 +695,4 @@ int crash_load_segments(struct kimage *image) return ret; } - -#endif /* CONFIG_X86_64 */ +#endif /* CONFIG_KEXEC_FILE */ diff --git a/arch/x86/kernel/machine_kexec_64.c b/arch/x86/kernel/machine_kexec_64.c index 8b04018e5d1f..485981059a40 100644 --- a/arch/x86/kernel/machine_kexec_64.c +++ b/arch/x86/kernel/machine_kexec_64.c @@ -25,9 +25,11 @@ #include #include +#ifdef CONFIG_KEXEC_FILE static struct kexec_file_ops *kexec_file_loaders[] = { &kexec_bzImage64_ops, }; +#endif static void free_transition_pgtable(struct kimage *image) { @@ -178,6 +180,7 @@ static void load_segments(void) ); } +#ifdef CONFIG_KEXEC_FILE /* Update purgatory as needed after various image segments have been prepared */ static int arch_update_purgatory(struct kimage *image) { @@ -209,6 +212,12 @@ static int arch_update_purgatory(struct kimage *image) return ret; } +#else /* !CONFIG_KEXEC_FILE */ +static inline int arch_update_purgatory(struct kimage *image) +{ + return 0; +} +#endif /* CONFIG_KEXEC_FILE */ int machine_kexec_prepare(struct kimage *image) { @@ -329,6 +338,7 @@ void arch_crash_save_vmcoreinfo(void) /* arch-dependent functionality related to kexec file-based syscall */ +#ifdef CONFIG_KEXEC_FILE int arch_kexec_kernel_image_probe(struct kimage *image, void *buf, unsigned long buf_len) { @@ -522,3 +532,4 @@ overflow: (int)ELF64_R_TYPE(rel[i].r_info), value); return -ENOEXEC; } +#endif /* CONFIG_KEXEC_FILE */ diff --git a/arch/x86/purgatory/Makefile b/arch/x86/purgatory/Makefile index 7fde9ee438a4..c4ae06e4ae74 100644 --- a/arch/x86/purgatory/Makefile +++ b/arch/x86/purgatory/Makefile @@ -24,7 +24,4 @@ $(obj)/kexec-purgatory.c: $(obj)/purgatory.ro FORCE $(call if_changed,bin2c) -# No loaders for 32bits yet. -ifeq ($(CONFIG_X86_64),y) - obj-$(CONFIG_KEXEC) += kexec-purgatory.o -endif +obj-$(CONFIG_KEXEC_FILE) += kexec-purgatory.o diff --git a/kernel/kexec.c b/kernel/kexec.c index 0b49a0a58102..2bee072268d9 100644 --- a/kernel/kexec.c +++ b/kernel/kexec.c @@ -64,7 +64,9 @@ bool kexec_in_progress = false; char __weak kexec_purgatory[0]; size_t __weak kexec_purgatory_size = 0; +#ifdef CONFIG_KEXEC_FILE static int kexec_calculate_store_digests(struct kimage *image); +#endif /* Location of the reserved area for the crash kernel */ struct resource crashk_res = { @@ -341,6 +343,7 @@ out_free_image: return ret; } +#ifdef CONFIG_KEXEC_FILE static int copy_file_from_fd(int fd, void **buf, unsigned long *buf_len) { struct fd f = fdget(fd); @@ -612,6 +615,9 @@ out_free_image: kfree(image); return ret; } +#else /* CONFIG_KEXEC_FILE */ +static inline void kimage_file_post_load_cleanup(struct kimage *image) { } +#endif /* CONFIG_KEXEC_FILE */ static int kimage_is_destination_range(struct kimage *image, unsigned long start, @@ -1375,6 +1381,7 @@ COMPAT_SYSCALL_DEFINE4(kexec_load, compat_ulong_t, entry, } #endif +#ifdef CONFIG_KEXEC_FILE SYSCALL_DEFINE5(kexec_file_load, int, kernel_fd, int, initrd_fd, unsigned long, cmdline_len, const char __user *, cmdline_ptr, unsigned long, flags) @@ -1451,6 +1458,8 @@ out: return ret; } +#endif /* CONFIG_KEXEC_FILE */ + void crash_kexec(struct pt_regs *regs) { /* Take the kexec_mutex here to prevent sys_kexec_load @@ -2006,6 +2015,7 @@ static int __init crash_save_vmcoreinfo_init(void) subsys_initcall(crash_save_vmcoreinfo_init); +#ifdef CONFIG_KEXEC_FILE static int __kexec_add_segment(struct kimage *image, char *buf, unsigned long bufsz, unsigned long mem, unsigned long memsz) @@ -2682,6 +2692,7 @@ int kexec_purgatory_get_set_symbol(struct kimage *image, const char *name, return 0; } +#endif /* CONFIG_KEXEC_FILE */ /* * Move into place and start executing a preloaded standalone From b41d34b46aa91e0c83300c38eebc0b8762705e82 Mon Sep 17 00:00:00 2001 From: Vivek Goyal Date: Fri, 29 Aug 2014 15:18:49 -0700 Subject: [PATCH 11/22] kexec: remove CONFIG_KEXEC dependency on crypto New system call depends on crypto. As it did not have a separate config option, CONFIG_KEXEC was modified to select CRYPTO and CRYPTO_SHA256. But now previous patch introduced a new config option for new syscall. So CONFIG_KEXEC does not require crypto. Remove that dependency. Signed-off-by: Vivek Goyal Cc: Eric Biederman Cc: H. Peter Anvin Cc: Shaun Ruffell Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/arm/Kconfig | 2 -- arch/ia64/Kconfig | 2 -- arch/m68k/Kconfig | 2 -- arch/mips/Kconfig | 2 -- arch/powerpc/Kconfig | 2 -- arch/s390/Kconfig | 2 -- arch/sh/Kconfig | 2 -- arch/tile/Kconfig | 2 -- 8 files changed, 16 deletions(-) diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig index c49a775937db..32cbbd565902 100644 --- a/arch/arm/Kconfig +++ b/arch/arm/Kconfig @@ -1983,8 +1983,6 @@ config XIP_PHYS_ADDR config KEXEC bool "Kexec system call (EXPERIMENTAL)" depends on (!SMP || PM_SLEEP_SMP) - select CRYPTO - select CRYPTO_SHA256 help kexec is a system call that implements the ability to shutdown your current kernel, and to start another kernel. It is like a reboot diff --git a/arch/ia64/Kconfig b/arch/ia64/Kconfig index 64aefb76bd69..c84c88bbbbd7 100644 --- a/arch/ia64/Kconfig +++ b/arch/ia64/Kconfig @@ -549,8 +549,6 @@ source "drivers/sn/Kconfig" config KEXEC bool "kexec system call" depends on !IA64_HP_SIM && (!SMP || HOTPLUG_CPU) - select CRYPTO - select CRYPTO_SHA256 help kexec is a system call that implements the ability to shutdown your current kernel, and to start another kernel. It is like a reboot diff --git a/arch/m68k/Kconfig b/arch/m68k/Kconfig index 3ff8c9a25335..87b7c7581b1d 100644 --- a/arch/m68k/Kconfig +++ b/arch/m68k/Kconfig @@ -91,8 +91,6 @@ config MMU_SUN3 config KEXEC bool "kexec system call" depends on M68KCLASSIC - select CRYPTO - select CRYPTO_SHA256 help kexec is a system call that implements the ability to shutdown your current kernel, and to start another kernel. It is like a reboot diff --git a/arch/mips/Kconfig b/arch/mips/Kconfig index df51e78a72cc..900c7e5333b6 100644 --- a/arch/mips/Kconfig +++ b/arch/mips/Kconfig @@ -2396,8 +2396,6 @@ source "kernel/Kconfig.preempt" config KEXEC bool "Kexec system call" - select CRYPTO - select CRYPTO_SHA256 help kexec is a system call that implements the ability to shutdown your current kernel, and to start another kernel. It is like a reboot diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig index a577609f8ed6..4bc7b62fb4b6 100644 --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig @@ -399,8 +399,6 @@ config PPC64_SUPPORTS_MEMORY_FAILURE config KEXEC bool "kexec system call" depends on (PPC_BOOK3S || FSL_BOOKE || (44x && !SMP)) - select CRYPTO - select CRYPTO_SHA256 help kexec is a system call that implements the ability to shutdown your current kernel, and to start another kernel. It is like a reboot diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig index ab39ceb89ecf..05c78bb5f570 100644 --- a/arch/s390/Kconfig +++ b/arch/s390/Kconfig @@ -48,8 +48,6 @@ config ARCH_SUPPORTS_DEBUG_PAGEALLOC config KEXEC def_bool y - select CRYPTO - select CRYPTO_SHA256 config AUDIT_ARCH def_bool y diff --git a/arch/sh/Kconfig b/arch/sh/Kconfig index b319846ad97f..244fb4c81e25 100644 --- a/arch/sh/Kconfig +++ b/arch/sh/Kconfig @@ -598,8 +598,6 @@ source kernel/Kconfig.hz config KEXEC bool "kexec system call (EXPERIMENTAL)" depends on SUPERH32 && MMU - select CRYPTO - select CRYPTO_SHA256 help kexec is a system call that implements the ability to shutdown your current kernel, and to start another kernel. It is like a reboot diff --git a/arch/tile/Kconfig b/arch/tile/Kconfig index a3ffe2dd4832..7fcd492adbfc 100644 --- a/arch/tile/Kconfig +++ b/arch/tile/Kconfig @@ -191,8 +191,6 @@ source "kernel/Kconfig.hz" config KEXEC bool "kexec system call" - select CRYPTO - select CRYPTO_SHA256 ---help--- kexec is a system call that implements the ability to shutdown your current kernel, and to start another kernel. It is like a reboot From bfcfd44cce2774f19daeb59fb4e43fc9aa80e7b8 Mon Sep 17 00:00:00 2001 From: Filipe Brandenburger Date: Fri, 29 Aug 2014 15:18:51 -0700 Subject: [PATCH 12/22] xattr: fix check for simultaneous glibc header inclusion The guard was introduced in commit ea1a8217b06b ("xattr: guard against simultaneous glibc header inclusion") but it is using #ifdef to check for a define that is either set to 1 or 0. Fix it to use #if instead. * Without this patch: $ { echo "#include "; echo "#include "; } | gcc -E -Iinclude/uapi - >/dev/null include/uapi/linux/xattr.h:19:0: warning: "XATTR_CREATE" redefined [enabled by default] #define XATTR_CREATE 0x1 /* set value, fail if attr already exists */ ^ /usr/include/x86_64-linux-gnu/sys/xattr.h:32:0: note: this is the location of the previous definition #define XATTR_CREATE XATTR_CREATE ^ * With this patch: $ { echo "#include "; echo "#include "; } | gcc -E -Iinclude/uapi - >/dev/null (no warnings) Signed-off-by: Filipe Brandenburger Acked-by: Serge E. Hallyn Cc: Allan McRae Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/uapi/linux/xattr.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/uapi/linux/xattr.h b/include/uapi/linux/xattr.h index c38355c1f3c9..1590c49cae57 100644 --- a/include/uapi/linux/xattr.h +++ b/include/uapi/linux/xattr.h @@ -13,7 +13,7 @@ #ifndef _UAPI_LINUX_XATTR_H #define _UAPI_LINUX_XATTR_H -#ifdef __UAPI_DEF_XATTR +#if __UAPI_DEF_XATTR #define __USE_KERNEL_XATTR_DEFS #define XATTR_CREATE 0x1 /* set value, fail if attr already exists */ From b7d5b9a9686674eedffe5b8745c85265f3fe3367 Mon Sep 17 00:00:00 2001 From: Bartlomiej Zolnierkiewicz Date: Fri, 29 Aug 2014 15:18:53 -0700 Subject: [PATCH 13/22] drivers/rtc/rtc-s5m.c: re-add support for devices without irq specified The rtc-s5m driver used to support devices without irq specified in the past. Re-add this support. The patch fixes boot for Insignal's Exynos4412 based Origen board. Error messages before the patch: ... Unable to handle kernel NULL pointer dereference at virtual address 00000094 pgd = c0004000 [00000094] *pgd=00000000 Internal error: Oops: 5 [#1] PREEMPT SMP ARM Modules linked in: CPU: 1 PID: 1 Comm: swapper/0 Not tainted 3.16.0-next-20140804-00008-ga59480f-dirty #701 task: ea80f000 ti: ea882000 task.ti: ea882000 PC is at regmap_irq_get_virq+0x0/0x28 LR is at s5m_rtc_probe+0xdc/0x310 pc : [] lr : [] psr: 80000153 sp : ea883e48 ip : 00000000 fp : 00000000 r10: 0000000c r9 : c05de7ac r8 : eaabc600 r7 : eaa6b4d0 r6 : c0439e8c r5 : eaabc610 r4 : eab30e50 r3 : 00000000 r2 : 00000000 r1 : 0000000c r0 : 00000000 Flags: Nzcv IRQs on FIQs off Mode SVC_32 ISA ARM Segment kernel Control: 10c5387d Table: 4000404a DAC: 00000015 Process swapper/0 (pid: 1, stack limit = 0xea882240) Backtrace: regmap_irq_get_virq s5m_rtc_probe platform_drv_probe driver_probe_device __driver_attach bus_for_each_dev bus_add_driver driver_register do_one_initcall kernel_init_freeable kernel_init ---[ end trace a954d7f019122700 ]--- Kernel panic - not syncing: Attempted to kill init! exitcode=0x0000000b ... Signed-off-by: Bartlomiej Zolnierkiewicz Reviewed-by: Krzysztof Kozlowski Tested-by: Krzysztof Kozlowski Acked-by: Kyungmin Park Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/rtc/rtc-s5m.c | 21 ++++++++++++++------- 1 file changed, 14 insertions(+), 7 deletions(-) diff --git a/drivers/rtc/rtc-s5m.c b/drivers/rtc/rtc-s5m.c index 8f06250a0389..8754c33361e8 100644 --- a/drivers/rtc/rtc-s5m.c +++ b/drivers/rtc/rtc-s5m.c @@ -717,12 +717,14 @@ static int s5m_rtc_probe(struct platform_device *pdev) info->device_type = s5m87xx->device_type; info->wtsr_smpl = s5m87xx->wtsr_smpl; - info->irq = regmap_irq_get_virq(s5m87xx->irq_data, alarm_irq); - if (info->irq <= 0) { - ret = -EINVAL; - dev_err(&pdev->dev, "Failed to get virtual IRQ %d\n", + if (s5m87xx->irq_data) { + info->irq = regmap_irq_get_virq(s5m87xx->irq_data, alarm_irq); + if (info->irq <= 0) { + ret = -EINVAL; + dev_err(&pdev->dev, "Failed to get virtual IRQ %d\n", alarm_irq); - goto err; + goto err; + } } platform_set_drvdata(pdev, info); @@ -744,6 +746,11 @@ static int s5m_rtc_probe(struct platform_device *pdev) goto err; } + if (!info->irq) { + dev_info(&pdev->dev, "Alarm IRQ not available\n"); + return 0; + } + ret = devm_request_threaded_irq(&pdev->dev, info->irq, NULL, s5m_rtc_alarm_irq, 0, "rtc-alarm0", info); @@ -802,7 +809,7 @@ static int s5m_rtc_resume(struct device *dev) struct s5m_rtc_info *info = dev_get_drvdata(dev); int ret = 0; - if (device_may_wakeup(dev)) + if (info->irq && device_may_wakeup(dev)) ret = disable_irq_wake(info->irq); return ret; @@ -813,7 +820,7 @@ static int s5m_rtc_suspend(struct device *dev) struct s5m_rtc_info *info = dev_get_drvdata(dev); int ret = 0; - if (device_may_wakeup(dev)) + if (info->irq && device_may_wakeup(dev)) ret = enable_irq_wake(info->irq); return ret; From 4df4185a596c7301be0d8e1fdf691ec03e57cd23 Mon Sep 17 00:00:00 2001 From: Vivek Goyal Date: Fri, 29 Aug 2014 15:18:55 -0700 Subject: [PATCH 14/22] x86/purgatory: use approprate -m64/-32 build flag for arch/x86/purgatory Thomas reported that build of x86_64 kernel was failing for him. He is using 32bit tool chain. Problem is that while compiling purgatory, I have not specified -m64 flag. And 32bit tool chain must be assuming -m32 by default. Following is error message. (mini) [~/work/linux-2.6] make scripts/kconfig/conf --silentoldconfig Kconfig CHK include/config/kernel.release UPD include/config/kernel.release CHK include/generated/uapi/linux/version.h CHK include/generated/utsrelease.h UPD include/generated/utsrelease.h CC arch/x86/purgatory/purgatory.o arch/x86/purgatory/purgatory.c:1:0: error: code model 'large' not supported in the 32 bit mode Fix it by explicitly passing appropriate -m64/-m32 build flag for purgatory. Reported-by: Thomas Glanzmann Tested-by: Thomas Glanzmann Suggested-by: H. Peter Anvin Signed-off-by: Vivek Goyal Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/purgatory/Makefile | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/x86/purgatory/Makefile b/arch/x86/purgatory/Makefile index c4ae06e4ae74..899dd2454256 100644 --- a/arch/x86/purgatory/Makefile +++ b/arch/x86/purgatory/Makefile @@ -11,6 +11,7 @@ targets += purgatory.ro # sure how to relocate those. Like kexec-tools, use custom flags. KBUILD_CFLAGS := -fno-strict-aliasing -Wall -Wstrict-prototypes -fno-zero-initialized-in-bss -fno-builtin -ffreestanding -c -MD -Os -mcmodel=large +KBUILD_CFLAGS += -m$(BITS) $(obj)/purgatory.ro: $(PURGATORY_OBJS) FORCE $(call if_changed,ld) From 2b462638e41ea62230297c21c4da9955937b7a3c Mon Sep 17 00:00:00 2001 From: Ben Hutchings Date: Fri, 29 Aug 2014 15:18:58 -0700 Subject: [PATCH 15/22] ocfs2: do not write error flag to user structure we cannot copy from/to If we failed to copy from the structure, writing back the flags leaks 31 bits of kernel memory (the rest of the ir_flags field). In any case, if we cannot copy from/to the structure, why should we expect putting just the flags to work? Also make sure ocfs2_info_handle_freeinode() returns the right error code if the copy_to_user() fails. Fixes: ddee5cdb70e6 ('Ocfs2: Add new OCFS2_IOC_INFO ioctl for ocfs2 v8.') Signed-off-by: Ben Hutchings Cc: Joel Becker Acked-by: Mark Fasheh Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ocfs2/ioctl.c | 129 ++++++++++++++++------------------------------- 1 file changed, 43 insertions(+), 86 deletions(-) diff --git a/fs/ocfs2/ioctl.c b/fs/ocfs2/ioctl.c index 6f66b3751ace..53e6c40ed4c6 100644 --- a/fs/ocfs2/ioctl.c +++ b/fs/ocfs2/ioctl.c @@ -35,9 +35,8 @@ copy_to_user((typeof(a) __user *)b, &(a), sizeof(a)) /* - * This call is void because we are already reporting an error that may - * be -EFAULT. The error will be returned from the ioctl(2) call. It's - * just a best-effort to tell userspace that this request caused the error. + * This is just a best-effort to tell userspace that this request + * caused the error. */ static inline void o2info_set_request_error(struct ocfs2_info_request *kreq, struct ocfs2_info_request __user *req) @@ -146,136 +145,105 @@ bail: static int ocfs2_info_handle_blocksize(struct inode *inode, struct ocfs2_info_request __user *req) { - int status = -EFAULT; struct ocfs2_info_blocksize oib; if (o2info_from_user(oib, req)) - goto bail; + return -EFAULT; oib.ib_blocksize = inode->i_sb->s_blocksize; o2info_set_request_filled(&oib.ib_req); if (o2info_to_user(oib, req)) - goto bail; + return -EFAULT; - status = 0; -bail: - if (status) - o2info_set_request_error(&oib.ib_req, req); - - return status; + return 0; } static int ocfs2_info_handle_clustersize(struct inode *inode, struct ocfs2_info_request __user *req) { - int status = -EFAULT; struct ocfs2_info_clustersize oic; struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); if (o2info_from_user(oic, req)) - goto bail; + return -EFAULT; oic.ic_clustersize = osb->s_clustersize; o2info_set_request_filled(&oic.ic_req); if (o2info_to_user(oic, req)) - goto bail; + return -EFAULT; - status = 0; -bail: - if (status) - o2info_set_request_error(&oic.ic_req, req); - - return status; + return 0; } static int ocfs2_info_handle_maxslots(struct inode *inode, struct ocfs2_info_request __user *req) { - int status = -EFAULT; struct ocfs2_info_maxslots oim; struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); if (o2info_from_user(oim, req)) - goto bail; + return -EFAULT; oim.im_max_slots = osb->max_slots; o2info_set_request_filled(&oim.im_req); if (o2info_to_user(oim, req)) - goto bail; + return -EFAULT; - status = 0; -bail: - if (status) - o2info_set_request_error(&oim.im_req, req); - - return status; + return 0; } static int ocfs2_info_handle_label(struct inode *inode, struct ocfs2_info_request __user *req) { - int status = -EFAULT; struct ocfs2_info_label oil; struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); if (o2info_from_user(oil, req)) - goto bail; + return -EFAULT; memcpy(oil.il_label, osb->vol_label, OCFS2_MAX_VOL_LABEL_LEN); o2info_set_request_filled(&oil.il_req); if (o2info_to_user(oil, req)) - goto bail; + return -EFAULT; - status = 0; -bail: - if (status) - o2info_set_request_error(&oil.il_req, req); - - return status; + return 0; } static int ocfs2_info_handle_uuid(struct inode *inode, struct ocfs2_info_request __user *req) { - int status = -EFAULT; struct ocfs2_info_uuid oiu; struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); if (o2info_from_user(oiu, req)) - goto bail; + return -EFAULT; memcpy(oiu.iu_uuid_str, osb->uuid_str, OCFS2_TEXT_UUID_LEN + 1); o2info_set_request_filled(&oiu.iu_req); if (o2info_to_user(oiu, req)) - goto bail; + return -EFAULT; - status = 0; -bail: - if (status) - o2info_set_request_error(&oiu.iu_req, req); - - return status; + return 0; } static int ocfs2_info_handle_fs_features(struct inode *inode, struct ocfs2_info_request __user *req) { - int status = -EFAULT; struct ocfs2_info_fs_features oif; struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); if (o2info_from_user(oif, req)) - goto bail; + return -EFAULT; oif.if_compat_features = osb->s_feature_compat; oif.if_incompat_features = osb->s_feature_incompat; @@ -284,39 +252,28 @@ static int ocfs2_info_handle_fs_features(struct inode *inode, o2info_set_request_filled(&oif.if_req); if (o2info_to_user(oif, req)) - goto bail; + return -EFAULT; - status = 0; -bail: - if (status) - o2info_set_request_error(&oif.if_req, req); - - return status; + return 0; } static int ocfs2_info_handle_journal_size(struct inode *inode, struct ocfs2_info_request __user *req) { - int status = -EFAULT; struct ocfs2_info_journal_size oij; struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); if (o2info_from_user(oij, req)) - goto bail; + return -EFAULT; oij.ij_journal_size = i_size_read(osb->journal->j_inode); o2info_set_request_filled(&oij.ij_req); if (o2info_to_user(oij, req)) - goto bail; + return -EFAULT; - status = 0; -bail: - if (status) - o2info_set_request_error(&oij.ij_req, req); - - return status; + return 0; } static int ocfs2_info_scan_inode_alloc(struct ocfs2_super *osb, @@ -373,7 +330,7 @@ static int ocfs2_info_handle_freeinode(struct inode *inode, u32 i; u64 blkno = -1; char namebuf[40]; - int status = -EFAULT, type = INODE_ALLOC_SYSTEM_INODE; + int status, type = INODE_ALLOC_SYSTEM_INODE; struct ocfs2_info_freeinode *oifi = NULL; struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); struct inode *inode_alloc = NULL; @@ -385,8 +342,10 @@ static int ocfs2_info_handle_freeinode(struct inode *inode, goto out_err; } - if (o2info_from_user(*oifi, req)) - goto bail; + if (o2info_from_user(*oifi, req)) { + status = -EFAULT; + goto out_free; + } oifi->ifi_slotnum = osb->max_slots; @@ -424,14 +383,16 @@ static int ocfs2_info_handle_freeinode(struct inode *inode, o2info_set_request_filled(&oifi->ifi_req); - if (o2info_to_user(*oifi, req)) - goto bail; + if (o2info_to_user(*oifi, req)) { + status = -EFAULT; + goto out_free; + } status = 0; bail: if (status) o2info_set_request_error(&oifi->ifi_req, req); - +out_free: kfree(oifi); out_err: return status; @@ -658,7 +619,7 @@ static int ocfs2_info_handle_freefrag(struct inode *inode, { u64 blkno = -1; char namebuf[40]; - int status = -EFAULT, type = GLOBAL_BITMAP_SYSTEM_INODE; + int status, type = GLOBAL_BITMAP_SYSTEM_INODE; struct ocfs2_info_freefrag *oiff; struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); @@ -671,8 +632,10 @@ static int ocfs2_info_handle_freefrag(struct inode *inode, goto out_err; } - if (o2info_from_user(*oiff, req)) - goto bail; + if (o2info_from_user(*oiff, req)) { + status = -EFAULT; + goto out_free; + } /* * chunksize from userspace should be power of 2. */ @@ -711,14 +674,14 @@ static int ocfs2_info_handle_freefrag(struct inode *inode, if (o2info_to_user(*oiff, req)) { status = -EFAULT; - goto bail; + goto out_free; } status = 0; bail: if (status) o2info_set_request_error(&oiff->iff_req, req); - +out_free: kfree(oiff); out_err: return status; @@ -727,23 +690,17 @@ out_err: static int ocfs2_info_handle_unknown(struct inode *inode, struct ocfs2_info_request __user *req) { - int status = -EFAULT; struct ocfs2_info_request oir; if (o2info_from_user(oir, req)) - goto bail; + return -EFAULT; o2info_clear_request_filled(&oir); if (o2info_to_user(oir, req)) - goto bail; + return -EFAULT; - status = 0; -bail: - if (status) - o2info_set_request_error(&oir, req); - - return status; + return 0; } /* From c43c363def04cdaed0d9e26dae846081f55714e7 Mon Sep 17 00:00:00 2001 From: Junxiao Bi Date: Fri, 29 Aug 2014 15:19:00 -0700 Subject: [PATCH 16/22] ocfs2: o2net: don't shutdown connection when idle timeout This patch series is to fix a possible message lost bug in ocfs2 when network go bad. This bug will cause ocfs2 hung forever even network become good again. The messages may lost in this case. After the tcp connection is established between two nodes, an idle timer will be set to check its state periodically, if no messages are received during this time, idle timer will timeout, it will shutdown the connection and try to reconnect, so pending messages in tcp queues will be lost. This messages may be from dlm. Dlm may get hung in this case. This may cause the whole ocfs2 cluster hung. This is very possible to happen when network state goes bad. Do the reconnect is useless, it will fail if network state is still bad. Just waiting there for network recovering may be a good idea, it will not lost messages and some node will be fenced until cluster goes into split-brain state, for this case, Tcp user timeout is used to override the tcp retransmit timeout. It will timeout after 25 days, user should have notice this through the provided log and fix the network, if they don't, ocfs2 will fall back to original reconnect way. This patch (of 3): Some messages in the tcp queue maybe lost if we shutdown the connection and reconnect when idle timeout. If packets lost and reconnect success, then the ocfs2 cluster maybe hung. To fix this, we can leave the connection there and do the fence decision when idle timeout, if network recover before fence dicision is made, the connection survive without lost any messages. This bug can be saw when network state go bad. It may cause ocfs2 hung forever if some packets lost. With this fix, ocfs2 will recover from hung if network becomes good again. Signed-off-by: Junxiao Bi Reviewed-by: Srinivas Eeda Reviewed-by: Mark Fasheh Cc: Joel Becker Cc: Joseph Qi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ocfs2/cluster/tcp.c | 25 +++++++++++++++++++------ 1 file changed, 19 insertions(+), 6 deletions(-) diff --git a/fs/ocfs2/cluster/tcp.c b/fs/ocfs2/cluster/tcp.c index 681691bc233a..2334bfc966c1 100644 --- a/fs/ocfs2/cluster/tcp.c +++ b/fs/ocfs2/cluster/tcp.c @@ -1536,16 +1536,20 @@ static void o2net_idle_timer(unsigned long data) #endif printk(KERN_NOTICE "o2net: Connection to " SC_NODEF_FMT " has been " - "idle for %lu.%lu secs, shutting it down.\n", SC_NODEF_ARGS(sc), - msecs / 1000, msecs % 1000); + "idle for %lu.%lu secs.\n", + SC_NODEF_ARGS(sc), msecs / 1000, msecs % 1000); - /* - * Initialize the nn_timeout so that the next connection attempt - * will continue in o2net_start_connect. + /* idle timerout happen, don't shutdown the connection, but + * make fence decision. Maybe the connection can recover before + * the decision is made. */ atomic_set(&nn->nn_timeout, 1); + o2quo_conn_err(o2net_num_from_nn(nn)); + queue_delayed_work(o2net_wq, &nn->nn_still_up, + msecs_to_jiffies(O2NET_QUORUM_DELAY_MS)); + + o2net_sc_reset_idle_timer(sc); - o2net_sc_queue_work(sc, &sc->sc_shutdown_work); } static void o2net_sc_reset_idle_timer(struct o2net_sock_container *sc) @@ -1560,6 +1564,15 @@ static void o2net_sc_reset_idle_timer(struct o2net_sock_container *sc) static void o2net_sc_postpone_idle(struct o2net_sock_container *sc) { + struct o2net_node *nn = o2net_nn_from_num(sc->sc_node->nd_num); + + /* clear fence decision since the connection recover from timeout*/ + if (atomic_read(&nn->nn_timeout)) { + o2quo_conn_up(o2net_num_from_nn(nn)); + cancel_delayed_work(&nn->nn_still_up); + atomic_set(&nn->nn_timeout, 0); + } + /* Only push out an existing timer */ if (timer_pending(&sc->sc_idle_timeout)) o2net_sc_reset_idle_timer(sc); From 8e9801dfe37c9e68cdbfcd15988df2187191864e Mon Sep 17 00:00:00 2001 From: Junxiao Bi Date: Fri, 29 Aug 2014 15:19:02 -0700 Subject: [PATCH 17/22] ocfs2: o2net: set tcp user timeout to max value When tcp retransmit timeout(15mins), the connection will be closed. Pending messages may be lost during this time. So we set tcp user timeout to override the retransmit timeout to the max value. This is OK for ocfs2 since we have disk heartbeat, if peer crash, the disk heartbeat will timeout and it will be evicted, if disk heartbeat not timeout and connection idle for a long time, then this means the cluster enters split-brain state, since fence can't happen, we'd better keep the connection and wait network recover. Signed-off-by: Junxiao Bi Reviewed-by: Srinivas Eeda Reviewed-by: Mark Fasheh Cc: Joel Becker Cc: Joseph Qi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ocfs2/cluster/tcp.c | 20 ++++++++++++++++++++ fs/ocfs2/cluster/tcp.h | 1 + 2 files changed, 21 insertions(+) diff --git a/fs/ocfs2/cluster/tcp.c b/fs/ocfs2/cluster/tcp.c index 2334bfc966c1..ea34952f9496 100644 --- a/fs/ocfs2/cluster/tcp.c +++ b/fs/ocfs2/cluster/tcp.c @@ -1480,6 +1480,14 @@ static int o2net_set_nodelay(struct socket *sock) return ret; } +static int o2net_set_usertimeout(struct socket *sock) +{ + int user_timeout = O2NET_TCP_USER_TIMEOUT; + + return kernel_setsockopt(sock, SOL_TCP, TCP_USER_TIMEOUT, + (char *)&user_timeout, sizeof(user_timeout)); +} + static void o2net_initialize_handshake(void) { o2net_hand->o2hb_heartbeat_timeout_ms = cpu_to_be32( @@ -1663,6 +1671,12 @@ static void o2net_start_connect(struct work_struct *work) goto out; } + ret = o2net_set_usertimeout(sock); + if (ret) { + mlog(ML_ERROR, "set TCP_USER_TIMEOUT failed with %d\n", ret); + goto out; + } + o2net_register_callbacks(sc->sc_sock->sk, sc); spin_lock(&nn->nn_lock); @@ -1844,6 +1858,12 @@ static int o2net_accept_one(struct socket *sock, int *more) goto out; } + ret = o2net_set_usertimeout(new_sock); + if (ret) { + mlog(ML_ERROR, "set TCP_USER_TIMEOUT failed with %d\n", ret); + goto out; + } + slen = sizeof(sin); ret = new_sock->ops->getname(new_sock, (struct sockaddr *) &sin, &slen, 1); diff --git a/fs/ocfs2/cluster/tcp.h b/fs/ocfs2/cluster/tcp.h index 5bada2a69b50..c571e849fda4 100644 --- a/fs/ocfs2/cluster/tcp.h +++ b/fs/ocfs2/cluster/tcp.h @@ -63,6 +63,7 @@ typedef void (o2net_post_msg_handler_func)(int status, void *data, #define O2NET_KEEPALIVE_DELAY_MS_DEFAULT 2000 #define O2NET_IDLE_TIMEOUT_MS_DEFAULT 30000 +#define O2NET_TCP_USER_TIMEOUT 0x7fffffff /* TODO: figure this out.... */ static inline int o2net_link_down(int err, struct socket *sock) From 8c7b638cece146234b0c0d5f6ba84d1cf6f81e83 Mon Sep 17 00:00:00 2001 From: Junxiao Bi Date: Fri, 29 Aug 2014 15:19:04 -0700 Subject: [PATCH 18/22] ocfs2: quorum: add a log for node not fenced For debug use, we can see from the log whether the fence decision is made and why it is not fenced. Signed-off-by: Junxiao Bi Reviewed-by: Srinivas Eeda Reviewed-by: Mark Fasheh Cc: Joel Becker Cc: Joseph Qi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ocfs2/cluster/quorum.c | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/fs/ocfs2/cluster/quorum.c b/fs/ocfs2/cluster/quorum.c index 1ec141e758d7..62e8ec619b4c 100644 --- a/fs/ocfs2/cluster/quorum.c +++ b/fs/ocfs2/cluster/quorum.c @@ -160,9 +160,18 @@ static void o2quo_make_decision(struct work_struct *work) } out: - spin_unlock(&qs->qs_lock); - if (fence) + if (fence) { + spin_unlock(&qs->qs_lock); o2quo_fence_self(); + } else { + mlog(ML_NOTICE, "not fencing this node, heartbeating: %d, " + "connected: %d, lowest: %d (%sreachable)\n", + qs->qs_heartbeating, qs->qs_connected, lowest_hb, + lowest_reachable ? "" : "un"); + spin_unlock(&qs->qs_lock); + + } + } static void o2quo_set_hold(struct o2quo_state *qs, u8 node) From 498b473af9c20a4cb533297dc43b063f35f86349 Mon Sep 17 00:00:00 2001 From: Phong Tran Date: Fri, 29 Aug 2014 15:19:06 -0700 Subject: [PATCH 19/22] tools: selftests: fix build issue with make kselftests target Fix the typo of ARCH when running 'make kselftests'. Change the 'X86' to 'x86'. Test by compilation. Signed-off-by: Phong Tran Cc: David Herrmann Cc: Hugh Dickins Cc: Shuah Khan Cc: Sam Ravnborg Cc: Michal Marek Cc: Shuah Khan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- tools/testing/selftests/ipc/Makefile | 6 +++--- tools/testing/selftests/kcmp/Makefile | 6 +++--- tools/testing/selftests/memfd/Makefile | 10 +++++----- 3 files changed, 11 insertions(+), 11 deletions(-) diff --git a/tools/testing/selftests/ipc/Makefile b/tools/testing/selftests/ipc/Makefile index 5386fd7c43ae..74bbefdeaf4c 100644 --- a/tools/testing/selftests/ipc/Makefile +++ b/tools/testing/selftests/ipc/Makefile @@ -1,18 +1,18 @@ uname_M := $(shell uname -m 2>/dev/null || echo not) ARCH ?= $(shell echo $(uname_M) | sed -e s/i.86/i386/) ifeq ($(ARCH),i386) - ARCH := X86 + ARCH := x86 CFLAGS := -DCONFIG_X86_32 -D__i386__ endif ifeq ($(ARCH),x86_64) - ARCH := X86 + ARCH := x86 CFLAGS := -DCONFIG_X86_64 -D__x86_64__ endif CFLAGS += -I../../../../usr/include/ all: -ifeq ($(ARCH),X86) +ifeq ($(ARCH),x86) gcc $(CFLAGS) msgque.c -o msgque_test else echo "Not an x86 target, can't build msgque selftest" diff --git a/tools/testing/selftests/kcmp/Makefile b/tools/testing/selftests/kcmp/Makefile index d7d6bbeeff2f..8aabd82db9e4 100644 --- a/tools/testing/selftests/kcmp/Makefile +++ b/tools/testing/selftests/kcmp/Makefile @@ -1,11 +1,11 @@ uname_M := $(shell uname -m 2>/dev/null || echo not) ARCH ?= $(shell echo $(uname_M) | sed -e s/i.86/i386/) ifeq ($(ARCH),i386) - ARCH := X86 + ARCH := x86 CFLAGS := -DCONFIG_X86_32 -D__i386__ endif ifeq ($(ARCH),x86_64) - ARCH := X86 + ARCH := x86 CFLAGS := -DCONFIG_X86_64 -D__x86_64__ endif @@ -15,7 +15,7 @@ CFLAGS += -I../../../../usr/include/ CFLAGS += -I../../../../arch/x86/include/ all: -ifeq ($(ARCH),X86) +ifeq ($(ARCH),x86) gcc $(CFLAGS) kcmp_test.c -o kcmp_test else echo "Not an x86 target, can't build kcmp selftest" diff --git a/tools/testing/selftests/memfd/Makefile b/tools/testing/selftests/memfd/Makefile index 6816c491c5ff..ad4ab01cd28f 100644 --- a/tools/testing/selftests/memfd/Makefile +++ b/tools/testing/selftests/memfd/Makefile @@ -1,10 +1,10 @@ uname_M := $(shell uname -m 2>/dev/null || echo not) ARCH ?= $(shell echo $(uname_M) | sed -e s/i.86/i386/) ifeq ($(ARCH),i386) - ARCH := X86 + ARCH := x86 endif ifeq ($(ARCH),x86_64) - ARCH := X86 + ARCH := x86 endif CFLAGS += -D_FILE_OFFSET_BITS=64 @@ -14,20 +14,20 @@ CFLAGS += -I../../../../include/uapi/ CFLAGS += -I../../../../include/ all: -ifeq ($(ARCH),X86) +ifeq ($(ARCH),x86) gcc $(CFLAGS) memfd_test.c -o memfd_test else echo "Not an x86 target, can't build memfd selftest" endif run_tests: all -ifeq ($(ARCH),X86) +ifeq ($(ARCH),x86) gcc $(CFLAGS) memfd_test.c -o memfd_test endif @./memfd_test || echo "memfd_test: [FAIL]" build_fuse: -ifeq ($(ARCH),X86) +ifeq ($(ARCH),x86) gcc $(CFLAGS) fuse_mnt.c `pkg-config fuse --cflags --libs` -o fuse_mnt gcc $(CFLAGS) fuse_test.c -o fuse_test else From e3560305192cd51b3c07206c85eb4231594dd58b Mon Sep 17 00:00:00 2001 From: Pranith Kumar Date: Fri, 29 Aug 2014 15:19:09 -0700 Subject: [PATCH 20/22] flush_icache_range: export symbol to fix build errors Fix building errors occuring due to a missing export of flush_icache_range() in kisskb.ellerman.id.au/kisskb/buildresult/11677809/ ERROR: "flush_icache_range" [drivers/misc/lkdtm.ko] undefined! Signed-off-by: Pranith Kumar Reported-by: Geert Uytterhoeven Acked-by: Vineet Gupta [arc] Acked-by: Richard Kuo [hexagon] Cc: Chris Metcalf Cc: Chris Zankel Acked-by: Max Filippov [xtensa] Cc: Noam Camus Cc: Masami Hiramatsu Acked-by: Zhigang Lu [tile] Cc: Kirill Tkhai Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/arc/mm/cache_arc700.c | 1 + arch/hexagon/mm/cache.c | 1 + arch/sh/mm/cache.c | 1 + arch/tile/kernel/smp.c | 1 + arch/xtensa/kernel/smp.c | 1 + 5 files changed, 5 insertions(+) diff --git a/arch/arc/mm/cache_arc700.c b/arch/arc/mm/cache_arc700.c index 4670afc3b971..e88ddbf990e3 100644 --- a/arch/arc/mm/cache_arc700.c +++ b/arch/arc/mm/cache_arc700.c @@ -581,6 +581,7 @@ void flush_icache_range(unsigned long kstart, unsigned long kend) tot_sz -= sz; } } +EXPORT_SYMBOL(flush_icache_range); /* * General purpose helper to make I and D cache lines consistent. diff --git a/arch/hexagon/mm/cache.c b/arch/hexagon/mm/cache.c index fe14ccf28561..0c76c802e31c 100644 --- a/arch/hexagon/mm/cache.c +++ b/arch/hexagon/mm/cache.c @@ -68,6 +68,7 @@ void flush_icache_range(unsigned long start, unsigned long end) ); local_irq_restore(flags); } +EXPORT_SYMBOL(flush_icache_range); void hexagon_clean_dcache_range(unsigned long start, unsigned long end) { diff --git a/arch/sh/mm/cache.c b/arch/sh/mm/cache.c index 097c2cdd117f..f770e3992620 100644 --- a/arch/sh/mm/cache.c +++ b/arch/sh/mm/cache.c @@ -229,6 +229,7 @@ void flush_icache_range(unsigned long start, unsigned long end) cacheop_on_each_cpu(local_flush_icache_range, (void *)&data, 1); } +EXPORT_SYMBOL(flush_icache_range); void flush_icache_page(struct vm_area_struct *vma, struct page *page) { diff --git a/arch/tile/kernel/smp.c b/arch/tile/kernel/smp.c index 01e8ab29f43a..19eaa62d456a 100644 --- a/arch/tile/kernel/smp.c +++ b/arch/tile/kernel/smp.c @@ -183,6 +183,7 @@ void flush_icache_range(unsigned long start, unsigned long end) preempt_enable(); } } +EXPORT_SYMBOL(flush_icache_range); /* Called when smp_send_reschedule() triggers IRQ_RESCHEDULE. */ diff --git a/arch/xtensa/kernel/smp.c b/arch/xtensa/kernel/smp.c index 40b5a3771fb0..4d02e38514f5 100644 --- a/arch/xtensa/kernel/smp.c +++ b/arch/xtensa/kernel/smp.c @@ -571,6 +571,7 @@ void flush_icache_range(unsigned long start, unsigned long end) }; on_each_cpu(ipi_flush_icache_range, &fd, 1); } +EXPORT_SYMBOL(flush_icache_range); /* ------------------------------------------------------------------------- */ From 16b0371a2e2ba1a2dea84a18356b9bc7168a5b12 Mon Sep 17 00:00:00 2001 From: HuKeping Date: Fri, 29 Aug 2014 15:19:11 -0700 Subject: [PATCH 21/22] Documentation/kdump/kdump.txt: add ARM description Add arm specific parts to kdump kernel documentation. Signed-off-by: Hu Keping Acked-by: Vivek Goyal Cc: Haren Myneni Cc: Rob Landley Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/kdump/kdump.txt | 36 ++++++++++++++++++++++++++++++++--- 1 file changed, 33 insertions(+), 3 deletions(-) diff --git a/Documentation/kdump/kdump.txt b/Documentation/kdump/kdump.txt index 88d5a863712a..6c0b9f27e465 100644 --- a/Documentation/kdump/kdump.txt +++ b/Documentation/kdump/kdump.txt @@ -18,7 +18,7 @@ memory image to a dump file on the local disk, or across the network to a remote system. Kdump and kexec are currently supported on the x86, x86_64, ppc64, ia64, -and s390x architectures. +s390x and arm architectures. When the system kernel boots, it reserves a small section of memory for the dump-capture kernel. This ensures that ongoing Direct Memory Access @@ -112,7 +112,7 @@ There are two possible methods of using Kdump. 2) Or use the system kernel binary itself as dump-capture kernel and there is no need to build a separate dump-capture kernel. This is possible only with the architectures which support a relocatable kernel. As - of today, i386, x86_64, ppc64 and ia64 architectures support relocatable + of today, i386, x86_64, ppc64, ia64 and arm architectures support relocatable kernel. Building a relocatable kernel is advantageous from the point of view that @@ -241,6 +241,13 @@ Dump-capture kernel config options (Arch Dependent, ia64) kernel will be aligned to 64Mb, so if the start address is not then any space below the alignment point will be wasted. +Dump-capture kernel config options (Arch Dependent, arm) +---------------------------------------------------------- + +- To use a relocatable kernel, + Enable "AUTO_ZRELADDR" support under "Boot" options: + + AUTO_ZRELADDR=y Extended crashkernel syntax =========================== @@ -256,6 +263,10 @@ The syntax is: crashkernel=:[,:,...][@offset] range=start-[end] +Please note, on arm, the offset is required. + crashkernel=:[,:,...]@offset + range=start-[end] + 'start' is inclusive and 'end' is exclusive. For example: @@ -296,6 +307,12 @@ Boot into System Kernel on the memory consumption of the kdump system. In general this is not dependent on the memory size of the production system. + On arm, use "crashkernel=Y@X". Note that the start address of the kernel + will be aligned to 128MiB (0x08000000), so if the start address is not then + any space below the alignment point may be overwritten by the dump-capture kernel, + which means it is possible that the vmcore is not that precise as expected. + + Load the Dump-capture Kernel ============================ @@ -315,7 +332,8 @@ For ia64: - Use vmlinux or vmlinuz.gz For s390x: - Use image or bzImage - +For arm: + - Use zImage If you are using a uncompressed vmlinux image then use following command to load dump-capture kernel. @@ -331,6 +349,15 @@ to load dump-capture kernel. --initrd= \ --append="root= " +If you are using a compressed zImage, then use following command +to load dump-capture kernel. + + kexec --type zImage -p \ + --initrd= \ + --dtb= \ + --append="root= " + + Please note, that --args-linux does not need to be specified for ia64. It is planned to make this a no-op on that architecture, but for now it should be omitted @@ -347,6 +374,9 @@ For ppc64: For s390x: "1 maxcpus=1 cgroup_disable=memory" +For arm: + "1 maxcpus=1 reset_devices" + Notes on loading the dump-capture kernel: * By default, the ELF headers are stored in ELF64 format to support From b0108f9e93d0d39050eaa11358852f349bdccb71 Mon Sep 17 00:00:00 2001 From: Michael Welling Date: Fri, 29 Aug 2014 15:19:13 -0700 Subject: [PATCH 22/22] kexec: purgatory: add clean-up for purgatory directory Without this patch the kexec-purgatory.c and purgatory.ro files are not removed after make mrproper. Signed-off-by: Michael Welling Acked-by: Vivek Goyal Cc: Ingo Molnar Cc: "H. Peter Anvin" Cc: Thomas Gleixner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/Makefile | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/x86/Makefile b/arch/x86/Makefile index c96bcec544fc..60087ca37679 100644 --- a/arch/x86/Makefile +++ b/arch/x86/Makefile @@ -251,6 +251,7 @@ archclean: $(Q)rm -rf $(objtree)/arch/x86_64 $(Q)$(MAKE) $(clean)=$(boot) $(Q)$(MAKE) $(clean)=arch/x86/tools + $(Q)$(MAKE) $(clean)=arch/x86/purgatory PHONY += kvmconfig kvmconfig: