From cec3ebd083d4e8d161d0b18894c78e3311bcd026 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Thu, 18 Jul 2019 15:56:25 -0700 Subject: [PATCH 01/38] mm/memory_hotplug: simplify and fix check_hotplug_memory_range() Patch series "mm/memory_hotplug: Factor out memory block device handling", v3. We only want memory block devices for memory to be onlined/offlined (add/remove from the buddy). This is required so user space can online/offline memory and kdump gets notified about newly onlined memory. Let's factor out creation/removal of memory block devices. This helps to further cleanup arch_add_memory/arch_remove_memory() and to make implementation of new features easier - especially sub-section memory hot add from Dan. Anshuman Khandual is currently working on arch_remove_memory(). I added a temporary solution via "arm64/mm: Add temporary arch_remove_memory() implementation", that is sufficient as a first step in the context of this series. (we don't cleanup page tables in case anything goes wrong already) Did a quick sanity test with DIMM plug/unplug, making sure all devices and sysfs links properly get added/removed. Compile tested on s390x and x86-64. This patch (of 11): By converting start and size to page granularity, we actually ignore unaligned parts within a page instead of properly bailing out with an error. Link: http://lkml.kernel.org/r/20190527111152.16324-2-david@redhat.com Signed-off-by: David Hildenbrand Reviewed-by: Dan Williams Reviewed-by: Wei Yang Reviewed-by: Pavel Tatashin Reviewed-by: Oscar Salvador Acked-by: Michal Hocko Cc: David Hildenbrand Cc: Qian Cai Cc: Arun KS Cc: Mathieu Malaterre Cc: Alex Deucher Cc: Andrew Banman Cc: Andy Lutomirski Cc: Anshuman Khandual Cc: Ard Biesheuvel Cc: Baoquan He Cc: Benjamin Herrenschmidt Cc: Borislav Petkov Cc: Catalin Marinas Cc: Chintan Pandya Cc: Christophe Leroy Cc: Chris Wilson Cc: Dave Hansen Cc: "David S. Miller" Cc: Fenghua Yu Cc: Greg Kroah-Hartman Cc: Heiko Carstens Cc: "H. 
Peter Anvin" Cc: Ingo Molnar Cc: Jonathan Cameron Cc: Joonsoo Kim Cc: Jun Yao Cc: "Kirill A. Shutemov" Cc: Logan Gunthorpe Cc: Mark Brown Cc: Mark Rutland Cc: Masahiro Yamada Cc: Michael Ellerman Cc: Mike Rapoport Cc: "mike.travis@hpe.com" Cc: Nicholas Piggin Cc: Paul Mackerras Cc: Peter Zijlstra Cc: "Rafael J. Wysocki" Cc: Rich Felker Cc: Rob Herring Cc: Robin Murphy Cc: Thomas Gleixner Cc: Tony Luck Cc: Vasily Gorbik Cc: Will Deacon Cc: Yoshinori Sato Cc: Yu Zhao Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory_hotplug.c | 11 +++-------- 1 file changed, 3 insertions(+), 8 deletions(-) diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 4ebe696138e8..a8c25fd85ee3 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -1049,16 +1049,11 @@ int try_online_node(int nid) static int check_hotplug_memory_range(u64 start, u64 size) { - unsigned long block_sz = memory_block_size_bytes(); - u64 block_nr_pages = block_sz >> PAGE_SHIFT; - u64 nr_pages = size >> PAGE_SHIFT; - u64 start_pfn = PFN_DOWN(start); - /* memory range must be block size aligned */ - if (!nr_pages || !IS_ALIGNED(start_pfn, block_nr_pages) || - !IS_ALIGNED(nr_pages, block_nr_pages)) { + if (!size || !IS_ALIGNED(start, memory_block_size_bytes()) || + !IS_ALIGNED(size, memory_block_size_bytes())) { pr_err("Block size [%#lx] unaligned hotplug range: start %#llx, size %#llx", - block_sz, start, size); + memory_block_size_bytes(), start, size); return -EINVAL; } From 973de24a78493d115ec157c68fd31bc0a114134e Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Thu, 18 Jul 2019 15:56:30 -0700 Subject: [PATCH 02/38] s390x/mm: fail when an altmap is used for arch_add_memory() ZONE_DEVICE is not yet supported, fail if an altmap is passed, so we don't forget arch_add_memory()/arch_remove_memory() when unlocking support. 
Link: http://lkml.kernel.org/r/20190527111152.16324-3-david@redhat.com Signed-off-by: David Hildenbrand Suggested-by: Dan Williams Cc: Heiko Carstens Cc: Michal Hocko Cc: Mike Rapoport Cc: David Hildenbrand Cc: Vasily Gorbik Cc: Oscar Salvador Cc: Alex Deucher Cc: Andrew Banman Cc: Andy Lutomirski Cc: Anshuman Khandual Cc: Ard Biesheuvel Cc: Arun KS Cc: Baoquan He Cc: Benjamin Herrenschmidt Cc: Borislav Petkov Cc: Catalin Marinas Cc: Chintan Pandya Cc: Christophe Leroy Cc: Chris Wilson Cc: Dave Hansen Cc: "David S. Miller" Cc: Fenghua Yu Cc: Greg Kroah-Hartman Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: Jonathan Cameron Cc: Joonsoo Kim Cc: Jun Yao Cc: "Kirill A. Shutemov" Cc: Logan Gunthorpe Cc: Mark Brown Cc: Mark Rutland Cc: Masahiro Yamada Cc: Mathieu Malaterre Cc: Michael Ellerman Cc: Mike Rapoport Cc: "mike.travis@hpe.com" Cc: Nicholas Piggin Cc: Oscar Salvador Cc: Paul Mackerras Cc: Pavel Tatashin Cc: Peter Zijlstra Cc: Qian Cai Cc: "Rafael J. Wysocki" Cc: Rich Felker Cc: Rob Herring Cc: Robin Murphy Cc: Thomas Gleixner Cc: Tony Luck Cc: Wei Yang Cc: Will Deacon Cc: Yoshinori Sato Cc: Yu Zhao Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/s390/mm/init.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/arch/s390/mm/init.c b/arch/s390/mm/init.c index f0bee6af3960..7d6638c18cb4 100644 --- a/arch/s390/mm/init.c +++ b/arch/s390/mm/init.c @@ -273,6 +273,9 @@ int arch_add_memory(int nid, u64 start, u64 size, unsigned long size_pages = PFN_DOWN(size); int rc; + if (WARN_ON_ONCE(restrictions->altmap)) + return -EINVAL; + rc = vmem_add_mapping(start, size); if (rc) return rc; From 18c86506c80f6b6b5e67d95bf0d6f7e665de5239 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Thu, 18 Jul 2019 15:56:35 -0700 Subject: [PATCH 03/38] s390x/mm: implement arch_remove_memory() Will come in handy when wanting to handle errors after arch_add_memory(). 
Link: http://lkml.kernel.org/r/20190527111152.16324-4-david@redhat.com Signed-off-by: David Hildenbrand Cc: Heiko Carstens Cc: Michal Hocko Cc: Mike Rapoport Cc: David Hildenbrand Cc: Vasily Gorbik Cc: Oscar Salvador Cc: Alex Deucher Cc: Andrew Banman Cc: Andy Lutomirski Cc: Anshuman Khandual Cc: Ard Biesheuvel Cc: Arun KS Cc: Baoquan He Cc: Benjamin Herrenschmidt Cc: Borislav Petkov Cc: Catalin Marinas Cc: Chintan Pandya Cc: Christophe Leroy Cc: Chris Wilson Cc: Dan Williams Cc: Dave Hansen Cc: "David S. Miller" Cc: Fenghua Yu Cc: Greg Kroah-Hartman Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: Jonathan Cameron Cc: Joonsoo Kim Cc: Jun Yao Cc: "Kirill A. Shutemov" Cc: Logan Gunthorpe Cc: Mark Brown Cc: Mark Rutland Cc: Masahiro Yamada Cc: Mathieu Malaterre Cc: Michael Ellerman Cc: Mike Rapoport Cc: "mike.travis@hpe.com" Cc: Nicholas Piggin Cc: Oscar Salvador Cc: Paul Mackerras Cc: Pavel Tatashin Cc: Peter Zijlstra Cc: Qian Cai Cc: "Rafael J. Wysocki" Cc: Rich Felker Cc: Rob Herring Cc: Robin Murphy Cc: Thomas Gleixner Cc: Tony Luck Cc: Wei Yang Cc: Will Deacon Cc: Yoshinori Sato Cc: Yu Zhao Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/s390/mm/init.c | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/arch/s390/mm/init.c b/arch/s390/mm/init.c index 7d6638c18cb4..5b1ec2f532e0 100644 --- a/arch/s390/mm/init.c +++ b/arch/s390/mm/init.c @@ -290,12 +290,13 @@ int arch_add_memory(int nid, u64 start, u64 size, void arch_remove_memory(int nid, u64 start, u64 size, struct vmem_altmap *altmap) { - /* - * There is no hardware or firmware interface which could trigger a - * hot memory remove on s390. So there is nothing that needs to be - * implemented. 
- */ - BUG(); + unsigned long start_pfn = start >> PAGE_SHIFT; + unsigned long nr_pages = size >> PAGE_SHIFT; + struct zone *zone; + + zone = page_zone(pfn_to_page(start_pfn)); + __remove_pages(zone, start_pfn, nr_pages, altmap); + vmem_remove_mapping(start, size); } #endif #endif /* CONFIG_MEMORY_HOTPLUG */ From 22eb634632a2359769f8a2a91a41d3c566a0a450 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Thu, 18 Jul 2019 15:56:41 -0700 Subject: [PATCH 04/38] arm64/mm: add temporary arch_remove_memory() implementation A proper arch_remove_memory() implementation is on its way, which also cleanly removes page tables in arch_add_memory() in case something goes wrong. As we want to use arch_remove_memory() in case something goes wrong during memory hotplug after arch_add_memory() finished, let's add a temporary hack that is sufficient enough until we get a proper implementation that cleans up page table entries. We will remove CONFIG_MEMORY_HOTREMOVE around this code in follow up patches. Link: http://lkml.kernel.org/r/20190527111152.16324-5-david@redhat.com Signed-off-by: David Hildenbrand Cc: Catalin Marinas Cc: Will Deacon Cc: Mark Rutland Cc: Ard Biesheuvel Cc: Chintan Pandya Cc: Mike Rapoport Cc: Jun Yao Cc: Yu Zhao Cc: Robin Murphy Cc: Anshuman Khandual Cc: Alex Deucher Cc: Andrew Banman Cc: Andy Lutomirski Cc: Arun KS Cc: Baoquan He Cc: Benjamin Herrenschmidt Cc: Borislav Petkov Cc: Christophe Leroy Cc: Chris Wilson Cc: Dan Williams Cc: Dave Hansen Cc: "David S. Miller" Cc: Fenghua Yu Cc: Greg Kroah-Hartman Cc: Heiko Carstens Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: Jonathan Cameron Cc: Joonsoo Kim Cc: "Kirill A. Shutemov" Cc: Logan Gunthorpe Cc: Mark Brown Cc: Masahiro Yamada Cc: Mathieu Malaterre Cc: Michael Ellerman Cc: Michal Hocko Cc: Mike Rapoport Cc: "mike.travis@hpe.com" Cc: Nicholas Piggin Cc: Oscar Salvador Cc: Oscar Salvador Cc: Paul Mackerras Cc: Pavel Tatashin Cc: Peter Zijlstra Cc: Qian Cai Cc: "Rafael J. 
Wysocki" Cc: Rich Felker Cc: Rob Herring Cc: Thomas Gleixner Cc: Tony Luck Cc: Vasily Gorbik Cc: Wei Yang Cc: Yoshinori Sato Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/arm64/mm/mmu.c | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c index e661469cabdd..a21fa7e1167d 100644 --- a/arch/arm64/mm/mmu.c +++ b/arch/arm64/mm/mmu.c @@ -1074,4 +1074,23 @@ int arch_add_memory(int nid, u64 start, u64 size, return __add_pages(nid, start >> PAGE_SHIFT, size >> PAGE_SHIFT, restrictions); } +#ifdef CONFIG_MEMORY_HOTREMOVE +void arch_remove_memory(int nid, u64 start, u64 size, + struct vmem_altmap *altmap) +{ + unsigned long start_pfn = start >> PAGE_SHIFT; + unsigned long nr_pages = size >> PAGE_SHIFT; + struct zone *zone; + + /* + * FIXME: Cleanup page tables (also in arch_add_memory() in case + * adding fails). Until then, this function should only be used + * during memory hotplug (adding memory), not for memory + * unplug. ARCH_ENABLE_MEMORY_HOTREMOVE must not be + * unlocked yet. + */ + zone = page_zone(pfn_to_page(start_pfn)); + __remove_pages(zone, start_pfn, nr_pages, altmap); +} +#endif #endif From 1811582587c43bdf13d690d83345610d4df433bb Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Thu, 18 Jul 2019 15:56:46 -0700 Subject: [PATCH 05/38] drivers/base/memory: pass a block_id to init_memory_block() We'll rework hotplug_memory_register() shortly, so it no longer consumes a section. [cai@lca.pw: fix a compilation warning] Link: http://lkml.kernel.org/r/1559320186-28337-1-git-send-email-cai@lca.pw Link: http://lkml.kernel.org/r/20190527111152.16324-6-david@redhat.com Signed-off-by: David Hildenbrand Signed-off-by: Qian Cai Acked-by: Michal Hocko Cc: Greg Kroah-Hartman Cc: "Rafael J. 
Wysocki" Cc: Alex Deucher Cc: Andrew Banman Cc: Andy Lutomirski Cc: Anshuman Khandual Cc: Ard Biesheuvel Cc: Arun KS Cc: Baoquan He Cc: Benjamin Herrenschmidt Cc: Borislav Petkov Cc: Catalin Marinas Cc: Chintan Pandya Cc: Christophe Leroy Cc: Chris Wilson Cc: Dan Williams Cc: Dave Hansen Cc: "David S. Miller" Cc: Fenghua Yu Cc: Heiko Carstens Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: Jonathan Cameron Cc: Joonsoo Kim Cc: Jun Yao Cc: "Kirill A. Shutemov" Cc: Logan Gunthorpe Cc: Mark Brown Cc: Mark Rutland Cc: Masahiro Yamada Cc: Mathieu Malaterre Cc: Michael Ellerman Cc: Mike Rapoport Cc: "mike.travis@hpe.com" Cc: Nicholas Piggin Cc: Oscar Salvador Cc: Oscar Salvador Cc: Paul Mackerras Cc: Pavel Tatashin Cc: Peter Zijlstra Cc: Rich Felker Cc: Rob Herring Cc: Robin Murphy Cc: Thomas Gleixner Cc: Tony Luck Cc: Vasily Gorbik Cc: Wei Yang Cc: Will Deacon Cc: Yoshinori Sato Cc: Yu Zhao Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/base/memory.c | 27 +++++++++++---------------- 1 file changed, 11 insertions(+), 16 deletions(-) diff --git a/drivers/base/memory.c b/drivers/base/memory.c index f180427e48f4..e0aa7f9abb36 100644 --- a/drivers/base/memory.c +++ b/drivers/base/memory.c @@ -651,21 +651,18 @@ int register_memory(struct memory_block *memory) return ret; } -static int init_memory_block(struct memory_block **memory, - struct mem_section *section, unsigned long state) +static int init_memory_block(struct memory_block **memory, int block_id, + unsigned long state) { struct memory_block *mem; unsigned long start_pfn; - int scn_nr; int ret = 0; mem = kzalloc(sizeof(*mem), GFP_KERNEL); if (!mem) return -ENOMEM; - scn_nr = __section_nr(section); - mem->start_section_nr = - base_memory_block_id(scn_nr) * sections_per_block; + mem->start_section_nr = block_id * sections_per_block; mem->end_section_nr = mem->start_section_nr + sections_per_block - 1; mem->state = state; start_pfn = section_nr_to_pfn(mem->start_section_nr); @@ -680,21 +677,18 @@ static 
int init_memory_block(struct memory_block **memory, static int add_memory_block(int base_section_nr) { struct memory_block *mem; - int i, ret, section_count = 0, section_nr; + int i, ret, section_count = 0; for (i = base_section_nr; i < base_section_nr + sections_per_block; - i++) { - if (!present_section_nr(i)) - continue; - if (section_count == 0) - section_nr = i; - section_count++; - } + i++) + if (present_section_nr(i)) + section_count++; if (section_count == 0) return 0; - ret = init_memory_block(&mem, __nr_to_section(section_nr), MEM_ONLINE); + ret = init_memory_block(&mem, base_memory_block_id(base_section_nr), + MEM_ONLINE); if (ret) return ret; mem->section_count = section_count; @@ -707,6 +701,7 @@ static int add_memory_block(int base_section_nr) */ int hotplug_memory_register(int nid, struct mem_section *section) { + int block_id = base_memory_block_id(__section_nr(section)); int ret = 0; struct memory_block *mem; @@ -717,7 +712,7 @@ int hotplug_memory_register(int nid, struct mem_section *section) mem->section_count++; put_device(&mem->dev); } else { - ret = init_memory_block(&mem, section, MEM_OFFLINE); + ret = init_memory_block(&mem, block_id, MEM_OFFLINE); if (ret) goto out; mem->section_count++; From 80ec922dbd87fd38d15719c86a94457204648aeb Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Thu, 18 Jul 2019 15:56:51 -0700 Subject: [PATCH 06/38] mm/memory_hotplug: allow arch_remove_memory() without CONFIG_MEMORY_HOTREMOVE We want to improve error handling while adding memory by allowing to use arch_remove_memory() and __remove_pages() even if CONFIG_MEMORY_HOTREMOVE is not set to e.g., implement something like: arch_add_memory() rc = do_something(); if (rc) { arch_remove_memory(); } We won't get rid of CONFIG_MEMORY_HOTREMOVE for now, as it will require quite some dependencies for memory offlining. 
Link: http://lkml.kernel.org/r/20190527111152.16324-7-david@redhat.com Signed-off-by: David Hildenbrand Reviewed-by: Pavel Tatashin Cc: Tony Luck Cc: Fenghua Yu Cc: Benjamin Herrenschmidt Cc: Paul Mackerras Cc: Michael Ellerman Cc: Heiko Carstens Cc: Yoshinori Sato Cc: Rich Felker Cc: Dave Hansen Cc: Andy Lutomirski Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Borislav Petkov Cc: "H. Peter Anvin" Cc: Greg Kroah-Hartman Cc: "Rafael J. Wysocki" Cc: Michal Hocko Cc: David Hildenbrand Cc: Oscar Salvador Cc: "Kirill A. Shutemov" Cc: Alex Deucher Cc: "David S. Miller" Cc: Mark Brown Cc: Chris Wilson Cc: Christophe Leroy Cc: Nicholas Piggin Cc: Vasily Gorbik Cc: Rob Herring Cc: Masahiro Yamada Cc: "mike.travis@hpe.com" Cc: Andrew Banman Cc: Arun KS Cc: Qian Cai Cc: Mathieu Malaterre Cc: Baoquan He Cc: Logan Gunthorpe Cc: Anshuman Khandual Cc: Ard Biesheuvel Cc: Catalin Marinas Cc: Chintan Pandya Cc: Dan Williams Cc: Ingo Molnar Cc: Jonathan Cameron Cc: Joonsoo Kim Cc: Jun Yao Cc: Mark Rutland Cc: Mike Rapoport Cc: Oscar Salvador Cc: Robin Murphy Cc: Wei Yang Cc: Will Deacon Cc: Yu Zhao Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/arm64/mm/mmu.c | 2 -- arch/ia64/mm/init.c | 2 -- arch/powerpc/mm/mem.c | 2 -- arch/s390/mm/init.c | 2 -- arch/sh/mm/init.c | 2 -- arch/x86/mm/init_32.c | 2 -- arch/x86/mm/init_64.c | 2 -- drivers/base/memory.c | 2 -- include/linux/memory.h | 2 -- include/linux/memory_hotplug.h | 2 -- mm/memory_hotplug.c | 2 -- mm/sparse.c | 6 ------ 12 files changed, 28 deletions(-) diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c index a21fa7e1167d..750a69dde39b 100644 --- a/arch/arm64/mm/mmu.c +++ b/arch/arm64/mm/mmu.c @@ -1074,7 +1074,6 @@ int arch_add_memory(int nid, u64 start, u64 size, return __add_pages(nid, start >> PAGE_SHIFT, size >> PAGE_SHIFT, restrictions); } -#ifdef CONFIG_MEMORY_HOTREMOVE void arch_remove_memory(int nid, u64 start, u64 size, struct vmem_altmap *altmap) { @@ -1093,4 +1092,3 @@ void arch_remove_memory(int 
nid, u64 start, u64 size, __remove_pages(zone, start_pfn, nr_pages, altmap); } #endif -#endif diff --git a/arch/ia64/mm/init.c b/arch/ia64/mm/init.c index d28e29103bdb..aae75fd7b810 100644 --- a/arch/ia64/mm/init.c +++ b/arch/ia64/mm/init.c @@ -681,7 +681,6 @@ int arch_add_memory(int nid, u64 start, u64 size, return ret; } -#ifdef CONFIG_MEMORY_HOTREMOVE void arch_remove_memory(int nid, u64 start, u64 size, struct vmem_altmap *altmap) { @@ -693,4 +692,3 @@ void arch_remove_memory(int nid, u64 start, u64 size, __remove_pages(zone, start_pfn, nr_pages, altmap); } #endif -#endif diff --git a/arch/powerpc/mm/mem.c b/arch/powerpc/mm/mem.c index 26a8da3723bb..9259337d7374 100644 --- a/arch/powerpc/mm/mem.c +++ b/arch/powerpc/mm/mem.c @@ -125,7 +125,6 @@ int __ref arch_add_memory(int nid, u64 start, u64 size, return __add_pages(nid, start_pfn, nr_pages, restrictions); } -#ifdef CONFIG_MEMORY_HOTREMOVE void __ref arch_remove_memory(int nid, u64 start, u64 size, struct vmem_altmap *altmap) { @@ -151,7 +150,6 @@ void __ref arch_remove_memory(int nid, u64 start, u64 size, pr_warn("Hash collision while resizing HPT\n"); } #endif -#endif /* CONFIG_MEMORY_HOTPLUG */ #ifndef CONFIG_NEED_MULTIPLE_NODES void __init mem_topology_setup(void) diff --git a/arch/s390/mm/init.c b/arch/s390/mm/init.c index 5b1ec2f532e0..4e5bbe328594 100644 --- a/arch/s390/mm/init.c +++ b/arch/s390/mm/init.c @@ -286,7 +286,6 @@ int arch_add_memory(int nid, u64 start, u64 size, return rc; } -#ifdef CONFIG_MEMORY_HOTREMOVE void arch_remove_memory(int nid, u64 start, u64 size, struct vmem_altmap *altmap) { @@ -298,5 +297,4 @@ void arch_remove_memory(int nid, u64 start, u64 size, __remove_pages(zone, start_pfn, nr_pages, altmap); vmem_remove_mapping(start, size); } -#endif #endif /* CONFIG_MEMORY_HOTPLUG */ diff --git a/arch/sh/mm/init.c b/arch/sh/mm/init.c index 13c6a6bb5fd9..dfdbaa50946e 100644 --- a/arch/sh/mm/init.c +++ b/arch/sh/mm/init.c @@ -429,7 +429,6 @@ int memory_add_physaddr_to_nid(u64 addr) 
EXPORT_SYMBOL_GPL(memory_add_physaddr_to_nid); #endif -#ifdef CONFIG_MEMORY_HOTREMOVE void arch_remove_memory(int nid, u64 start, u64 size, struct vmem_altmap *altmap) { @@ -440,5 +439,4 @@ void arch_remove_memory(int nid, u64 start, u64 size, zone = page_zone(pfn_to_page(start_pfn)); __remove_pages(zone, start_pfn, nr_pages, altmap); } -#endif #endif /* CONFIG_MEMORY_HOTPLUG */ diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index f265a4316179..4068abb9427f 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c @@ -860,7 +860,6 @@ int arch_add_memory(int nid, u64 start, u64 size, return __add_pages(nid, start_pfn, nr_pages, restrictions); } -#ifdef CONFIG_MEMORY_HOTREMOVE void arch_remove_memory(int nid, u64 start, u64 size, struct vmem_altmap *altmap) { @@ -872,7 +871,6 @@ void arch_remove_memory(int nid, u64 start, u64 size, __remove_pages(zone, start_pfn, nr_pages, altmap); } #endif -#endif int kernel_set_to_readonly __read_mostly; diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index 08bbf648827b..5a289a2ab108 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -1198,7 +1198,6 @@ void __ref vmemmap_free(unsigned long start, unsigned long end, remove_pagetable(start, end, false, altmap); } -#ifdef CONFIG_MEMORY_HOTREMOVE static void __meminit kernel_physical_mapping_remove(unsigned long start, unsigned long end) { @@ -1219,7 +1218,6 @@ void __ref arch_remove_memory(int nid, u64 start, u64 size, __remove_pages(zone, start_pfn, nr_pages, altmap); kernel_physical_mapping_remove(start, start + size); } -#endif #endif /* CONFIG_MEMORY_HOTPLUG */ static struct kcore_list kcore_vsyscall; diff --git a/drivers/base/memory.c b/drivers/base/memory.c index e0aa7f9abb36..92459d6f12be 100644 --- a/drivers/base/memory.c +++ b/drivers/base/memory.c @@ -723,7 +723,6 @@ out: return ret; } -#ifdef CONFIG_MEMORY_HOTREMOVE static void unregister_memory(struct memory_block *memory) { @@ -762,7 +761,6 @@ void 
unregister_memory_section(struct mem_section *section) out_unlock: mutex_unlock(&mem_sysfs_mutex); } -#endif /* CONFIG_MEMORY_HOTREMOVE */ /* return true if the memory block is offlined, otherwise, return false */ bool is_memblock_offlined(struct memory_block *mem) diff --git a/include/linux/memory.h b/include/linux/memory.h index e1dc1bb2b787..474c7c60c8f2 100644 --- a/include/linux/memory.h +++ b/include/linux/memory.h @@ -112,9 +112,7 @@ extern void unregister_memory_notifier(struct notifier_block *nb); extern int register_memory_isolate_notifier(struct notifier_block *nb); extern void unregister_memory_isolate_notifier(struct notifier_block *nb); int hotplug_memory_register(int nid, struct mem_section *section); -#ifdef CONFIG_MEMORY_HOTREMOVE extern void unregister_memory_section(struct mem_section *); -#endif extern int memory_dev_init(void); extern int memory_notify(unsigned long val, void *v); extern int memory_isolate_notify(unsigned long val, void *v); diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h index 988fde33cd7f..87bf9c4a889e 100644 --- a/include/linux/memory_hotplug.h +++ b/include/linux/memory_hotplug.h @@ -123,12 +123,10 @@ static inline bool movable_node_is_enabled(void) return movable_node_enabled; } -#ifdef CONFIG_MEMORY_HOTREMOVE extern void arch_remove_memory(int nid, u64 start, u64 size, struct vmem_altmap *altmap); extern void __remove_pages(struct zone *zone, unsigned long start_pfn, unsigned long nr_pages, struct vmem_altmap *altmap); -#endif /* CONFIG_MEMORY_HOTREMOVE */ /* * Do we want sysfs memblock files created. 
This will allow userspace to online diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index a8c25fd85ee3..bc11888d5d7e 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -318,7 +318,6 @@ out: return err; } -#ifdef CONFIG_MEMORY_HOTREMOVE /* find the smallest valid pfn in the range [start_pfn, end_pfn) */ static unsigned long find_smallest_section_pfn(int nid, struct zone *zone, unsigned long start_pfn, @@ -580,7 +579,6 @@ void __remove_pages(struct zone *zone, unsigned long phys_start_pfn, set_zone_contiguous(zone); } -#endif /* CONFIG_MEMORY_HOTREMOVE */ int set_online_page_callback(online_page_callback_t callback) { diff --git a/mm/sparse.c b/mm/sparse.c index fd13166949b5..d1d5e05f5b8d 100644 --- a/mm/sparse.c +++ b/mm/sparse.c @@ -604,7 +604,6 @@ static void __kfree_section_memmap(struct page *memmap, vmemmap_free(start, end, altmap); } -#ifdef CONFIG_MEMORY_HOTREMOVE static void free_map_bootmem(struct page *memmap) { unsigned long start = (unsigned long)memmap; @@ -612,7 +611,6 @@ static void free_map_bootmem(struct page *memmap) vmemmap_free(start, end, NULL); } -#endif /* CONFIG_MEMORY_HOTREMOVE */ #else static struct page *__kmalloc_section_memmap(void) { @@ -651,7 +649,6 @@ static void __kfree_section_memmap(struct page *memmap, get_order(sizeof(struct page) * PAGES_PER_SECTION)); } -#ifdef CONFIG_MEMORY_HOTREMOVE static void free_map_bootmem(struct page *memmap) { unsigned long maps_section_nr, removing_section_nr, i; @@ -681,7 +678,6 @@ static void free_map_bootmem(struct page *memmap) put_page_bootmem(page); } } -#endif /* CONFIG_MEMORY_HOTREMOVE */ #endif /* CONFIG_SPARSEMEM_VMEMMAP */ /** @@ -746,7 +742,6 @@ out: return ret; } -#ifdef CONFIG_MEMORY_HOTREMOVE #ifdef CONFIG_MEMORY_FAILURE static void clear_hwpoisoned_pages(struct page *memmap, int nr_pages) { @@ -823,5 +818,4 @@ void sparse_remove_one_section(struct zone *zone, struct mem_section *ms, PAGES_PER_SECTION - map_offset); free_section_usemap(memmap, usemap, altmap); } 
-#endif /* CONFIG_MEMORY_HOTREMOVE */ #endif /* CONFIG_MEMORY_HOTPLUG */ From db051a0dac13db24d58470d75cee0ce7c6b031a1 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Thu, 18 Jul 2019 15:56:56 -0700 Subject: [PATCH 07/38] mm/memory_hotplug: create memory block devices after arch_add_memory() Only memory to be added to the buddy and to be onlined/offlined by user space using /sys/devices/system/memory/... needs (and should have!) memory block devices. Factor out creation of memory block devices. Create all devices after arch_add_memory() succeeded. We can later drop the want_memblock parameter, because it is now effectively stale. Only after memory block devices have been added, memory can be onlined by user space. This implies, that memory is not visible to user space at all before arch_add_memory() succeeded. While at it - use WARN_ON_ONCE instead of BUG_ON in moved unregister_memory() - introduce find_memory_block_by_id() to search via block id - Use find_memory_block_by_id() in init_memory_block() to catch duplicates Link: http://lkml.kernel.org/r/20190527111152.16324-8-david@redhat.com Signed-off-by: David Hildenbrand Reviewed-by: Pavel Tatashin Acked-by: Michal Hocko Cc: Greg Kroah-Hartman Cc: "Rafael J. Wysocki" Cc: David Hildenbrand Cc: "mike.travis@hpe.com" Cc: Ingo Molnar Cc: Andrew Banman Cc: Oscar Salvador Cc: Qian Cai Cc: Wei Yang Cc: Arun KS Cc: Mathieu Malaterre Cc: Alex Deucher Cc: Andy Lutomirski Cc: Anshuman Khandual Cc: Ard Biesheuvel Cc: Baoquan He Cc: Benjamin Herrenschmidt Cc: Borislav Petkov Cc: Catalin Marinas Cc: Chintan Pandya Cc: Christophe Leroy Cc: Chris Wilson Cc: Dan Williams Cc: Dave Hansen Cc: "David S. Miller" Cc: Fenghua Yu Cc: Heiko Carstens Cc: "H. Peter Anvin" Cc: Jonathan Cameron Cc: Joonsoo Kim Cc: Jun Yao Cc: "Kirill A. 
Shutemov" Cc: Logan Gunthorpe Cc: Mark Brown Cc: Mark Rutland Cc: Masahiro Yamada Cc: Michael Ellerman Cc: Mike Rapoport Cc: Nicholas Piggin Cc: Oscar Salvador Cc: Paul Mackerras Cc: Peter Zijlstra Cc: Rich Felker Cc: Rob Herring Cc: Robin Murphy Cc: Thomas Gleixner Cc: Tony Luck Cc: Vasily Gorbik Cc: Will Deacon Cc: Yoshinori Sato Cc: Yu Zhao Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/base/memory.c | 94 +++++++++++++++++++++++++++--------------- include/linux/memory.h | 2 +- mm/memory_hotplug.c | 15 +++---- 3 files changed, 69 insertions(+), 42 deletions(-) diff --git a/drivers/base/memory.c b/drivers/base/memory.c index 92459d6f12be..18a30c3ac0ef 100644 --- a/drivers/base/memory.c +++ b/drivers/base/memory.c @@ -39,6 +39,11 @@ static inline int base_memory_block_id(int section_nr) return section_nr / sections_per_block; } +static inline int pfn_to_block_id(unsigned long pfn) +{ + return base_memory_block_id(pfn_to_section_nr(pfn)); +} + static int memory_subsys_online(struct device *dev); static int memory_subsys_offline(struct device *dev); @@ -582,10 +587,9 @@ int __weak arch_get_memory_phys_device(unsigned long start_pfn) * A reference for the returned object is held and the reference for the * hinted object is released. */ -struct memory_block *find_memory_block_hinted(struct mem_section *section, - struct memory_block *hint) +static struct memory_block *find_memory_block_by_id(int block_id, + struct memory_block *hint) { - int block_id = base_memory_block_id(__section_nr(section)); struct device *hintdev = hint ? 
&hint->dev : NULL; struct device *dev; @@ -597,6 +601,14 @@ struct memory_block *find_memory_block_hinted(struct mem_section *section, return to_memory_block(dev); } +struct memory_block *find_memory_block_hinted(struct mem_section *section, + struct memory_block *hint) +{ + int block_id = base_memory_block_id(__section_nr(section)); + + return find_memory_block_by_id(block_id, hint); +} + /* * For now, we have a linear search to go find the appropriate * memory_block corresponding to a particular phys_index. If @@ -658,6 +670,11 @@ static int init_memory_block(struct memory_block **memory, int block_id, unsigned long start_pfn; int ret = 0; + mem = find_memory_block_by_id(block_id, NULL); + if (mem) { + put_device(&mem->dev); + return -EEXIST; + } mem = kzalloc(sizeof(*mem), GFP_KERNEL); if (!mem) return -ENOMEM; @@ -695,44 +712,53 @@ static int add_memory_block(int base_section_nr) return 0; } -/* - * need an interface for the VM to add new memory regions, - * but without onlining it. - */ -int hotplug_memory_register(int nid, struct mem_section *section) +static void unregister_memory(struct memory_block *memory) { - int block_id = base_memory_block_id(__section_nr(section)); - int ret = 0; - struct memory_block *mem; - - mutex_lock(&mem_sysfs_mutex); - - mem = find_memory_block(section); - if (mem) { - mem->section_count++; - put_device(&mem->dev); - } else { - ret = init_memory_block(&mem, block_id, MEM_OFFLINE); - if (ret) - goto out; - mem->section_count++; - } - -out: - mutex_unlock(&mem_sysfs_mutex); - return ret; -} - -static void -unregister_memory(struct memory_block *memory) -{ - BUG_ON(memory->dev.bus != &memory_subsys); + if (WARN_ON_ONCE(memory->dev.bus != &memory_subsys)) + return; /* drop the ref. we got via find_memory_block() */ put_device(&memory->dev); device_unregister(&memory->dev); } +/* + * Create memory block devices for the given memory area. Start and size + * have to be aligned to memory block granularity. 
Memory block devices + * will be initialized as offline. + */ +int create_memory_block_devices(unsigned long start, unsigned long size) +{ + const int start_block_id = pfn_to_block_id(PFN_DOWN(start)); + int end_block_id = pfn_to_block_id(PFN_DOWN(start + size)); + struct memory_block *mem; + unsigned long block_id; + int ret = 0; + + if (WARN_ON_ONCE(!IS_ALIGNED(start, memory_block_size_bytes()) || + !IS_ALIGNED(size, memory_block_size_bytes()))) + return -EINVAL; + + mutex_lock(&mem_sysfs_mutex); + for (block_id = start_block_id; block_id != end_block_id; block_id++) { + ret = init_memory_block(&mem, block_id, MEM_OFFLINE); + if (ret) + break; + mem->section_count = sections_per_block; + } + if (ret) { + end_block_id = block_id; + for (block_id = start_block_id; block_id != end_block_id; + block_id++) { + mem = find_memory_block_by_id(block_id, NULL); + mem->section_count = 0; + unregister_memory(mem); + } + } + mutex_unlock(&mem_sysfs_mutex); + return ret; +} + void unregister_memory_section(struct mem_section *section) { struct memory_block *mem; diff --git a/include/linux/memory.h b/include/linux/memory.h index 474c7c60c8f2..db3e8567f900 100644 --- a/include/linux/memory.h +++ b/include/linux/memory.h @@ -111,7 +111,7 @@ extern int register_memory_notifier(struct notifier_block *nb); extern void unregister_memory_notifier(struct notifier_block *nb); extern int register_memory_isolate_notifier(struct notifier_block *nb); extern void unregister_memory_isolate_notifier(struct notifier_block *nb); -int hotplug_memory_register(int nid, struct mem_section *section); +int create_memory_block_devices(unsigned long start, unsigned long size); extern void unregister_memory_section(struct mem_section *); extern int memory_dev_init(void); extern int memory_notify(unsigned long val, void *v); diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index bc11888d5d7e..78291526eb4d 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -259,13 +259,7 @@ static int 
__meminit __add_section(int nid, unsigned long phys_start_pfn, return -EEXIST; ret = sparse_add_one_section(nid, phys_start_pfn, altmap); - if (ret < 0) - return ret; - - if (!want_memblock) - return 0; - - return hotplug_memory_register(nid, __pfn_to_section(phys_start_pfn)); + return ret < 0 ? ret : 0; } /* @@ -1105,6 +1099,13 @@ int __ref add_memory_resource(int nid, struct resource *res) if (ret < 0) goto error; + /* create memory block devices after memory was added */ + ret = create_memory_block_devices(start, size); + if (ret) { + arch_remove_memory(nid, start, size, NULL); + goto error; + } + if (new_node) { /* If sysfs file of new node can't be created, cpu on the node * can't be hot-added. There is no rollback way now. From 05f800a0bd08e14606ac63e0a5c63ed6880acaab Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Thu, 18 Jul 2019 15:57:01 -0700 Subject: [PATCH 08/38] mm/memory_hotplug: drop MHP_MEMBLOCK_API No longer needed, the callers of arch_add_memory() can handle this manually. Link: http://lkml.kernel.org/r/20190527111152.16324-9-david@redhat.com Signed-off-by: David Hildenbrand Reviewed-by: Wei Yang Acked-by: Michal Hocko Cc: David Hildenbrand Cc: Oscar Salvador Cc: Pavel Tatashin Cc: Joonsoo Kim Cc: Qian Cai Cc: Arun KS Cc: Mathieu Malaterre Cc: Mike Rapoport Cc: Alex Deucher Cc: Andrew Banman Cc: Andy Lutomirski Cc: Anshuman Khandual Cc: Ard Biesheuvel Cc: Baoquan He Cc: Benjamin Herrenschmidt Cc: Borislav Petkov Cc: Catalin Marinas Cc: Chintan Pandya Cc: Christophe Leroy Cc: Chris Wilson Cc: Dan Williams Cc: Dave Hansen Cc: "David S. Miller" Cc: Fenghua Yu Cc: Greg Kroah-Hartman Cc: Heiko Carstens Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: Jonathan Cameron Cc: Jun Yao Cc: "Kirill A. Shutemov" Cc: Logan Gunthorpe Cc: Mark Brown Cc: Mark Rutland Cc: Masahiro Yamada Cc: Michael Ellerman Cc: Mike Rapoport Cc: "mike.travis@hpe.com" Cc: Nicholas Piggin Cc: Oscar Salvador Cc: Paul Mackerras Cc: Peter Zijlstra Cc: "Rafael J. 
Wysocki" Cc: Rich Felker Cc: Rob Herring Cc: Robin Murphy Cc: Thomas Gleixner Cc: Tony Luck Cc: Vasily Gorbik Cc: Will Deacon Cc: Yoshinori Sato Cc: Yu Zhao Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memory_hotplug.h | 8 -------- mm/memory_hotplug.c | 9 +++------ 2 files changed, 3 insertions(+), 14 deletions(-) diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h index 87bf9c4a889e..36c514b80cf1 100644 --- a/include/linux/memory_hotplug.h +++ b/include/linux/memory_hotplug.h @@ -128,14 +128,6 @@ extern void arch_remove_memory(int nid, u64 start, u64 size, extern void __remove_pages(struct zone *zone, unsigned long start_pfn, unsigned long nr_pages, struct vmem_altmap *altmap); -/* - * Do we want sysfs memblock files created. This will allow userspace to online - * and offline memory explicitly. Lack of this bit means that the caller has to - * call move_pfn_range_to_zone to finish the initialization. - */ - -#define MHP_MEMBLOCK_API (1<<0) - /* reasonably generic interface to expand the physical pages */ extern int __add_pages(int nid, unsigned long start_pfn, unsigned long nr_pages, struct mhp_restrictions *restrictions); diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 78291526eb4d..fb9dc3fa1138 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -251,7 +251,7 @@ void __init register_page_bootmem_info_node(struct pglist_data *pgdat) #endif /* CONFIG_HAVE_BOOTMEM_INFO_NODE */ static int __meminit __add_section(int nid, unsigned long phys_start_pfn, - struct vmem_altmap *altmap, bool want_memblock) + struct vmem_altmap *altmap) { int ret; @@ -294,8 +294,7 @@ int __ref __add_pages(int nid, unsigned long phys_start_pfn, } for (i = start_sec; i <= end_sec; i++) { - err = __add_section(nid, section_nr_to_pfn(i), altmap, - restrictions->flags & MHP_MEMBLOCK_API); + err = __add_section(nid, section_nr_to_pfn(i), altmap); /* * EEXIST is finally dealt with by ioresource collision @@ -1065,9 
+1064,7 @@ static int online_memory_block(struct memory_block *mem, void *arg) */ int __ref add_memory_resource(int nid, struct resource *res) { - struct mhp_restrictions restrictions = { - .flags = MHP_MEMBLOCK_API, - }; + struct mhp_restrictions restrictions = {}; u64 start, size; bool new_node = false; int ret; From 4c4b7f9ba9486c565aead99a198ceeef73ae81f6 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Thu, 18 Jul 2019 15:57:06 -0700 Subject: [PATCH 09/38] mm/memory_hotplug: remove memory block devices before arch_remove_memory() Let's factor out removing of memory block devices, which is only necessary for memory added via add_memory() and friends that created memory block devices. Remove the devices before calling arch_remove_memory(). This finishes factoring out memory block device handling from arch_add_memory() and arch_remove_memory(). Link: http://lkml.kernel.org/r/20190527111152.16324-10-david@redhat.com Signed-off-by: David Hildenbrand Reviewed-by: Dan Williams Acked-by: Michal Hocko Cc: Greg Kroah-Hartman Cc: "Rafael J. Wysocki" Cc: David Hildenbrand Cc: "mike.travis@hpe.com" Cc: Andrew Banman Cc: Ingo Molnar Cc: Alex Deucher Cc: "David S. Miller" Cc: Mark Brown Cc: Chris Wilson Cc: Oscar Salvador Cc: Jonathan Cameron Cc: Arun KS Cc: Mathieu Malaterre Cc: Andy Lutomirski Cc: Anshuman Khandual Cc: Ard Biesheuvel Cc: Baoquan He Cc: Benjamin Herrenschmidt Cc: Borislav Petkov Cc: Catalin Marinas Cc: Chintan Pandya Cc: Christophe Leroy Cc: Dave Hansen Cc: Fenghua Yu Cc: Heiko Carstens Cc: "H. Peter Anvin" Cc: Joonsoo Kim Cc: Jun Yao Cc: "Kirill A. 
Shutemov" Cc: Logan Gunthorpe Cc: Mark Rutland Cc: Masahiro Yamada Cc: Michael Ellerman Cc: Mike Rapoport Cc: Nicholas Piggin Cc: Oscar Salvador Cc: Paul Mackerras Cc: Pavel Tatashin Cc: Peter Zijlstra Cc: Qian Cai Cc: Rich Felker Cc: Rob Herring Cc: Robin Murphy Cc: Thomas Gleixner Cc: Tony Luck Cc: Vasily Gorbik Cc: Wei Yang Cc: Will Deacon Cc: Yoshinori Sato Cc: Yu Zhao Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/base/memory.c | 37 ++++++++++++++++++------------------- drivers/base/node.c | 11 ++++++----- include/linux/memory.h | 2 +- include/linux/node.h | 6 ++---- mm/memory_hotplug.c | 5 +++-- 5 files changed, 30 insertions(+), 31 deletions(-) diff --git a/drivers/base/memory.c b/drivers/base/memory.c index 18a30c3ac0ef..826dd76f662e 100644 --- a/drivers/base/memory.c +++ b/drivers/base/memory.c @@ -759,32 +759,31 @@ int create_memory_block_devices(unsigned long start, unsigned long size) return ret; } -void unregister_memory_section(struct mem_section *section) +/* + * Remove memory block devices for the given memory area. Start and size + * have to be aligned to memory block granularity. Memory block devices + * have to be offline. + */ +void remove_memory_block_devices(unsigned long start, unsigned long size) { + const int start_block_id = pfn_to_block_id(PFN_DOWN(start)); + const int end_block_id = pfn_to_block_id(PFN_DOWN(start + size)); struct memory_block *mem; + int block_id; - if (WARN_ON_ONCE(!present_section(section))) + if (WARN_ON_ONCE(!IS_ALIGNED(start, memory_block_size_bytes()) || + !IS_ALIGNED(size, memory_block_size_bytes()))) return; mutex_lock(&mem_sysfs_mutex); - - /* - * Some users of the memory hotplug do not want/need memblock to - * track all sections. Skip over those. 
- */ - mem = find_memory_block(section); - if (!mem) - goto out_unlock; - - unregister_mem_sect_under_nodes(mem, __section_nr(section)); - - mem->section_count--; - if (mem->section_count == 0) + for (block_id = start_block_id; block_id != end_block_id; block_id++) { + mem = find_memory_block_by_id(block_id, NULL); + if (WARN_ON_ONCE(!mem)) + continue; + mem->section_count = 0; + unregister_memory_block_under_nodes(mem); unregister_memory(mem); - else - put_device(&mem->dev); - -out_unlock: + } mutex_unlock(&mem_sysfs_mutex); } diff --git a/drivers/base/node.c b/drivers/base/node.c index aa878fbcf705..0b0f38c2c7cd 100644 --- a/drivers/base/node.c +++ b/drivers/base/node.c @@ -802,9 +802,10 @@ int register_mem_sect_under_node(struct memory_block *mem_blk, void *arg) return 0; } -/* unregister memory section under all nodes that it spans */ -int unregister_mem_sect_under_nodes(struct memory_block *mem_blk, - unsigned long phys_index) +/* + * Unregister memory block device under all nodes that it spans. 
+ */ +int unregister_memory_block_under_nodes(struct memory_block *mem_blk) { NODEMASK_ALLOC(nodemask_t, unlinked_nodes, GFP_KERNEL); unsigned long pfn, sect_start_pfn, sect_end_pfn; @@ -817,8 +818,8 @@ int unregister_mem_sect_under_nodes(struct memory_block *mem_blk, return -ENOMEM; nodes_clear(*unlinked_nodes); - sect_start_pfn = section_nr_to_pfn(phys_index); - sect_end_pfn = sect_start_pfn + PAGES_PER_SECTION - 1; + sect_start_pfn = section_nr_to_pfn(mem_blk->start_section_nr); + sect_end_pfn = section_nr_to_pfn(mem_blk->end_section_nr); for (pfn = sect_start_pfn; pfn <= sect_end_pfn; pfn++) { int nid; diff --git a/include/linux/memory.h b/include/linux/memory.h index db3e8567f900..f26a5417ec5d 100644 --- a/include/linux/memory.h +++ b/include/linux/memory.h @@ -112,7 +112,7 @@ extern void unregister_memory_notifier(struct notifier_block *nb); extern int register_memory_isolate_notifier(struct notifier_block *nb); extern void unregister_memory_isolate_notifier(struct notifier_block *nb); int create_memory_block_devices(unsigned long start, unsigned long size); -extern void unregister_memory_section(struct mem_section *); +void remove_memory_block_devices(unsigned long start, unsigned long size); extern int memory_dev_init(void); extern int memory_notify(unsigned long val, void *v); extern int memory_isolate_notify(unsigned long val, void *v); diff --git a/include/linux/node.h b/include/linux/node.h index 1a557c589ecb..02a29e71b175 100644 --- a/include/linux/node.h +++ b/include/linux/node.h @@ -139,8 +139,7 @@ extern int register_cpu_under_node(unsigned int cpu, unsigned int nid); extern int unregister_cpu_under_node(unsigned int cpu, unsigned int nid); extern int register_mem_sect_under_node(struct memory_block *mem_blk, void *arg); -extern int unregister_mem_sect_under_nodes(struct memory_block *mem_blk, - unsigned long phys_index); +extern int unregister_memory_block_under_nodes(struct memory_block *mem_blk); extern int 
register_memory_node_under_compute_node(unsigned int mem_nid, unsigned int cpu_nid, @@ -176,8 +175,7 @@ static inline int register_mem_sect_under_node(struct memory_block *mem_blk, { return 0; } -static inline int unregister_mem_sect_under_nodes(struct memory_block *mem_blk, - unsigned long phys_index) +static inline int unregister_memory_block_under_nodes(struct memory_block *mem_blk) { return 0; } diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index fb9dc3fa1138..37c861e7a717 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -520,8 +520,6 @@ static void __remove_section(struct zone *zone, struct mem_section *ms, if (WARN_ON_ONCE(!valid_section(ms))) return; - unregister_memory_section(ms); - scn_nr = __section_nr(ms); start_pfn = section_nr_to_pfn((unsigned long)scn_nr); __remove_zone(zone, start_pfn); @@ -1834,6 +1832,9 @@ static int __ref try_remove_memory(int nid, u64 start, u64 size) memblock_free(start, size); memblock_remove(start, size); + /* remove memory block devices before removing memory */ + remove_memory_block_devices(start, size); + arch_remove_memory(nid, start, size, NULL); __release_memory_resource(start, size); From a31b264c2b415b29660da0bc2ba291a98629ce51 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Thu, 18 Jul 2019 15:57:12 -0700 Subject: [PATCH 10/38] mm/memory_hotplug: make unregister_memory_block_under_nodes() never fail We really don't want anything during memory hotunplug to fail. We always pass a valid memory block device, that check can go. Avoid allocating memory and eventually failing. As we are always called under lock, we can use a static piece of memory. This avoids having to put the structure onto the stack, having to guess about the stack size of callers. Patch inspired by a patch from Oscar Salvador. In the future, there might be no need to iterate over nodes at all. mem->nid should tell us exactly what to remove. 
Memory block devices with mixed nodes (added during boot) should be properly fenced off and never removed. Link: http://lkml.kernel.org/r/20190527111152.16324-11-david@redhat.com Signed-off-by: David Hildenbrand Reviewed-by: Wei Yang Reviewed-by: Oscar Salvador Acked-by: Michal Hocko Cc: Greg Kroah-Hartman Cc: "Rafael J. Wysocki" Cc: Alex Deucher Cc: "David S. Miller" Cc: Mark Brown Cc: Chris Wilson Cc: David Hildenbrand Cc: Jonathan Cameron Cc: Andrew Banman Cc: Andy Lutomirski Cc: Anshuman Khandual Cc: Ard Biesheuvel Cc: Arun KS Cc: Baoquan He Cc: Benjamin Herrenschmidt Cc: Borislav Petkov Cc: Catalin Marinas Cc: Chintan Pandya Cc: Christophe Leroy Cc: Dan Williams Cc: Dave Hansen Cc: Fenghua Yu Cc: Heiko Carstens Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: Joonsoo Kim Cc: Jun Yao Cc: "Kirill A. Shutemov" Cc: Logan Gunthorpe Cc: Mark Rutland Cc: Masahiro Yamada Cc: Mathieu Malaterre Cc: Michael Ellerman Cc: Mike Rapoport Cc: "mike.travis@hpe.com" Cc: Nicholas Piggin Cc: Paul Mackerras Cc: Pavel Tatashin Cc: Peter Zijlstra Cc: Qian Cai Cc: Rich Felker Cc: Rob Herring Cc: Robin Murphy Cc: Thomas Gleixner Cc: Tony Luck Cc: Vasily Gorbik Cc: Will Deacon Cc: Yoshinori Sato Cc: Yu Zhao Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/base/node.c | 18 +++++------------- include/linux/node.h | 5 ++--- 2 files changed, 7 insertions(+), 16 deletions(-) diff --git a/drivers/base/node.c b/drivers/base/node.c index 0b0f38c2c7cd..beec80649b33 100644 --- a/drivers/base/node.c +++ b/drivers/base/node.c @@ -804,20 +804,14 @@ int register_mem_sect_under_node(struct memory_block *mem_blk, void *arg) /* * Unregister memory block device under all nodes that it spans. + * Has to be called with mem_sysfs_mutex held (due to unlinked_nodes). 
*/ -int unregister_memory_block_under_nodes(struct memory_block *mem_blk) +void unregister_memory_block_under_nodes(struct memory_block *mem_blk) { - NODEMASK_ALLOC(nodemask_t, unlinked_nodes, GFP_KERNEL); unsigned long pfn, sect_start_pfn, sect_end_pfn; + static nodemask_t unlinked_nodes; - if (!mem_blk) { - NODEMASK_FREE(unlinked_nodes); - return -EFAULT; - } - if (!unlinked_nodes) - return -ENOMEM; - nodes_clear(*unlinked_nodes); - + nodes_clear(unlinked_nodes); sect_start_pfn = section_nr_to_pfn(mem_blk->start_section_nr); sect_end_pfn = section_nr_to_pfn(mem_blk->end_section_nr); for (pfn = sect_start_pfn; pfn <= sect_end_pfn; pfn++) { @@ -828,15 +822,13 @@ int unregister_memory_block_under_nodes(struct memory_block *mem_blk) continue; if (!node_online(nid)) continue; - if (node_test_and_set(nid, *unlinked_nodes)) + if (node_test_and_set(nid, unlinked_nodes)) continue; sysfs_remove_link(&node_devices[nid]->dev.kobj, kobject_name(&mem_blk->dev.kobj)); sysfs_remove_link(&mem_blk->dev.kobj, kobject_name(&node_devices[nid]->dev.kobj)); } - NODEMASK_FREE(unlinked_nodes); - return 0; } int link_mem_sections(int nid, unsigned long start_pfn, unsigned long end_pfn) diff --git a/include/linux/node.h b/include/linux/node.h index 02a29e71b175..548c226966a2 100644 --- a/include/linux/node.h +++ b/include/linux/node.h @@ -139,7 +139,7 @@ extern int register_cpu_under_node(unsigned int cpu, unsigned int nid); extern int unregister_cpu_under_node(unsigned int cpu, unsigned int nid); extern int register_mem_sect_under_node(struct memory_block *mem_blk, void *arg); -extern int unregister_memory_block_under_nodes(struct memory_block *mem_blk); +extern void unregister_memory_block_under_nodes(struct memory_block *mem_blk); extern int register_memory_node_under_compute_node(unsigned int mem_nid, unsigned int cpu_nid, @@ -175,9 +175,8 @@ static inline int register_mem_sect_under_node(struct memory_block *mem_blk, { return 0; } -static inline int 
unregister_memory_block_under_nodes(struct memory_block *mem_blk) +static inline void unregister_memory_block_under_nodes(struct memory_block *mem_blk) { - return 0; } static inline void register_hugetlbfs_with_node(node_registration_func_t reg, From b9bf8d342d9b443c0d19aa57883d8ddb38d965de Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Thu, 18 Jul 2019 15:57:17 -0700 Subject: [PATCH 11/38] mm/memory_hotplug: remove "zone" parameter from sparse_remove_one_section The parameter is unused, so let's drop it. Memory removal paths should never care about zones. This is the job of memory offlining and will require more refactorings. Link: http://lkml.kernel.org/r/20190527111152.16324-12-david@redhat.com Signed-off-by: David Hildenbrand Reviewed-by: Dan Williams Reviewed-by: Wei Yang Reviewed-by: Oscar Salvador Acked-by: Michal Hocko Cc: Alex Deucher Cc: Andrew Banman Cc: Andy Lutomirski Cc: Anshuman Khandual Cc: Ard Biesheuvel Cc: Arun KS Cc: Baoquan He Cc: Benjamin Herrenschmidt Cc: Borislav Petkov Cc: Catalin Marinas Cc: Chintan Pandya Cc: Christophe Leroy Cc: Chris Wilson Cc: Dave Hansen Cc: "David S. Miller" Cc: Fenghua Yu Cc: Greg Kroah-Hartman Cc: Heiko Carstens Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: Jonathan Cameron Cc: Joonsoo Kim Cc: Jun Yao Cc: "Kirill A. Shutemov" Cc: Logan Gunthorpe Cc: Mark Brown Cc: Mark Rutland Cc: Masahiro Yamada Cc: Mathieu Malaterre Cc: Michael Ellerman Cc: Mike Rapoport Cc: "mike.travis@hpe.com" Cc: Nicholas Piggin Cc: Paul Mackerras Cc: Pavel Tatashin Cc: Peter Zijlstra Cc: Qian Cai Cc: "Rafael J. 
Wysocki" Cc: Rich Felker Cc: Rob Herring Cc: Robin Murphy Cc: Thomas Gleixner Cc: Tony Luck Cc: Vasily Gorbik Cc: Will Deacon Cc: Yoshinori Sato Cc: Yu Zhao Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memory_hotplug.h | 2 +- mm/memory_hotplug.c | 2 +- mm/sparse.c | 4 ++-- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h index 36c514b80cf1..79e0add6a597 100644 --- a/include/linux/memory_hotplug.h +++ b/include/linux/memory_hotplug.h @@ -350,7 +350,7 @@ extern void move_pfn_range_to_zone(struct zone *zone, unsigned long start_pfn, extern bool is_memblock_offlined(struct memory_block *mem); extern int sparse_add_one_section(int nid, unsigned long start_pfn, struct vmem_altmap *altmap); -extern void sparse_remove_one_section(struct zone *zone, struct mem_section *ms, +extern void sparse_remove_one_section(struct mem_section *ms, unsigned long map_offset, struct vmem_altmap *altmap); extern struct page *sparse_decode_mem_map(unsigned long coded_mem_map, unsigned long pnum); diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 37c861e7a717..d1d0ceaaca88 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -524,7 +524,7 @@ static void __remove_section(struct zone *zone, struct mem_section *ms, start_pfn = section_nr_to_pfn((unsigned long)scn_nr); __remove_zone(zone, start_pfn); - sparse_remove_one_section(zone, ms, map_offset, altmap); + sparse_remove_one_section(ms, map_offset, altmap); } /** diff --git a/mm/sparse.c b/mm/sparse.c index d1d5e05f5b8d..1552c855d62a 100644 --- a/mm/sparse.c +++ b/mm/sparse.c @@ -800,8 +800,8 @@ static void free_section_usemap(struct page *memmap, unsigned long *usemap, free_map_bootmem(memmap); } -void sparse_remove_one_section(struct zone *zone, struct mem_section *ms, - unsigned long map_offset, struct vmem_altmap *altmap) +void sparse_remove_one_section(struct mem_section *ms, unsigned long map_offset, + struct 
vmem_altmap *altmap) { struct page *memmap = NULL; unsigned long *usemap = NULL; From 26f26bedab337c9c7e1e55b21949a3e2e0d62840 Mon Sep 17 00:00:00 2001 From: Wei Yang Date: Thu, 18 Jul 2019 15:57:21 -0700 Subject: [PATCH 12/38] mm/sparse.c: set section nid for hot-add memory In case of NODE_NOT_IN_PAGE_FLAGS is set, we store section's node id in section_to_node_table[]. While for hot-add memory, this is missed. Without this information, page_to_nid() may not give the right node id. BTW, current online_pages works because it leverages nid in memory_block. But the granularity of node id should be mem_section wide. Link: http://lkml.kernel.org/r/20190618005537.18878-1-richardw.yang@linux.intel.com Signed-off-by: Wei Yang Reviewed-by: Oscar Salvador Reviewed-by: David Hildenbrand Reviewed-by: Anshuman Khandual Acked-by: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/sparse.c | 1 + 1 file changed, 1 insertion(+) diff --git a/mm/sparse.c b/mm/sparse.c index 1552c855d62a..fe44b2d3bd7e 100644 --- a/mm/sparse.c +++ b/mm/sparse.c @@ -731,6 +731,7 @@ int __meminit sparse_add_one_section(int nid, unsigned long start_pfn, */ page_init_poison(memmap, sizeof(struct page) * PAGES_PER_SECTION); + set_section_nid(section_nr, nid); section_mark_present(ms); sparse_init_one_section(ms, section_nr, memmap, usemap); From 43675e6fbbeadca90c6c5031557ff95e217e6d2f Mon Sep 17 00:00:00 2001 From: Yang Shi Date: Thu, 18 Jul 2019 15:57:24 -0700 Subject: [PATCH 13/38] mm: thp: make transhuge_vma_suitable available for anonymous THP transhuge_vma_suitable() was only available for shmem THP, but anonymous THP has the same check except pgoff check. And, it will be used for THP eligible check in the later patch, so make it available for all kind of THPs. This also helps reduce code duplication slightly. Since anonymous THP doesn't have to check pgoff, so make pgoff check shmem vma only. 
And regroup some functions in include/linux/mm.h to solve compile issue since transhuge_vma_suitable() needs call vma_is_anonymous() which was defined after huge_mm.h is included. [akpm@linux-foundation.org: fix typo] [yang.shi@linux.alibaba.com: v4] Link: http://lkml.kernel.org/r/1563400758-124759-2-git-send-email-yang.shi@linux.alibaba.com Link: http://lkml.kernel.org/r/1560401041-32207-2-git-send-email-yang.shi@linux.alibaba.com Signed-off-by: Yang Shi Acked-by: Hugh Dickins Cc: Kirill A. Shutemov Cc: Michal Hocko Cc: Vlastimil Babka Cc: David Rientjes Cc: Andrea Arcangeli Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/huge_mm.h | 23 +++++++++++++++++++++++ include/linux/mm.h | 34 +++++++++++++++++----------------- mm/huge_memory.c | 2 +- mm/memory.c | 13 ------------- 4 files changed, 41 insertions(+), 31 deletions(-) diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index 7cd5c150c21d..45ede62aa85b 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -121,6 +121,23 @@ static inline bool __transparent_hugepage_enabled(struct vm_area_struct *vma) bool transparent_hugepage_enabled(struct vm_area_struct *vma); +#define HPAGE_CACHE_INDEX_MASK (HPAGE_PMD_NR - 1) + +static inline bool transhuge_vma_suitable(struct vm_area_struct *vma, + unsigned long haddr) +{ + /* Don't have to check pgoff for anonymous vma */ + if (!vma_is_anonymous(vma)) { + if (((vma->vm_start >> PAGE_SHIFT) & HPAGE_CACHE_INDEX_MASK) != + (vma->vm_pgoff & HPAGE_CACHE_INDEX_MASK)) + return false; + } + + if (haddr < vma->vm_start || haddr + HPAGE_PMD_SIZE > vma->vm_end) + return false; + return true; +} + #define transparent_hugepage_use_zero_page() \ (transparent_hugepage_flags & \ (1<vm_ops = NULL; } +static inline bool vma_is_anonymous(struct vm_area_struct *vma) +{ + return !vma->vm_ops; +} + +#ifdef CONFIG_SHMEM +/* + * The vma_is_shmem is not inline because it is used only by slow + * paths in userfault. 
+ */ +bool vma_is_shmem(struct vm_area_struct *vma); +#else +static inline bool vma_is_shmem(struct vm_area_struct *vma) { return false; } +#endif + +int vma_is_stack_for_current(struct vm_area_struct *vma); + /* flush_tlb_range() takes a vma, not a mm, and can care about flags */ #define TLB_FLUSH_VMA(mm,flags) { .vm_mm = (mm), .vm_flags = (flags) } @@ -1620,23 +1637,6 @@ int clear_page_dirty_for_io(struct page *page); int get_cmdline(struct task_struct *task, char *buffer, int buflen); -static inline bool vma_is_anonymous(struct vm_area_struct *vma) -{ - return !vma->vm_ops; -} - -#ifdef CONFIG_SHMEM -/* - * The vma_is_shmem is not inline because it is used only by slow - * paths in userfault. - */ -bool vma_is_shmem(struct vm_area_struct *vma); -#else -static inline bool vma_is_shmem(struct vm_area_struct *vma) { return false; } -#endif - -int vma_is_stack_for_current(struct vm_area_struct *vma); - extern unsigned long move_page_tables(struct vm_area_struct *vma, unsigned long old_addr, struct vm_area_struct *new_vma, unsigned long new_addr, unsigned long len, diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 885642c82aaa..782dd1446a3e 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -689,7 +689,7 @@ vm_fault_t do_huge_pmd_anonymous_page(struct vm_fault *vmf) struct page *page; unsigned long haddr = vmf->address & HPAGE_PMD_MASK; - if (haddr < vma->vm_start || haddr + HPAGE_PMD_SIZE > vma->vm_end) + if (!transhuge_vma_suitable(vma, haddr)) return VM_FAULT_FALLBACK; if (unlikely(anon_vma_prepare(vma))) return VM_FAULT_OOM; diff --git a/mm/memory.c b/mm/memory.c index 89325f9c6173..e2bb51b6242e 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -3162,19 +3162,6 @@ map_pte: } #ifdef CONFIG_TRANSPARENT_HUGE_PAGECACHE - -#define HPAGE_CACHE_INDEX_MASK (HPAGE_PMD_NR - 1) -static inline bool transhuge_vma_suitable(struct vm_area_struct *vma, - unsigned long haddr) -{ - if (((vma->vm_start >> PAGE_SHIFT) & HPAGE_CACHE_INDEX_MASK) != - (vma->vm_pgoff & 
HPAGE_CACHE_INDEX_MASK)) - return false; - if (haddr < vma->vm_start || haddr + HPAGE_PMD_SIZE > vma->vm_end) - return false; - return true; -} - static void deposit_prealloc_pte(struct vm_fault *vmf) { struct vm_area_struct *vma = vmf->vma; From c06306696f8368b08774e2a743dbc52d92a61693 Mon Sep 17 00:00:00 2001 From: Yang Shi Date: Thu, 18 Jul 2019 15:57:27 -0700 Subject: [PATCH 14/38] mm: thp: fix false negative of shmem vma's THP eligibility Commit 7635d9cbe832 ("mm, thp, proc: report THP eligibility for each vma") introduced THPeligible bit for processes' smaps. But, when checking the eligibility for shmem vma, __transparent_hugepage_enabled() is called to override the result from shmem_huge_enabled(). It may result in the anonymous vma's THP flag override shmem's. For example, running a simple test which create THP for shmem, but with anonymous THP disabled, when reading the process's smaps, it may show: 7fc92ec00000-7fc92f000000 rw-s 00000000 00:14 27764 /dev/shm/test Size: 4096 kB ... [snip] ... ShmemPmdMapped: 4096 kB ... [snip] ... THPeligible: 0 And, /proc/meminfo does show THP allocated and PMD mapped too: ShmemHugePages: 4096 kB ShmemPmdMapped: 4096 kB This doesn't make too much sense. The shmem objects should be treated separately from anonymous THP. Calling shmem_huge_enabled() with checking MMF_DISABLE_THP sounds good enough. And, we could skip stack and dax vma check since we already checked if the vma is shmem already. Also check if vma is suitable for THP by calling transhuge_vma_suitable(). And minor fix to smaps output format and documentation. Link: http://lkml.kernel.org/r/1560401041-32207-3-git-send-email-yang.shi@linux.alibaba.com Fixes: 7635d9cbe832 ("mm, thp, proc: report THP eligibility for each vma") Signed-off-by: Yang Shi Acked-by: Hugh Dickins Cc: Kirill A. 
Shutemov Cc: Michal Hocko Cc: Vlastimil Babka Cc: David Rientjes Cc: Andrea Arcangeli Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/filesystems/proc.txt | 4 ++-- fs/proc/task_mmu.c | 3 ++- mm/huge_memory.c | 9 +++++++-- mm/shmem.c | 3 +++ 4 files changed, 14 insertions(+), 5 deletions(-) diff --git a/Documentation/filesystems/proc.txt b/Documentation/filesystems/proc.txt index fb4735fd73b0..99ca040e3f90 100644 --- a/Documentation/filesystems/proc.txt +++ b/Documentation/filesystems/proc.txt @@ -486,8 +486,8 @@ replaced by copy-on-write) part of the underlying shmem object out on swap. "SwapPss" shows proportional swap share of this mapping. Unlike "Swap", this does not take into account swapped out page of underlying shmem objects. "Locked" indicates whether the mapping is locked in memory or not. -"THPeligible" indicates whether the mapping is eligible for THP pages - 1 if -true, 0 otherwise. +"THPeligible" indicates whether the mapping is eligible for allocating THP +pages - 1 if true, 0 otherwise. It just shows the current status. "VmFlags" field deserves a separate description. 
This member represents the kernel flags associated with the particular virtual memory area in two letter encoded diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 818cedbed95f..731642e0f5a0 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -832,7 +832,8 @@ static int show_smap(struct seq_file *m, void *v) __show_smap(m, &mss, false); - seq_printf(m, "THPeligible: %d\n", transparent_hugepage_enabled(vma)); + seq_printf(m, "THPeligible: %d\n", + transparent_hugepage_enabled(vma)); if (arch_pkeys_enabled()) seq_printf(m, "ProtectionKey: %8u\n", vma_pkey(vma)); diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 782dd1446a3e..1334ede667a8 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -63,10 +63,15 @@ struct page *huge_zero_page __read_mostly; bool transparent_hugepage_enabled(struct vm_area_struct *vma) { + /* The addr is used to check if the vma size fits */ + unsigned long addr = (vma->vm_end & HPAGE_PMD_MASK) - HPAGE_PMD_SIZE; + + if (!transhuge_vma_suitable(vma, addr)) + return false; if (vma_is_anonymous(vma)) return __transparent_hugepage_enabled(vma); - if (vma_is_shmem(vma) && shmem_huge_enabled(vma)) - return __transparent_hugepage_enabled(vma); + if (vma_is_shmem(vma)) + return shmem_huge_enabled(vma); return false; } diff --git a/mm/shmem.c b/mm/shmem.c index 99497cb32e71..c88a30919ae5 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -3874,6 +3874,9 @@ bool shmem_huge_enabled(struct vm_area_struct *vma) loff_t i_size; pgoff_t off; + if ((vma->vm_flags & VM_NOHUGEPAGE) || + test_bit(MMF_DISABLE_THP, &vma->vm_mm->flags)) + return false; if (shmem_huge == SHMEM_HUGE_FORCE) return true; if (shmem_huge == SHMEM_HUGE_DENY) From 49f17c26c123b60fd1c74629eef077740d16ffc2 Mon Sep 17 00:00:00 2001 From: Nadav Amit Date: Thu, 18 Jul 2019 15:57:31 -0700 Subject: [PATCH 15/38] resource: fix locking in find_next_iomem_res() Since resources can be removed, locking should ensure that the resource is not removed while accessing it. 
However, find_next_iomem_res() does not hold the lock while copying the data of the resource. Keep holding the lock while the data is copied. While at it, change the return value to a more informative value. It is disregarded by the callers. [akpm@linux-foundation.org: fix find_next_iomem_res() documentation] Link: http://lkml.kernel.org/r/20190613045903.4922-2-namit@vmware.com Fixes: ff3cc952d3f00 ("resource: Add remove_resource interface") Signed-off-by: Nadav Amit Reviewed-by: Andrew Morton Reviewed-by: Dan Williams Cc: Borislav Petkov Cc: Toshi Kani Cc: Peter Zijlstra Cc: Dave Hansen Cc: Bjorn Helgaas Cc: Ingo Molnar Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/resource.c | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/kernel/resource.c b/kernel/resource.c index d22423e85cf8..3ced0cd45bdd 100644 --- a/kernel/resource.c +++ b/kernel/resource.c @@ -326,7 +326,7 @@ EXPORT_SYMBOL(release_resource); * * If a resource is found, returns 0 and @*res is overwritten with the part * of the resource that's within [@start..@end]; if none is found, returns - * -1 or -EINVAL for other invalid parameters. + * -ENODEV. Returns -EINVAL for invalid parameters. * * This function walks the whole tree and not just first level children * unless @first_lvl is true. @@ -365,16 +365,16 @@ static int find_next_iomem_res(resource_size_t start, resource_size_t end, break; } - read_unlock(&resource_lock); - if (!p) - return -1; + if (p) { + /* copy data */ + res->start = max(start, p->start); + res->end = min(end, p->end); + res->flags = p->flags; + res->desc = p->desc; + } - /* copy data */ - res->start = max(start, p->start); - res->end = min(end, p->end); - res->flags = p->flags; - res->desc = p->desc; - return 0; + read_unlock(&resource_lock); + return p ? 
0 : -ENODEV; } static int __walk_iomem_res_desc(resource_size_t start, resource_size_t end, From 756398750e11ade1e617cd2a8f8d66fe7ed637e1 Mon Sep 17 00:00:00 2001 From: Nadav Amit Date: Thu, 18 Jul 2019 15:57:34 -0700 Subject: [PATCH 16/38] resource: avoid unnecessary lookups in find_next_iomem_res() find_next_iomem_res() shows up to be a source for overhead in dax benchmarks. Improve performance by not considering children of the tree if the top level does not match. Since the range of the parents should include the range of the children such check is redundant. Running sysbench on dax (pmem emulation, with write_cache disabled): sysbench fileio --file-total-size=3G --file-test-mode=rndwr \ --file-io-mode=mmap --threads=4 --file-fsync-mode=fdatasync run Provides the following results: events (avg/stddev) ------------------- 5.2-rc3: 1247669.0000/16075.39 w/patch: 1286320.5000/16402.72 (+3%) Link: http://lkml.kernel.org/r/20190613045903.4922-3-namit@vmware.com Signed-off-by: Nadav Amit Cc: Borislav Petkov Cc: Toshi Kani Cc: Peter Zijlstra Cc: Dave Hansen Cc: Dan Williams Cc: Bjorn Helgaas Cc: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/resource.c | 29 ++++++++++++++++++++++------- 1 file changed, 22 insertions(+), 7 deletions(-) diff --git a/kernel/resource.c b/kernel/resource.c index 3ced0cd45bdd..7ea4306503c5 100644 --- a/kernel/resource.c +++ b/kernel/resource.c @@ -342,6 +342,7 @@ static int find_next_iomem_res(resource_size_t start, resource_size_t end, unsigned long flags, unsigned long desc, bool first_lvl, struct resource *res) { + bool siblings_only = true; struct resource *p; if (!res) @@ -352,17 +353,31 @@ static int find_next_iomem_res(resource_size_t start, resource_size_t end, read_lock(&resource_lock); - for (p = iomem_resource.child; p; p = next_resource(p, first_lvl)) { - if ((p->flags & flags) != flags) - continue; - if ((desc != IORES_DESC_NONE) && (desc != p->desc)) - continue; + for (p = 
iomem_resource.child; p; p = next_resource(p, siblings_only)) { + /* If we passed the resource we are looking for, stop */ if (p->start > end) { p = NULL; break; } - if ((p->end >= start) && (p->start <= end)) - break; + + /* Skip until we find a range that matches what we look for */ + if (p->end < start) + continue; + + /* + * Now that we found a range that matches what we look for, + * check the flags and the descriptor. If we were not asked to + * use only the first level, start looking at children as well. + */ + siblings_only = first_lvl; + + if ((p->flags & flags) != flags) + continue; + if ((desc != IORES_DESC_NONE) && (desc != p->desc)) + continue; + + /* Found a match, break */ + break; } if (p) { From 2491f0a2c0b117b9097e9c9eee0c21f2e5f716d7 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Thu, 18 Jul 2019 15:57:37 -0700 Subject: [PATCH 17/38] mm: section numbers use the type "unsigned long" Patch series "mm: Further memory block device cleanups", v1. Some further cleanups around memory block devices. Especially, clean up and simplify walk_memory_range(). Including some other minor cleanups. This patch (of 6): We are using a mixture of "int" and "unsigned long". Let's make this consistent by using "unsigned long" everywhere. We'll do the same with memory block ids next. While at it, turn the "unsigned long i" in removable_show() into an int - sections_per_block is an int. [akpm@linux-foundation.org: s/unsigned long i/unsigned long nr/] [david@redhat.com: v3] Link: http://lkml.kernel.org/r/20190620183139.4352-2-david@redhat.com Link: http://lkml.kernel.org/r/20190614100114.311-2-david@redhat.com Signed-off-by: David Hildenbrand Reviewed-by: Andrew Morton Cc: Greg Kroah-Hartman Cc: "Rafael J. 
Wysocki" Cc: Vlastimil Babka Cc: Michal Hocko Cc: Dan Williams Cc: Mel Gorman Cc: Wei Yang Cc: Johannes Weiner Cc: Arun KS Cc: Pavel Tatashin Cc: Oscar Salvador Cc: Stephen Rothwell Cc: Mike Rapoport Cc: Baoquan He Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/base/memory.c | 27 +++++++++++++-------------- include/linux/mmzone.h | 4 ++-- mm/sparse.c | 12 ++++++------ 3 files changed, 21 insertions(+), 22 deletions(-) diff --git a/drivers/base/memory.c b/drivers/base/memory.c index 826dd76f662e..5947b5a5686d 100644 --- a/drivers/base/memory.c +++ b/drivers/base/memory.c @@ -34,7 +34,7 @@ static DEFINE_MUTEX(mem_sysfs_mutex); static int sections_per_block; -static inline int base_memory_block_id(int section_nr) +static inline int base_memory_block_id(unsigned long section_nr) { return section_nr / sections_per_block; } @@ -131,9 +131,9 @@ static ssize_t phys_index_show(struct device *dev, static ssize_t removable_show(struct device *dev, struct device_attribute *attr, char *buf) { - unsigned long i, pfn; - int ret = 1; struct memory_block *mem = to_memory_block(dev); + unsigned long pfn; + int ret = 1, i; if (mem->state != MEM_ONLINE) goto out; @@ -691,15 +691,15 @@ static int init_memory_block(struct memory_block **memory, int block_id, return ret; } -static int add_memory_block(int base_section_nr) +static int add_memory_block(unsigned long base_section_nr) { + int ret, section_count = 0; struct memory_block *mem; - int i, ret, section_count = 0; + unsigned long nr; - for (i = base_section_nr; - i < base_section_nr + sections_per_block; - i++) - if (present_section_nr(i)) + for (nr = base_section_nr; nr < base_section_nr + sections_per_block; + nr++) + if (present_section_nr(nr)) section_count++; if (section_count == 0) @@ -822,10 +822,9 @@ static const struct attribute_group *memory_root_attr_groups[] = { */ int __init memory_dev_init(void) { - unsigned int i; int ret; int err; - unsigned long block_sz; + unsigned long block_sz, nr; ret = 
subsys_system_register(&memory_subsys, memory_root_attr_groups); if (ret) @@ -839,9 +838,9 @@ int __init memory_dev_init(void) * during boot and have been initialized */ mutex_lock(&mem_sysfs_mutex); - for (i = 0; i <= __highest_present_section_nr; - i += sections_per_block) { - err = add_memory_block(i); + for (nr = 0; nr <= __highest_present_section_nr; + nr += sections_per_block) { + err = add_memory_block(nr); if (!ret) ret = err; } diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 70394cabaf4e..298d1c3e4c2e 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -1219,7 +1219,7 @@ static inline struct mem_section *__nr_to_section(unsigned long nr) return NULL; return &mem_section[SECTION_NR_TO_ROOT(nr)][nr & SECTION_ROOT_MASK]; } -extern int __section_nr(struct mem_section* ms); +extern unsigned long __section_nr(struct mem_section *ms); extern unsigned long usemap_size(void); /* @@ -1291,7 +1291,7 @@ static inline struct mem_section *__pfn_to_section(unsigned long pfn) return __nr_to_section(pfn_to_section_nr(pfn)); } -extern int __highest_present_section_nr; +extern unsigned long __highest_present_section_nr; #ifndef CONFIG_HAVE_ARCH_PFN_VALID static inline int pfn_valid(unsigned long pfn) diff --git a/mm/sparse.c b/mm/sparse.c index fe44b2d3bd7e..b29534cea8c0 100644 --- a/mm/sparse.c +++ b/mm/sparse.c @@ -102,7 +102,7 @@ static inline int sparse_index_init(unsigned long section_nr, int nid) #endif #ifdef CONFIG_SPARSEMEM_EXTREME -int __section_nr(struct mem_section* ms) +unsigned long __section_nr(struct mem_section *ms) { unsigned long root_nr; struct mem_section *root = NULL; @@ -121,9 +121,9 @@ int __section_nr(struct mem_section* ms) return (root_nr * SECTIONS_PER_ROOT) + (ms - root); } #else -int __section_nr(struct mem_section* ms) +unsigned long __section_nr(struct mem_section *ms) { - return (int)(ms - mem_section[0]); + return (unsigned long)(ms - mem_section[0]); } #endif @@ -178,10 +178,10 @@ void __meminit 
mminit_validate_memmodel_limits(unsigned long *start_pfn, * Keeping track of this gives us an easy way to break out of * those loops early. */ -int __highest_present_section_nr; +unsigned long __highest_present_section_nr; static void section_mark_present(struct mem_section *ms) { - int section_nr = __section_nr(ms); + unsigned long section_nr = __section_nr(ms); if (section_nr > __highest_present_section_nr) __highest_present_section_nr = section_nr; @@ -189,7 +189,7 @@ static void section_mark_present(struct mem_section *ms) ms->section_mem_map |= SECTION_MARKED_PRESENT; } -static inline int next_present_section_nr(int section_nr) +static inline unsigned long next_present_section_nr(unsigned long section_nr) { do { section_nr++; From 90ec010fe0d690665852d6bac21643e9ae7affd8 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Thu, 18 Jul 2019 15:57:40 -0700 Subject: [PATCH 18/38] drivers/base/memory: use "unsigned long" for block ids Block ids are just shifted section numbers, so let's also use "unsigned long" for them, too. Link: http://lkml.kernel.org/r/20190614100114.311-3-david@redhat.com Signed-off-by: David Hildenbrand Reviewed-by: Andrew Morton Cc: Greg Kroah-Hartman Cc: "Rafael J. 
Wysocki" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/base/memory.c | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/drivers/base/memory.c b/drivers/base/memory.c index 5947b5a5686d..c54e80fd25a8 100644 --- a/drivers/base/memory.c +++ b/drivers/base/memory.c @@ -34,12 +34,12 @@ static DEFINE_MUTEX(mem_sysfs_mutex); static int sections_per_block; -static inline int base_memory_block_id(unsigned long section_nr) +static inline unsigned long base_memory_block_id(unsigned long section_nr) { return section_nr / sections_per_block; } -static inline int pfn_to_block_id(unsigned long pfn) +static inline unsigned long pfn_to_block_id(unsigned long pfn) { return base_memory_block_id(pfn_to_section_nr(pfn)); } @@ -587,7 +587,7 @@ int __weak arch_get_memory_phys_device(unsigned long start_pfn) * A reference for the returned object is held and the reference for the * hinted object is released. */ -static struct memory_block *find_memory_block_by_id(int block_id, +static struct memory_block *find_memory_block_by_id(unsigned long block_id, struct memory_block *hint) { struct device *hintdev = hint ? 
&hint->dev : NULL; @@ -604,7 +604,7 @@ static struct memory_block *find_memory_block_by_id(int block_id, struct memory_block *find_memory_block_hinted(struct mem_section *section, struct memory_block *hint) { - int block_id = base_memory_block_id(__section_nr(section)); + unsigned long block_id = base_memory_block_id(__section_nr(section)); return find_memory_block_by_id(block_id, hint); } @@ -663,8 +663,8 @@ int register_memory(struct memory_block *memory) return ret; } -static int init_memory_block(struct memory_block **memory, int block_id, - unsigned long state) +static int init_memory_block(struct memory_block **memory, + unsigned long block_id, unsigned long state) { struct memory_block *mem; unsigned long start_pfn; @@ -729,8 +729,8 @@ static void unregister_memory(struct memory_block *memory) */ int create_memory_block_devices(unsigned long start, unsigned long size) { - const int start_block_id = pfn_to_block_id(PFN_DOWN(start)); - int end_block_id = pfn_to_block_id(PFN_DOWN(start + size)); + const unsigned long start_block_id = pfn_to_block_id(PFN_DOWN(start)); + unsigned long end_block_id = pfn_to_block_id(PFN_DOWN(start + size)); struct memory_block *mem; unsigned long block_id; int ret = 0; @@ -766,10 +766,10 @@ int create_memory_block_devices(unsigned long start, unsigned long size) */ void remove_memory_block_devices(unsigned long start, unsigned long size) { - const int start_block_id = pfn_to_block_id(PFN_DOWN(start)); - const int end_block_id = pfn_to_block_id(PFN_DOWN(start + size)); + const unsigned long start_block_id = pfn_to_block_id(PFN_DOWN(start)); + const unsigned long end_block_id = pfn_to_block_id(PFN_DOWN(start + size)); struct memory_block *mem; - int block_id; + unsigned long block_id; if (WARN_ON_ONCE(!IS_ALIGNED(start, memory_block_size_bytes()) || !IS_ALIGNED(size, memory_block_size_bytes()))) From 8d595c4c0f768f19db043d378b22e98405f9fd47 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Thu, 18 Jul 2019 15:57:43 -0700 
Subject: [PATCH 19/38] mm: make register_mem_sect_under_node() static It is only used internally. Link: http://lkml.kernel.org/r/20190614100114.311-4-david@redhat.com Signed-off-by: David Hildenbrand Reviewed-by: Andrew Morton Cc: Greg Kroah-Hartman Cc: "Rafael J. Wysocki" Cc: Keith Busch Cc: Oscar Salvador Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/base/node.c | 3 ++- include/linux/node.h | 7 ------- 2 files changed, 2 insertions(+), 8 deletions(-) diff --git a/drivers/base/node.c b/drivers/base/node.c index beec80649b33..27391f1e8f60 100644 --- a/drivers/base/node.c +++ b/drivers/base/node.c @@ -753,7 +753,8 @@ static int __ref get_nid_for_pfn(unsigned long pfn) } /* register memory section under specified node if it spans that node */ -int register_mem_sect_under_node(struct memory_block *mem_blk, void *arg) +static int register_mem_sect_under_node(struct memory_block *mem_blk, + void *arg) { int ret, nid = *(int *)arg; unsigned long pfn, sect_start_pfn, sect_end_pfn; diff --git a/include/linux/node.h b/include/linux/node.h index 548c226966a2..4866f32a02d8 100644 --- a/include/linux/node.h +++ b/include/linux/node.h @@ -137,8 +137,6 @@ static inline int register_one_node(int nid) extern void unregister_one_node(int nid); extern int register_cpu_under_node(unsigned int cpu, unsigned int nid); extern int unregister_cpu_under_node(unsigned int cpu, unsigned int nid); -extern int register_mem_sect_under_node(struct memory_block *mem_blk, - void *arg); extern void unregister_memory_block_under_nodes(struct memory_block *mem_blk); extern int register_memory_node_under_compute_node(unsigned int mem_nid, @@ -170,11 +168,6 @@ static inline int unregister_cpu_under_node(unsigned int cpu, unsigned int nid) { return 0; } -static inline int register_mem_sect_under_node(struct memory_block *mem_blk, - void *arg) -{ - return 0; -} static inline void unregister_memory_block_under_nodes(struct memory_block *mem_blk) { } From 
fbcf73ce65827c3d8935f38b832a43153a0c78d1 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Thu, 18 Jul 2019 15:57:46 -0700 Subject: [PATCH 20/38] mm/memory_hotplug: rename walk_memory_range() and pass start+size instead of pfns walk_memory_range() was once used to iterate over sections. Now, it iterates over memory blocks. Rename the function, fixup the documentation. Also, pass start+size instead of PFNs, which is what most callers already have at hand. (we'll rework link_mem_sections() most probably soon) Follow-up patches will rework, simplify, and move walk_memory_blocks() to drivers/base/memory.c. Note: walk_memory_blocks() only works correctly right now if the start_pfn is aligned to a section start. This is the case right now, but we'll generalize the function in a follow up patch so the semantics match the documentation. [akpm@linux-foundation.org: remove unused variable] Link: http://lkml.kernel.org/r/20190614100114.311-5-david@redhat.com Signed-off-by: David Hildenbrand Reviewed-by: Andrew Morton Cc: Benjamin Herrenschmidt Cc: Paul Mackerras Cc: Michael Ellerman Cc: "Rafael J. 
Wysocki" Cc: Len Brown Cc: Greg Kroah-Hartman Cc: David Hildenbrand Cc: Rashmica Gupta Cc: Pavel Tatashin Cc: Anshuman Khandual Cc: Michael Neuling Cc: Thomas Gleixner Cc: Oscar Salvador Cc: Michal Hocko Cc: Wei Yang Cc: Juergen Gross Cc: Qian Cai Cc: Arun KS Cc: Nick Desaulniers Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/powerpc/platforms/powernv/memtrace.c | 23 ++++++++++---------- drivers/acpi/acpi_memhotplug.c | 19 ++++------------- drivers/base/node.c | 5 +++-- include/linux/memory_hotplug.h | 2 +- mm/memory_hotplug.c | 26 ++++++++++++----------- 5 files changed, 33 insertions(+), 42 deletions(-) diff --git a/arch/powerpc/platforms/powernv/memtrace.c b/arch/powerpc/platforms/powernv/memtrace.c index 5e53c1392d3b..eb2e75dac369 100644 --- a/arch/powerpc/platforms/powernv/memtrace.c +++ b/arch/powerpc/platforms/powernv/memtrace.c @@ -70,23 +70,23 @@ static int change_memblock_state(struct memory_block *mem, void *arg) /* called with device_hotplug_lock held */ static bool memtrace_offline_pages(u32 nid, u64 start_pfn, u64 nr_pages) { - u64 end_pfn = start_pfn + nr_pages - 1; + const unsigned long start = PFN_PHYS(start_pfn); + const unsigned long size = PFN_PHYS(nr_pages); - if (walk_memory_range(start_pfn, end_pfn, NULL, - check_memblock_online)) + if (walk_memory_blocks(start, size, NULL, check_memblock_online)) return false; - walk_memory_range(start_pfn, end_pfn, (void *)MEM_GOING_OFFLINE, - change_memblock_state); + walk_memory_blocks(start, size, (void *)MEM_GOING_OFFLINE, + change_memblock_state); if (offline_pages(start_pfn, nr_pages)) { - walk_memory_range(start_pfn, end_pfn, (void *)MEM_ONLINE, - change_memblock_state); + walk_memory_blocks(start, size, (void *)MEM_ONLINE, + change_memblock_state); return false; } - walk_memory_range(start_pfn, end_pfn, (void *)MEM_OFFLINE, - change_memblock_state); + walk_memory_blocks(start, size, (void *)MEM_OFFLINE, + change_memblock_state); return true; @@ -242,9 +242,8 @@ static int 
memtrace_online(void) */ if (!memhp_auto_online) { lock_device_hotplug(); - walk_memory_range(PFN_DOWN(ent->start), - PFN_UP(ent->start + ent->size - 1), - NULL, online_mem_block); + walk_memory_blocks(ent->start, ent->size, NULL, + online_mem_block); unlock_device_hotplug(); } diff --git a/drivers/acpi/acpi_memhotplug.c b/drivers/acpi/acpi_memhotplug.c index db013dc21c02..e294f44a7850 100644 --- a/drivers/acpi/acpi_memhotplug.c +++ b/drivers/acpi/acpi_memhotplug.c @@ -155,16 +155,6 @@ static int acpi_memory_check_device(struct acpi_memory_device *mem_device) return 0; } -static unsigned long acpi_meminfo_start_pfn(struct acpi_memory_info *info) -{ - return PFN_DOWN(info->start_addr); -} - -static unsigned long acpi_meminfo_end_pfn(struct acpi_memory_info *info) -{ - return PFN_UP(info->start_addr + info->length-1); -} - static int acpi_bind_memblk(struct memory_block *mem, void *arg) { return acpi_bind_one(&mem->dev, arg); @@ -173,9 +163,8 @@ static int acpi_bind_memblk(struct memory_block *mem, void *arg) static int acpi_bind_memory_blocks(struct acpi_memory_info *info, struct acpi_device *adev) { - return walk_memory_range(acpi_meminfo_start_pfn(info), - acpi_meminfo_end_pfn(info), adev, - acpi_bind_memblk); + return walk_memory_blocks(info->start_addr, info->length, adev, + acpi_bind_memblk); } static int acpi_unbind_memblk(struct memory_block *mem, void *arg) @@ -186,8 +175,8 @@ static int acpi_unbind_memblk(struct memory_block *mem, void *arg) static void acpi_unbind_memory_blocks(struct acpi_memory_info *info) { - walk_memory_range(acpi_meminfo_start_pfn(info), - acpi_meminfo_end_pfn(info), NULL, acpi_unbind_memblk); + walk_memory_blocks(info->start_addr, info->length, NULL, + acpi_unbind_memblk); } static int acpi_memory_enable_device(struct acpi_memory_device *mem_device) diff --git a/drivers/base/node.c b/drivers/base/node.c index 27391f1e8f60..75b7e6f6535b 100644 --- a/drivers/base/node.c +++ b/drivers/base/node.c @@ -834,8 +834,9 @@ void 
unregister_memory_block_under_nodes(struct memory_block *mem_blk) int link_mem_sections(int nid, unsigned long start_pfn, unsigned long end_pfn) { - return walk_memory_range(start_pfn, end_pfn, (void *)&nid, - register_mem_sect_under_node); + return walk_memory_blocks(PFN_PHYS(start_pfn), + PFN_PHYS(end_pfn - start_pfn), (void *)&nid, + register_mem_sect_under_node); } #ifdef CONFIG_HUGETLBFS diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h index 79e0add6a597..d9fffc34949f 100644 --- a/include/linux/memory_hotplug.h +++ b/include/linux/memory_hotplug.h @@ -340,7 +340,7 @@ static inline void __remove_memory(int nid, u64 start, u64 size) {} #endif /* CONFIG_MEMORY_HOTREMOVE */ extern void __ref free_area_init_core_hotplug(int nid); -extern int walk_memory_range(unsigned long start_pfn, unsigned long end_pfn, +extern int walk_memory_blocks(unsigned long start, unsigned long size, void *arg, int (*func)(struct memory_block *, void *)); extern int __add_memory(int nid, u64 start, u64 size); extern int add_memory(int nid, u64 start, u64 size); diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index d1d0ceaaca88..b3ef84e408fa 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -1124,8 +1124,7 @@ int __ref add_memory_resource(int nid, struct resource *res) /* online pages if requested */ if (memhp_auto_online) - walk_memory_range(PFN_DOWN(start), PFN_UP(start + size - 1), - NULL, online_memory_block); + walk_memory_blocks(start, size, NULL, online_memory_block); return ret; error: @@ -1663,20 +1662,24 @@ int offline_pages(unsigned long start_pfn, unsigned long nr_pages) #endif /* CONFIG_MEMORY_HOTREMOVE */ /** - * walk_memory_range - walks through all mem sections in [start_pfn, end_pfn) - * @start_pfn: start pfn of the memory range - * @end_pfn: end pfn of the memory range - * @arg: argument passed to func - * @func: callback for each memory section walked + * walk_memory_blocks - walk through all present memory blocks overlapped 
+ * by the range [start, start + size) * - * This function walks through all present mem sections in range - * [start_pfn, end_pfn) and call func on each mem section. + * @start: start address of the memory range + * @size: size of the memory range + * @arg: argument passed to func + * @func: callback for each memory block walked + * + * This function walks through all present memory blocks overlapped by the + * range [start, start + size), calling func on each memory block. * * Returns the return value of func. */ -int walk_memory_range(unsigned long start_pfn, unsigned long end_pfn, +int walk_memory_blocks(unsigned long start, unsigned long size, void *arg, int (*func)(struct memory_block *, void *)) { + const unsigned long start_pfn = PFN_DOWN(start); + const unsigned long end_pfn = PFN_UP(start + size - 1); struct memory_block *mem = NULL; struct mem_section *section; unsigned long pfn, section_nr; @@ -1822,8 +1825,7 @@ static int __ref try_remove_memory(int nid, u64 start, u64 size) * whether all memory blocks in question are offline and return error * if this is not the case. */ - rc = walk_memory_range(PFN_DOWN(start), PFN_UP(start + size - 1), NULL, - check_memblock_offlined_cb); + rc = walk_memory_blocks(start, size, NULL, check_memblock_offlined_cb); if (rc) goto done; From ea8846411ad686ff626e00bb2c3821b3db2ab56a Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Thu, 18 Jul 2019 15:57:50 -0700 Subject: [PATCH 21/38] mm/memory_hotplug: move and simplify walk_memory_blocks() Let's move walk_memory_blocks() to the place where memory block logic resides and simplify it. While at it, add a type for the callback function. Link: http://lkml.kernel.org/r/20190614100114.311-6-david@redhat.com Signed-off-by: David Hildenbrand Reviewed-by: Andrew Morton Cc: Greg Kroah-Hartman Cc: "Rafael J. 
Wysocki" Cc: David Hildenbrand Cc: Stephen Rothwell Cc: Pavel Tatashin Cc: Andrew Banman Cc: Mike Travis Cc: Oscar Salvador Cc: Michal Hocko Cc: Wei Yang Cc: Arun KS Cc: Qian Cai Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/base/memory.c | 42 ++++++++++++++++++++++++++ include/linux/memory.h | 3 ++ include/linux/memory_hotplug.h | 2 -- mm/memory_hotplug.c | 55 ---------------------------------- 4 files changed, 45 insertions(+), 57 deletions(-) diff --git a/drivers/base/memory.c b/drivers/base/memory.c index c54e80fd25a8..0204384b4d1d 100644 --- a/drivers/base/memory.c +++ b/drivers/base/memory.c @@ -44,6 +44,11 @@ static inline unsigned long pfn_to_block_id(unsigned long pfn) return base_memory_block_id(pfn_to_section_nr(pfn)); } +static inline unsigned long phys_to_block_id(unsigned long phys) +{ + return pfn_to_block_id(PFN_DOWN(phys)); +} + static int memory_subsys_online(struct device *dev); static int memory_subsys_offline(struct device *dev); @@ -851,3 +856,40 @@ out: printk(KERN_ERR "%s() failed: %d\n", __func__, ret); return ret; } + +/** + * walk_memory_blocks - walk through all present memory blocks overlapped + * by the range [start, start + size) + * + * @start: start address of the memory range + * @size: size of the memory range + * @arg: argument passed to func + * @func: callback for each memory section walked + * + * This function walks through all present memory blocks overlapped by the + * range [start, start + size), calling func on each memory block. + * + * In case func() returns an error, walking is aborted and the error is + * returned. 
+ */ +int walk_memory_blocks(unsigned long start, unsigned long size, + void *arg, walk_memory_blocks_func_t func) +{ + const unsigned long start_block_id = phys_to_block_id(start); + const unsigned long end_block_id = phys_to_block_id(start + size - 1); + struct memory_block *mem; + unsigned long block_id; + int ret = 0; + + for (block_id = start_block_id; block_id <= end_block_id; block_id++) { + mem = find_memory_block_by_id(block_id, NULL); + if (!mem) + continue; + + ret = func(mem, arg); + put_device(&mem->dev); + if (ret) + break; + } + return ret; +} diff --git a/include/linux/memory.h b/include/linux/memory.h index f26a5417ec5d..b3b388775a30 100644 --- a/include/linux/memory.h +++ b/include/linux/memory.h @@ -119,6 +119,9 @@ extern int memory_isolate_notify(unsigned long val, void *v); extern struct memory_block *find_memory_block_hinted(struct mem_section *, struct memory_block *); extern struct memory_block *find_memory_block(struct mem_section *); +typedef int (*walk_memory_blocks_func_t)(struct memory_block *, void *); +extern int walk_memory_blocks(unsigned long start, unsigned long size, + void *arg, walk_memory_blocks_func_t func); #define CONFIG_MEM_BLOCK_SIZE (PAGES_PER_SECTION<= mem->start_section_nr) && - (section_nr <= mem->end_section_nr)) - continue; - - mem = find_memory_block_hinted(section, mem); - if (!mem) - continue; - - ret = func(mem, arg); - if (ret) { - kobject_put(&mem->dev.kobj); - return ret; - } - } - - if (mem) - kobject_put(&mem->dev.kobj); - - return 0; -} - -#ifdef CONFIG_MEMORY_HOTREMOVE static int check_memblock_offlined_cb(struct memory_block *mem, void *arg) { int ret = !is_memblock_offlined(mem); From dd625285910d3cff535fa76355e49949513918a4 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Thu, 18 Jul 2019 15:57:53 -0700 Subject: [PATCH 22/38] drivers/base/memory.c: get rid of find_memory_block_hinted() No longer needed, let's remove it. 
Also, drop the "hint" parameter completely from "find_memory_block_by_id", as nobody needs it anymore. [david@redhat.com: v3] Link: http://lkml.kernel.org/r/20190620183139.4352-7-david@redhat.com [david@redhat.com: handle zero-length walks] Link: http://lkml.kernel.org/r/1c2edc22-afd7-2211-c4c7-40e54e5007e8@redhat.com Link: http://lkml.kernel.org/r/20190614100114.311-7-david@redhat.com Signed-off-by: David Hildenbrand Reviewed-by: Andrew Morton Tested-by: Qian Cai Cc: Greg Kroah-Hartman Cc: "Rafael J. Wysocki" Cc: David Hildenbrand Cc: Stephen Rothwell Cc: Pavel Tatashin Cc: Andrew Banman Cc: Mike Travis Cc: Oscar Salvador Cc: Michal Hocko Cc: Wei Yang Cc: Arun KS Cc: Stephen Rothwell Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/base/memory.c | 40 ++++++++++++++-------------------------- include/linux/memory.h | 2 -- 2 files changed, 14 insertions(+), 28 deletions(-) diff --git a/drivers/base/memory.c b/drivers/base/memory.c index 0204384b4d1d..20c39d1bcef8 100644 --- a/drivers/base/memory.c +++ b/drivers/base/memory.c @@ -588,30 +588,13 @@ int __weak arch_get_memory_phys_device(unsigned long start_pfn) return 0; } -/* - * A reference for the returned object is held and the reference for the - * hinted object is released. - */ -static struct memory_block *find_memory_block_by_id(unsigned long block_id, - struct memory_block *hint) +/* A reference for the returned memory block device is acquired. */ +static struct memory_block *find_memory_block_by_id(unsigned long block_id) { - struct device *hintdev = hint ? 
&hint->dev : NULL; struct device *dev; - dev = subsys_find_device_by_id(&memory_subsys, block_id, hintdev); - if (hint) - put_device(&hint->dev); - if (!dev) - return NULL; - return to_memory_block(dev); -} - -struct memory_block *find_memory_block_hinted(struct mem_section *section, - struct memory_block *hint) -{ - unsigned long block_id = base_memory_block_id(__section_nr(section)); - - return find_memory_block_by_id(block_id, hint); + dev = subsys_find_device_by_id(&memory_subsys, block_id, NULL); + return dev ? to_memory_block(dev) : NULL; } /* @@ -624,7 +607,9 @@ struct memory_block *find_memory_block_hinted(struct mem_section *section, */ struct memory_block *find_memory_block(struct mem_section *section) { - return find_memory_block_hinted(section, NULL); + unsigned long block_id = base_memory_block_id(__section_nr(section)); + + return find_memory_block_by_id(block_id); } static struct attribute *memory_memblk_attrs[] = { @@ -675,7 +660,7 @@ static int init_memory_block(struct memory_block **memory, unsigned long start_pfn; int ret = 0; - mem = find_memory_block_by_id(block_id, NULL); + mem = find_memory_block_by_id(block_id); if (mem) { put_device(&mem->dev); return -EEXIST; @@ -755,7 +740,7 @@ int create_memory_block_devices(unsigned long start, unsigned long size) end_block_id = block_id; for (block_id = start_block_id; block_id != end_block_id; block_id++) { - mem = find_memory_block_by_id(block_id, NULL); + mem = find_memory_block_by_id(block_id); mem->section_count = 0; unregister_memory(mem); } @@ -782,7 +767,7 @@ void remove_memory_block_devices(unsigned long start, unsigned long size) mutex_lock(&mem_sysfs_mutex); for (block_id = start_block_id; block_id != end_block_id; block_id++) { - mem = find_memory_block_by_id(block_id, NULL); + mem = find_memory_block_by_id(block_id); if (WARN_ON_ONCE(!mem)) continue; mem->section_count = 0; @@ -881,8 +866,11 @@ int walk_memory_blocks(unsigned long start, unsigned long size, unsigned long block_id; int ret 
= 0; + if (!size) + return 0; + for (block_id = start_block_id; block_id <= end_block_id; block_id++) { - mem = find_memory_block_by_id(block_id, NULL); + mem = find_memory_block_by_id(block_id); if (!mem) continue; diff --git a/include/linux/memory.h b/include/linux/memory.h index b3b388775a30..02e633f3ede0 100644 --- a/include/linux/memory.h +++ b/include/linux/memory.h @@ -116,8 +116,6 @@ void remove_memory_block_devices(unsigned long start, unsigned long size); extern int memory_dev_init(void); extern int memory_notify(unsigned long val, void *v); extern int memory_isolate_notify(unsigned long val, void *v); -extern struct memory_block *find_memory_block_hinted(struct mem_section *, - struct memory_block *); extern struct memory_block *find_memory_block(struct mem_section *); typedef int (*walk_memory_blocks_func_t)(struct memory_block *, void *); extern int walk_memory_blocks(unsigned long start, unsigned long size, From f1eca35a0dc7cb3cdb00c88c8c5e5138a65face0 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Thu, 18 Jul 2019 15:57:57 -0700 Subject: [PATCH 23/38] mm/sparsemem: introduce struct mem_section_usage MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Patch series "mm: Sub-section memory hotplug support", v10. The memory hotplug section is an arbitrary / convenient unit for memory hotplug. 'Section-size' units have bled into the user interface ('memblock' sysfs) and can not be changed without breaking existing userspace. The section-size constraint, while mostly benign for typical memory hotplug, has and continues to wreak havoc with 'device-memory' use cases, persistent memory (pmem) in particular. Recall that pmem uses devm_memremap_pages(), and subsequently arch_add_memory(), to allocate a 'struct page' memmap for pmem. However, it does not use the 'bottom half' of memory hotplug, i.e. never marks pmem pages online and never exposes the userspace memblock interface for pmem. 
This leaves an opening to redress the section-size constraint. To date, the libnvdimm subsystem has attempted to inject padding to satisfy the internal constraints of arch_add_memory(). Beyond complicating the code, leading to bugs [2], wasting memory, and limiting configuration flexibility, the padding hack is broken when the platform changes the physical memory alignment of pmem from one boot to the next. Device failure (intermittent or permanent) and physical reconfiguration are events that can cause the platform firmware to change the physical placement of pmem on a subsequent boot, and device failure is an everyday event in a data-center. It turns out that sections are only a hard requirement of the user-facing interface for memory hotplug, and with a bit more infrastructure, sub-section arch_add_memory() support can be added for kernel internal usages like devm_memremap_pages(). Here is an analysis of the design assumptions in the current code and how they are addressed in the new implementation: Current design assumptions: - Sections that describe boot memory (early sections) are never unplugged / removed. - pfn_valid(), in the CONFIG_SPARSEMEM_VMEMMAP=y case, devolves to a valid_section() check. - __add_pages() and helper routines assume all operations occur in PAGES_PER_SECTION units. - The memblock sysfs interface only comprehends full sections. New design assumptions: - Sections are instrumented with a sub-section bitmask to track (on x86) individual 2MB sub-divisions of a 128MB section. - Partially populated early sections can be extended with additional sub-sections, and those sub-sections can be removed with arch_remove_memory(). With this in place we no longer lose usable memory capacity to padding. - pfn_valid() is updated to look deeper than valid_section() to also check the active-sub-section mask. This indication is in the same cacheline as the valid_section() so the performance impact is expected to be negligible.
So far the lkp robot has not reported any regressions. - Outside of the core vmemmap population routines which are replaced, other helper routines like shrink_{zone,pgdat}_span() are updated to handle the smaller granularity. Core memory hotplug routines that deal with online memory are not touched. - The existing memblock sysfs user api guarantees / assumptions are not touched since this capability is limited to !online !memblock-sysfs-accessible sections. Meanwhile the issue reports continue to roll in from users that do not understand when and how the 128MB constraint will bite them. The current implementation relied on being able to support at least one misaligned namespace, but that immediately falls over on any moderately complex namespace creation attempt. Beyond the initial problem of 'System RAM' colliding with pmem, and the unsolvable problem of physical alignment changes, Linux is now being exposed to platforms that collide pmem ranges with other pmem ranges by default [3]. In short, devm_memremap_pages() has pushed the venerable section-size constraint past the breaking point, and the simplicity of section-aligned arch_add_memory() is no longer tenable. These patches are exposed to the kbuild robot on a subsection-v10 branch [4], and a preview of the unit test for this functionality is available on the 'subsection-pending' branch of ndctl [5]. [2]: https://lore.kernel.org/r/155000671719.348031.2347363160141119237.stgit@dwillia2-desk3.amr.corp.intel.com [3]: https://github.com/pmem/ndctl/issues/76 [4]: https://git.kernel.org/pub/scm/linux/kernel/git/djbw/nvdimm.git/log/?h=subsection-v10 [5]: https://github.com/pmem/ndctl/commit/7c59b4867e1c This patch (of 13): Towards enabling memory hotplug to track partial population of a section, introduce 'struct mem_section_usage'. A pointer to a 'struct mem_section_usage' instance replaces the existing pointer to a 'pageblock_flags' bitmap. 
Effectively it adds one more 'unsigned long' beyond the 'pageblock_flags' (usemap) allocation to house a new 'subsection_map' bitmap. The new bitmap enables the memory hot{plug,remove} implementation to act on incremental sub-divisions of a section. SUBSECTION_SHIFT is defined as global constant instead of per-architecture value like SECTION_SIZE_BITS in order to allow cross-arch compatibility of subsection users. Specifically a common subsection size allows for the possibility that persistent memory namespace configurations be made compatible across architectures. The primary motivation for this functionality is to support platforms that mix "System RAM" and "Persistent Memory" within a single section, or multiple PMEM ranges with different mapping lifetimes within a single section. The section restriction for hotplug has caused an ongoing saga of hacks and bugs for devm_memremap_pages() users. Beyond the fixups to teach existing paths how to retrieve the 'usemap' from a section, and updates to usemap allocation path, there are no expected behavior changes. 
Link: http://lkml.kernel.org/r/156092349845.979959.73333291612799019.stgit@dwillia2-desk3.amr.corp.intel.com Signed-off-by: Dan Williams Reviewed-by: Oscar Salvador Reviewed-by: Wei Yang Tested-by: Aneesh Kumar K.V [ppc64] Cc: Michal Hocko Cc: Vlastimil Babka Cc: Logan Gunthorpe Cc: Pavel Tatashin Cc: David Hildenbrand Cc: Jérôme Glisse Cc: Mike Rapoport Cc: Jane Chu Cc: Pavel Tatashin Cc: Jonathan Corbet Cc: Qian Cai Cc: Logan Gunthorpe Cc: Toshi Kani Cc: Jeff Moyer Cc: Michal Hocko Cc: Vlastimil Babka Cc: Jason Gunthorpe Cc: Christoph Hellwig Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mmzone.h | 28 +++++++++++++-- mm/memory_hotplug.c | 18 +++++----- mm/page_alloc.c | 2 +- mm/sparse.c | 81 +++++++++++++++++++++--------------------- 4 files changed, 76 insertions(+), 53 deletions(-) diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 298d1c3e4c2e..2520336bdfd1 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -1160,6 +1160,24 @@ static inline unsigned long section_nr_to_pfn(unsigned long sec) #define SECTION_ALIGN_UP(pfn) (((pfn) + PAGES_PER_SECTION - 1) & PAGE_SECTION_MASK) #define SECTION_ALIGN_DOWN(pfn) ((pfn) & PAGE_SECTION_MASK) +#define SUBSECTION_SHIFT 21 + +#define PFN_SUBSECTION_SHIFT (SUBSECTION_SHIFT - PAGE_SHIFT) +#define PAGES_PER_SUBSECTION (1UL << PFN_SUBSECTION_SHIFT) +#define PAGE_SUBSECTION_MASK (~(PAGES_PER_SUBSECTION-1)) + +#if SUBSECTION_SHIFT > SECTION_SIZE_BITS +#error Subsection size exceeds section size +#else +#define SUBSECTIONS_PER_SECTION (1UL << (SECTION_SIZE_BITS - SUBSECTION_SHIFT)) +#endif + +struct mem_section_usage { + DECLARE_BITMAP(subsection_map, SUBSECTIONS_PER_SECTION); + /* See declaration of similar field in struct zone */ + unsigned long pageblock_flags[0]; +}; + struct page; struct page_ext; struct mem_section { @@ -1177,8 +1195,7 @@ struct mem_section { */ unsigned long section_mem_map; - /* See declaration of similar field in struct zone */ - unsigned 
long *pageblock_flags; + struct mem_section_usage *usage; #ifdef CONFIG_PAGE_EXTENSION /* * If SPARSEMEM, pgdat doesn't have page_ext pointer. We use @@ -1209,6 +1226,11 @@ extern struct mem_section **mem_section; extern struct mem_section mem_section[NR_SECTION_ROOTS][SECTIONS_PER_ROOT]; #endif +static inline unsigned long *section_to_usemap(struct mem_section *ms) +{ + return ms->usage->pageblock_flags; +} + static inline struct mem_section *__nr_to_section(unsigned long nr) { #ifdef CONFIG_SPARSEMEM_EXTREME @@ -1220,7 +1242,7 @@ static inline struct mem_section *__nr_to_section(unsigned long nr) return &mem_section[SECTION_NR_TO_ROOT(nr)][nr & SECTION_ROOT_MASK]; } extern unsigned long __section_nr(struct mem_section *ms); -extern unsigned long usemap_size(void); +extern size_t mem_section_usage_size(void); /* * We use the lower bits of the mem_map pointer to store diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index fafee5f13ef2..cf9d979a6498 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -166,9 +166,10 @@ void put_page_bootmem(struct page *page) #ifndef CONFIG_SPARSEMEM_VMEMMAP static void register_page_bootmem_info_section(unsigned long start_pfn) { - unsigned long *usemap, mapsize, section_nr, i; + unsigned long mapsize, section_nr, i; struct mem_section *ms; struct page *page, *memmap; + struct mem_section_usage *usage; section_nr = pfn_to_section_nr(start_pfn); ms = __nr_to_section(section_nr); @@ -188,10 +189,10 @@ static void register_page_bootmem_info_section(unsigned long start_pfn) for (i = 0; i < mapsize; i++, page++) get_page_bootmem(section_nr, page, SECTION_INFO); - usemap = ms->pageblock_flags; - page = virt_to_page(usemap); + usage = ms->usage; + page = virt_to_page(usage); - mapsize = PAGE_ALIGN(usemap_size()) >> PAGE_SHIFT; + mapsize = PAGE_ALIGN(mem_section_usage_size()) >> PAGE_SHIFT; for (i = 0; i < mapsize; i++, page++) get_page_bootmem(section_nr, page, MIX_SECTION_INFO); @@ -200,9 +201,10 @@ static void 
register_page_bootmem_info_section(unsigned long start_pfn) #else /* CONFIG_SPARSEMEM_VMEMMAP */ static void register_page_bootmem_info_section(unsigned long start_pfn) { - unsigned long *usemap, mapsize, section_nr, i; + unsigned long mapsize, section_nr, i; struct mem_section *ms; struct page *page, *memmap; + struct mem_section_usage *usage; section_nr = pfn_to_section_nr(start_pfn); ms = __nr_to_section(section_nr); @@ -211,10 +213,10 @@ static void register_page_bootmem_info_section(unsigned long start_pfn) register_page_bootmem_memmap(section_nr, memmap, PAGES_PER_SECTION); - usemap = ms->pageblock_flags; - page = virt_to_page(usemap); + usage = ms->usage; + page = virt_to_page(usage); - mapsize = PAGE_ALIGN(usemap_size()) >> PAGE_SHIFT; + mapsize = PAGE_ALIGN(mem_section_usage_size()) >> PAGE_SHIFT; for (i = 0; i < mapsize; i++, page++) get_page_bootmem(section_nr, page, MIX_SECTION_INFO); diff --git a/mm/page_alloc.c b/mm/page_alloc.c index e515bfcf7f28..be78bafbfe3a 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -450,7 +450,7 @@ static inline unsigned long *get_pageblock_bitmap(struct page *page, unsigned long pfn) { #ifdef CONFIG_SPARSEMEM - return __pfn_to_section(pfn)->pageblock_flags; + return section_to_usemap(__pfn_to_section(pfn)); #else return page_zone(page)->pageblock_flags; #endif /* CONFIG_SPARSEMEM */ diff --git a/mm/sparse.c b/mm/sparse.c index b29534cea8c0..41bef8e1f65c 100644 --- a/mm/sparse.c +++ b/mm/sparse.c @@ -288,33 +288,31 @@ struct page *sparse_decode_mem_map(unsigned long coded_mem_map, unsigned long pn static void __meminit sparse_init_one_section(struct mem_section *ms, unsigned long pnum, struct page *mem_map, - unsigned long *pageblock_bitmap) + struct mem_section_usage *usage) { ms->section_mem_map &= ~SECTION_MAP_MASK; ms->section_mem_map |= sparse_encode_mem_map(mem_map, pnum) | SECTION_HAS_MEM_MAP; - ms->pageblock_flags = pageblock_bitmap; + ms->usage = usage; } -unsigned long usemap_size(void) +static unsigned long 
usemap_size(void) { return BITS_TO_LONGS(SECTION_BLOCKFLAGS_BITS) * sizeof(unsigned long); } -#ifdef CONFIG_MEMORY_HOTPLUG -static unsigned long *__kmalloc_section_usemap(void) +size_t mem_section_usage_size(void) { - return kmalloc(usemap_size(), GFP_KERNEL); + return sizeof(struct mem_section_usage) + usemap_size(); } -#endif /* CONFIG_MEMORY_HOTPLUG */ #ifdef CONFIG_MEMORY_HOTREMOVE -static unsigned long * __init +static struct mem_section_usage * __init sparse_early_usemaps_alloc_pgdat_section(struct pglist_data *pgdat, unsigned long size) { + struct mem_section_usage *usage; unsigned long goal, limit; - unsigned long *p; int nid; /* * A page may contain usemaps for other sections preventing the @@ -330,15 +328,16 @@ sparse_early_usemaps_alloc_pgdat_section(struct pglist_data *pgdat, limit = goal + (1UL << PA_SECTION_SHIFT); nid = early_pfn_to_nid(goal >> PAGE_SHIFT); again: - p = memblock_alloc_try_nid(size, SMP_CACHE_BYTES, goal, limit, nid); - if (!p && limit) { + usage = memblock_alloc_try_nid(size, SMP_CACHE_BYTES, goal, limit, nid); + if (!usage && limit) { limit = 0; goto again; } - return p; + return usage; } -static void __init check_usemap_section_nr(int nid, unsigned long *usemap) +static void __init check_usemap_section_nr(int nid, + struct mem_section_usage *usage) { unsigned long usemap_snr, pgdat_snr; static unsigned long old_usemap_snr; @@ -352,7 +351,7 @@ static void __init check_usemap_section_nr(int nid, unsigned long *usemap) old_pgdat_snr = NR_MEM_SECTIONS; } - usemap_snr = pfn_to_section_nr(__pa(usemap) >> PAGE_SHIFT); + usemap_snr = pfn_to_section_nr(__pa(usage) >> PAGE_SHIFT); pgdat_snr = pfn_to_section_nr(__pa(pgdat) >> PAGE_SHIFT); if (usemap_snr == pgdat_snr) return; @@ -380,14 +379,15 @@ static void __init check_usemap_section_nr(int nid, unsigned long *usemap) usemap_snr, pgdat_snr, nid); } #else -static unsigned long * __init +static struct mem_section_usage * __init sparse_early_usemaps_alloc_pgdat_section(struct pglist_data 
*pgdat, unsigned long size) { return memblock_alloc_node(size, SMP_CACHE_BYTES, pgdat->node_id); } -static void __init check_usemap_section_nr(int nid, unsigned long *usemap) +static void __init check_usemap_section_nr(int nid, + struct mem_section_usage *usage) { } #endif /* CONFIG_MEMORY_HOTREMOVE */ @@ -474,14 +474,13 @@ static void __init sparse_init_nid(int nid, unsigned long pnum_begin, unsigned long pnum_end, unsigned long map_count) { - unsigned long pnum, usemap_longs, *usemap; + struct mem_section_usage *usage; + unsigned long pnum; struct page *map; - usemap_longs = BITS_TO_LONGS(SECTION_BLOCKFLAGS_BITS); - usemap = sparse_early_usemaps_alloc_pgdat_section(NODE_DATA(nid), - usemap_size() * - map_count); - if (!usemap) { + usage = sparse_early_usemaps_alloc_pgdat_section(NODE_DATA(nid), + mem_section_usage_size() * map_count); + if (!usage) { pr_err("%s: node[%d] usemap allocation failed", __func__, nid); goto failed; } @@ -497,9 +496,9 @@ static void __init sparse_init_nid(int nid, unsigned long pnum_begin, pnum_begin = pnum; goto failed; } - check_usemap_section_nr(nid, usemap); - sparse_init_one_section(__nr_to_section(pnum), pnum, map, usemap); - usemap += usemap_longs; + check_usemap_section_nr(nid, usage); + sparse_init_one_section(__nr_to_section(pnum), pnum, map, usage); + usage = (void *) usage + mem_section_usage_size(); } sparse_buffer_fini(); return; @@ -697,9 +696,9 @@ int __meminit sparse_add_one_section(int nid, unsigned long start_pfn, struct vmem_altmap *altmap) { unsigned long section_nr = pfn_to_section_nr(start_pfn); + struct mem_section_usage *usage; struct mem_section *ms; struct page *memmap; - unsigned long *usemap; int ret; /* @@ -713,8 +712,8 @@ int __meminit sparse_add_one_section(int nid, unsigned long start_pfn, memmap = kmalloc_section_memmap(section_nr, nid, altmap); if (!memmap) return -ENOMEM; - usemap = __kmalloc_section_usemap(); - if (!usemap) { + usage = kzalloc(mem_section_usage_size(), GFP_KERNEL); + if (!usage) { 
__kfree_section_memmap(memmap, altmap); return -ENOMEM; } @@ -733,11 +732,11 @@ int __meminit sparse_add_one_section(int nid, unsigned long start_pfn, set_section_nid(section_nr, nid); section_mark_present(ms); - sparse_init_one_section(ms, section_nr, memmap, usemap); + sparse_init_one_section(ms, section_nr, memmap, usage); out: if (ret < 0) { - kfree(usemap); + kfree(usage); __kfree_section_memmap(memmap, altmap); } return ret; @@ -773,20 +772,20 @@ static inline void clear_hwpoisoned_pages(struct page *memmap, int nr_pages) } #endif -static void free_section_usemap(struct page *memmap, unsigned long *usemap, - struct vmem_altmap *altmap) +static void free_section_usage(struct page *memmap, + struct mem_section_usage *usage, struct vmem_altmap *altmap) { - struct page *usemap_page; + struct page *usage_page; - if (!usemap) + if (!usage) return; - usemap_page = virt_to_page(usemap); + usage_page = virt_to_page(usage); /* * Check to see if allocation came from hot-plug-add */ - if (PageSlab(usemap_page) || PageCompound(usemap_page)) { - kfree(usemap); + if (PageSlab(usage_page) || PageCompound(usage_page)) { + kfree(usage); if (memmap) __kfree_section_memmap(memmap, altmap); return; @@ -805,18 +804,18 @@ void sparse_remove_one_section(struct mem_section *ms, unsigned long map_offset, struct vmem_altmap *altmap) { struct page *memmap = NULL; - unsigned long *usemap = NULL; + struct mem_section_usage *usage = NULL; if (ms->section_mem_map) { - usemap = ms->pageblock_flags; + usage = ms->usage; memmap = sparse_decode_mem_map(ms->section_mem_map, __section_nr(ms)); ms->section_mem_map = 0; - ms->pageblock_flags = NULL; + ms->usage = NULL; } clear_hwpoisoned_pages(memmap + map_offset, PAGES_PER_SECTION - map_offset); - free_section_usemap(memmap, usemap, altmap); + free_section_usage(memmap, usage, altmap); } #endif /* CONFIG_MEMORY_HOTPLUG */ From 326e1b8f83a4318b09033ef754f40c785aed5e68 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Thu, 18 Jul 2019 15:58:00 -0700 
Subject: [PATCH 24/38] mm/sparsemem: introduce a SECTION_IS_EARLY flag MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In preparation for sub-section hotplug, track whether a given section was created during early memory initialization, or later via memory hotplug. This distinction is needed to maintain the coarse expectation that pfn_valid() returns true for any pfn within a given section even if that section has pages that are reserved from the page allocator. For example one of the goals of subsection hotplug is to support cases where the system physical memory layout collides System RAM and PMEM within a section. Several pfn_valid() users expect to just check if a section is valid, but they are not careful to check if the given pfn is within a "System RAM" boundary and instead expect pgdat information to further validate the pfn. Rather than unwind those paths to make their pfn_valid() queries more precise a follow on patch uses the SECTION_IS_EARLY flag to maintain the traditional expectation that pfn_valid() returns true for all early sections. 
Link: https://lore.kernel.org/lkml/1560366952-10660-1-git-send-email-cai@lca.pw/ Link: http://lkml.kernel.org/r/156092350358.979959.5817209875548072819.stgit@dwillia2-desk3.amr.corp.intel.com Signed-off-by: Dan Williams Reported-by: Qian Cai Tested-by: Aneesh Kumar K.V [ppc64] Reviewed-by: Oscar Salvador Cc: Michal Hocko Cc: Logan Gunthorpe Cc: David Hildenbrand Cc: Pavel Tatashin Cc: Jane Chu Cc: Jeff Moyer Cc: Jérôme Glisse Cc: Jonathan Corbet Cc: Mike Rapoport Cc: Toshi Kani Cc: Vlastimil Babka Cc: Wei Yang Cc: Jason Gunthorpe Cc: Christoph Hellwig Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mmzone.h | 8 +++++++- mm/sparse.c | 20 +++++++++----------- 2 files changed, 16 insertions(+), 12 deletions(-) diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 2520336bdfd1..4be40634238b 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -1260,7 +1260,8 @@ extern size_t mem_section_usage_size(void); #define SECTION_MARKED_PRESENT (1UL<<0) #define SECTION_HAS_MEM_MAP (1UL<<1) #define SECTION_IS_ONLINE (1UL<<2) -#define SECTION_MAP_LAST_BIT (1UL<<3) +#define SECTION_IS_EARLY (1UL<<3) +#define SECTION_MAP_LAST_BIT (1UL<<4) #define SECTION_MAP_MASK (~(SECTION_MAP_LAST_BIT-1)) #define SECTION_NID_SHIFT 3 @@ -1286,6 +1287,11 @@ static inline int valid_section(struct mem_section *section) return (section && (section->section_mem_map & SECTION_HAS_MEM_MAP)); } +static inline int early_section(struct mem_section *section) +{ + return (section && (section->section_mem_map & SECTION_IS_EARLY)); +} + static inline int valid_section_nr(unsigned long nr) { return valid_section(__nr_to_section(nr)); diff --git a/mm/sparse.c b/mm/sparse.c index 41bef8e1f65c..6d23a526279a 100644 --- a/mm/sparse.c +++ b/mm/sparse.c @@ -288,11 +288,11 @@ struct page *sparse_decode_mem_map(unsigned long coded_mem_map, unsigned long pn static void __meminit sparse_init_one_section(struct mem_section *ms, unsigned long pnum, struct page *mem_map, 
- struct mem_section_usage *usage) + struct mem_section_usage *usage, unsigned long flags) { ms->section_mem_map &= ~SECTION_MAP_MASK; - ms->section_mem_map |= sparse_encode_mem_map(mem_map, pnum) | - SECTION_HAS_MEM_MAP; + ms->section_mem_map |= sparse_encode_mem_map(mem_map, pnum) + | SECTION_HAS_MEM_MAP | flags; ms->usage = usage; } @@ -497,7 +497,8 @@ static void __init sparse_init_nid(int nid, unsigned long pnum_begin, goto failed; } check_usemap_section_nr(nid, usage); - sparse_init_one_section(__nr_to_section(pnum), pnum, map, usage); + sparse_init_one_section(__nr_to_section(pnum), pnum, map, usage, + SECTION_IS_EARLY); usage = (void *) usage + mem_section_usage_size(); } sparse_buffer_fini(); @@ -732,7 +733,7 @@ int __meminit sparse_add_one_section(int nid, unsigned long start_pfn, set_section_nid(section_nr, nid); section_mark_present(ms); - sparse_init_one_section(ms, section_nr, memmap, usage); + sparse_init_one_section(ms, section_nr, memmap, usage, 0); out: if (ret < 0) { @@ -772,19 +773,16 @@ static inline void clear_hwpoisoned_pages(struct page *memmap, int nr_pages) } #endif -static void free_section_usage(struct page *memmap, +static void free_section_usage(struct mem_section *ms, struct page *memmap, struct mem_section_usage *usage, struct vmem_altmap *altmap) { - struct page *usage_page; - if (!usage) return; - usage_page = virt_to_page(usage); /* * Check to see if allocation came from hot-plug-add */ - if (PageSlab(usage_page) || PageCompound(usage_page)) { + if (!early_section(ms)) { kfree(usage); if (memmap) __kfree_section_memmap(memmap, altmap); @@ -816,6 +814,6 @@ void sparse_remove_one_section(struct mem_section *ms, unsigned long map_offset, clear_hwpoisoned_pages(memmap + map_offset, PAGES_PER_SECTION - map_offset); - free_section_usage(memmap, usage, altmap); + free_section_usage(ms, memmap, usage, altmap); } #endif /* CONFIG_MEMORY_HOTPLUG */ From f46edbd1b1516da1fb34c917775168d5df576f78 Mon Sep 17 00:00:00 2001 From: Dan Williams 
Date: Thu, 18 Jul 2019 15:58:04 -0700 Subject: [PATCH 25/38] mm/sparsemem: add helpers track active portions of a section at boot MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Prepare for hot{plug,remove} of sub-ranges of a section by tracking a sub-section active bitmask, each bit representing a PMD_SIZE span of the architecture's memory hotplug section size. The implication of a partially populated section is that pfn_valid() needs to go beyond a valid_section() check and either determine that the section is an "early section", or read the sub-section active ranges from the bitmask. The expectation is that the bitmask (subsection_map) fits in the same cacheline as the valid_section() / early_section() data, so the incremental performance overhead to pfn_valid() should be negligible. The rationale for using early_section() to short-circuit the subsection_map check is that there are legacy code paths that use pfn_valid() at section granularity before validating the pfn against pgdat data. So, the early_section() check allows those traditional assumptions to persist while also permitting subsection_map to tell the truth for purposes of populating the unused portions of early sections with PMEM and other ZONE_DEVICE mappings. 
Link: http://lkml.kernel.org/r/156092350874.979959.18185938451405518285.stgit@dwillia2-desk3.amr.corp.intel.com Signed-off-by: Dan Williams Reported-by: Qian Cai Tested-by: Jane Chu Tested-by: Aneesh Kumar K.V [ppc64] Reviewed-by: Oscar Salvador Cc: Michal Hocko Cc: Vlastimil Babka Cc: Logan Gunthorpe Cc: Pavel Tatashin Cc: David Hildenbrand Cc: Jeff Moyer Cc: Jérôme Glisse Cc: Jonathan Corbet Cc: Mike Rapoport Cc: Toshi Kani Cc: Wei Yang Cc: Jason Gunthorpe Cc: Christoph Hellwig Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mmzone.h | 33 ++++++++++++++++++++++++++++++++- mm/page_alloc.c | 10 ++++++++-- mm/sparse.c | 35 +++++++++++++++++++++++++++++++++++ 3 files changed, 75 insertions(+), 3 deletions(-) diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 4be40634238b..7747ec9de588 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -1178,6 +1178,8 @@ struct mem_section_usage { unsigned long pageblock_flags[0]; }; +void subsection_map_init(unsigned long pfn, unsigned long nr_pages); + struct page; struct page_ext; struct mem_section { @@ -1321,12 +1323,40 @@ static inline struct mem_section *__pfn_to_section(unsigned long pfn) extern unsigned long __highest_present_section_nr; +static inline int subsection_map_index(unsigned long pfn) +{ + return (pfn & ~(PAGE_SECTION_MASK)) / PAGES_PER_SUBSECTION; +} + +#ifdef CONFIG_SPARSEMEM_VMEMMAP +static inline int pfn_section_valid(struct mem_section *ms, unsigned long pfn) +{ + int idx = subsection_map_index(pfn); + + return test_bit(idx, ms->usage->subsection_map); +} +#else +static inline int pfn_section_valid(struct mem_section *ms, unsigned long pfn) +{ + return 1; +} +#endif + #ifndef CONFIG_HAVE_ARCH_PFN_VALID static inline int pfn_valid(unsigned long pfn) { + struct mem_section *ms; + if (pfn_to_section_nr(pfn) >= NR_MEM_SECTIONS) return 0; - return valid_section(__nr_to_section(pfn_to_section_nr(pfn))); + ms = __nr_to_section(pfn_to_section_nr(pfn)); + if 
(!valid_section(ms)) + return 0; + /* + * Traditionally early sections always returned pfn_valid() for + * the entire section-sized span. + */ + return early_section(ms) || pfn_section_valid(ms, pfn); } #endif @@ -1358,6 +1388,7 @@ void sparse_init(void); #define sparse_init() do {} while (0) #define sparse_index_init(_sec, _nid) do {} while (0) #define pfn_present pfn_valid +#define subsection_map_init(_pfn, _nr_pages) do {} while (0) #endif /* CONFIG_SPARSEMEM */ /* diff --git a/mm/page_alloc.c b/mm/page_alloc.c index be78bafbfe3a..c4cdd3954804 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -7351,12 +7351,18 @@ void __init free_area_init_nodes(unsigned long *max_zone_pfn) (u64)zone_movable_pfn[i] << PAGE_SHIFT); } - /* Print out the early node map */ + /* + * Print out the early node map, and initialize the + * subsection-map relative to active online memory ranges to + * enable future "sub-section" extensions of the memory map. + */ pr_info("Early memory node ranges\n"); - for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid) + for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid) { pr_info(" node %3d: [mem %#018Lx-%#018Lx]\n", nid, (u64)start_pfn << PAGE_SHIFT, ((u64)end_pfn << PAGE_SHIFT) - 1); + subsection_map_init(start_pfn, end_pfn - start_pfn); + } /* Initialise every node */ mminit_verify_pageflags_layout(); diff --git a/mm/sparse.c b/mm/sparse.c index 6d23a526279a..26b48ee1a262 100644 --- a/mm/sparse.c +++ b/mm/sparse.c @@ -210,6 +210,41 @@ static inline unsigned long first_present_section_nr(void) return next_present_section_nr(-1); } +void subsection_mask_set(unsigned long *map, unsigned long pfn, + unsigned long nr_pages) +{ + int idx = subsection_map_index(pfn); + int end = subsection_map_index(pfn + nr_pages - 1); + + bitmap_set(map, idx, end - idx + 1); +} + +void __init subsection_map_init(unsigned long pfn, unsigned long nr_pages) +{ + int end_sec = pfn_to_section_nr(pfn + nr_pages - 1); + int i, start_sec = 
pfn_to_section_nr(pfn); + + if (!nr_pages) + return; + + for (i = start_sec; i <= end_sec; i++) { + struct mem_section *ms; + unsigned long pfns; + + pfns = min(nr_pages, PAGES_PER_SECTION + - (pfn & ~PAGE_SECTION_MASK)); + ms = __nr_to_section(i); + subsection_mask_set(ms->usage->subsection_map, pfn, pfns); + + pr_debug("%s: sec: %d pfns: %ld set(%d, %d)\n", __func__, i, + pfns, subsection_map_index(pfn), + subsection_map_index(pfn + pfns - 1)); + + pfn += pfns; + nr_pages -= pfns; + } +} + /* Record a memory area against a node. */ void __init memory_present(int nid, unsigned long start, unsigned long end) { From 49ba3c6b37b38b58251c27864f551908c583e99d Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Thu, 18 Jul 2019 15:58:07 -0700 Subject: [PATCH 26/38] mm/hotplug: prepare shrink_{zone, pgdat}_span for sub-section removal MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Sub-section hotplug support reduces the unit of operation of hotplug from section-sized-units (PAGES_PER_SECTION) to sub-section-sized units (PAGES_PER_SUBSECTION). Teach shrink_{zone,pgdat}_span() to consider PAGES_PER_SUBSECTION boundaries as the points where pfn_valid(), not valid_section(), can toggle. 
[osalvador@suse.de: fix shrink_{zone,node}_span] Link: http://lkml.kernel.org/r/20190717090725.23618-3-osalvador@suse.de Link: http://lkml.kernel.org/r/156092351496.979959.12703722803097017492.stgit@dwillia2-desk3.amr.corp.intel.com Signed-off-by: Dan Williams Signed-off-by: Oscar Salvador Reviewed-by: Pavel Tatashin Reviewed-by: Oscar Salvador Tested-by: Aneesh Kumar K.V [ppc64] Cc: Michal Hocko Cc: Vlastimil Babka Cc: Logan Gunthorpe Cc: David Hildenbrand Cc: Jane Chu Cc: Jeff Moyer Cc: Jérôme Glisse Cc: Jonathan Corbet Cc: Mike Rapoport Cc: Toshi Kani Cc: Wei Yang Cc: Jason Gunthorpe Cc: Christoph Hellwig Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory_hotplug.c | 37 ++++++++++++------------------------- 1 file changed, 12 insertions(+), 25 deletions(-) diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index cf9d979a6498..85467914ad23 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -318,12 +318,8 @@ static unsigned long find_smallest_section_pfn(int nid, struct zone *zone, unsigned long start_pfn, unsigned long end_pfn) { - struct mem_section *ms; - - for (; start_pfn < end_pfn; start_pfn += PAGES_PER_SECTION) { - ms = __pfn_to_section(start_pfn); - - if (unlikely(!valid_section(ms))) + for (; start_pfn < end_pfn; start_pfn += PAGES_PER_SUBSECTION) { + if (unlikely(!pfn_valid(start_pfn))) continue; if (unlikely(pfn_to_nid(start_pfn) != nid)) @@ -343,15 +339,12 @@ static unsigned long find_biggest_section_pfn(int nid, struct zone *zone, unsigned long start_pfn, unsigned long end_pfn) { - struct mem_section *ms; unsigned long pfn; /* pfn is the end pfn of a memory section. 
*/ pfn = end_pfn - 1; - for (; pfn >= start_pfn; pfn -= PAGES_PER_SECTION) { - ms = __pfn_to_section(pfn); - - if (unlikely(!valid_section(ms))) + for (; pfn >= start_pfn; pfn -= PAGES_PER_SUBSECTION) { + if (unlikely(!pfn_valid(pfn))) continue; if (unlikely(pfn_to_nid(pfn) != nid)) @@ -373,7 +366,6 @@ static void shrink_zone_span(struct zone *zone, unsigned long start_pfn, unsigned long z = zone_end_pfn(zone); /* zone_end_pfn namespace clash */ unsigned long zone_end_pfn = z; unsigned long pfn; - struct mem_section *ms; int nid = zone_to_nid(zone); zone_span_writelock(zone); @@ -410,17 +402,15 @@ static void shrink_zone_span(struct zone *zone, unsigned long start_pfn, * it check the zone has only hole or not. */ pfn = zone_start_pfn; - for (; pfn < zone_end_pfn; pfn += PAGES_PER_SECTION) { - ms = __pfn_to_section(pfn); - - if (unlikely(!valid_section(ms))) + for (; pfn < zone_end_pfn; pfn += PAGES_PER_SUBSECTION) { + if (unlikely(!pfn_valid(pfn))) continue; if (page_zone(pfn_to_page(pfn)) != zone) continue; - /* If the section is current section, it continues the loop */ - if (start_pfn == pfn) + /* Skip range to be removed */ + if (pfn >= start_pfn && pfn < end_pfn) continue; /* If we find valid section, we have nothing to do */ @@ -441,7 +431,6 @@ static void shrink_pgdat_span(struct pglist_data *pgdat, unsigned long p = pgdat_end_pfn(pgdat); /* pgdat_end_pfn namespace clash */ unsigned long pgdat_end_pfn = p; unsigned long pfn; - struct mem_section *ms; int nid = pgdat->node_id; if (pgdat_start_pfn == start_pfn) { @@ -478,17 +467,15 @@ static void shrink_pgdat_span(struct pglist_data *pgdat, * has only hole or not. 
*/ pfn = pgdat_start_pfn; - for (; pfn < pgdat_end_pfn; pfn += PAGES_PER_SECTION) { - ms = __pfn_to_section(pfn); - - if (unlikely(!valid_section(ms))) + for (; pfn < pgdat_end_pfn; pfn += PAGES_PER_SUBSECTION) { + if (unlikely(!pfn_valid(pfn))) continue; if (pfn_to_nid(pfn) != nid) continue; - /* If the section is current section, it continues the loop */ - if (start_pfn == pfn) + /* Skip range to be removed */ + if (pfn >= start_pfn && pfn < end_pfn) continue; /* If we find valid section, we have nothing to do */ From e9c0a3f05477e18d2dae816cb61b62be1b7e90d3 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Thu, 18 Jul 2019 15:58:11 -0700 Subject: [PATCH 27/38] mm/sparsemem: convert kmalloc_section_memmap() to populate_section_memmap() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Allow sub-section sized ranges to be added to the memmap. populate_section_memmap() takes an explicit pfn range rather than assuming a full section, and those parameters are plumbed all the way through to vmemmap_populate(). There should be no sub-section usage in current deployments. New warnings are added to clarify which memmap allocation paths are sub-section capable. 
Link: http://lkml.kernel.org/r/156092352058.979959.6551283472062305149.stgit@dwillia2-desk3.amr.corp.intel.com Signed-off-by: Dan Williams Reviewed-by: Pavel Tatashin Tested-by: Aneesh Kumar K.V [ppc64] Reviewed-by: Oscar Salvador Cc: Michal Hocko Cc: David Hildenbrand Cc: Logan Gunthorpe Cc: Jane Chu Cc: Jeff Moyer Cc: Jérôme Glisse Cc: Jonathan Corbet Cc: Mike Rapoport Cc: Toshi Kani Cc: Vlastimil Babka Cc: Wei Yang Cc: Jason Gunthorpe Cc: Christoph Hellwig Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/mm/init_64.c | 4 +++- include/linux/mm.h | 4 ++-- mm/sparse-vmemmap.c | 21 ++++++++++++------ mm/sparse.c | 50 +++++++++++++++++++++++-------------------- 4 files changed, 46 insertions(+), 33 deletions(-) diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index 5a289a2ab108..a6b5c653727b 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -1518,7 +1518,9 @@ int __meminit vmemmap_populate(unsigned long start, unsigned long end, int node, { int err; - if (boot_cpu_has(X86_FEATURE_PSE)) + if (end - start < PAGES_PER_SECTION * sizeof(struct page)) + err = vmemmap_populate_basepages(start, end, node); + else if (boot_cpu_has(X86_FEATURE_PSE)) err = vmemmap_populate_hugepages(start, end, node, altmap); else if (altmap) { pr_err_once("%s: no cpu support for altmap allocations\n", diff --git a/include/linux/mm.h b/include/linux/mm.h index 48ab7b982d82..0334ca97c584 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2767,8 +2767,8 @@ static inline void print_vma_addr(char *prefix, unsigned long rip) #endif void *sparse_buffer_alloc(unsigned long size); -struct page *sparse_mem_map_populate(unsigned long pnum, int nid, - struct vmem_altmap *altmap); +struct page * __populate_section_memmap(unsigned long pfn, + unsigned long nr_pages, int nid, struct vmem_altmap *altmap); pgd_t *vmemmap_pgd_populate(unsigned long addr, int node); p4d_t *vmemmap_p4d_populate(pgd_t *pgd, unsigned long addr, int node); pud_t 
*vmemmap_pud_populate(p4d_t *p4d, unsigned long addr, int node); diff --git a/mm/sparse-vmemmap.c b/mm/sparse-vmemmap.c index 7fec05796796..200aef686722 100644 --- a/mm/sparse-vmemmap.c +++ b/mm/sparse-vmemmap.c @@ -245,19 +245,26 @@ int __meminit vmemmap_populate_basepages(unsigned long start, return 0; } -struct page * __meminit sparse_mem_map_populate(unsigned long pnum, int nid, - struct vmem_altmap *altmap) +struct page * __meminit __populate_section_memmap(unsigned long pfn, + unsigned long nr_pages, int nid, struct vmem_altmap *altmap) { unsigned long start; unsigned long end; - struct page *map; - map = pfn_to_page(pnum * PAGES_PER_SECTION); - start = (unsigned long)map; - end = (unsigned long)(map + PAGES_PER_SECTION); + /* + * The minimum granularity of memmap extensions is + * PAGES_PER_SUBSECTION as allocations are tracked in the + * 'subsection_map' bitmap of the section. + */ + end = ALIGN(pfn + nr_pages, PAGES_PER_SUBSECTION); + pfn &= PAGE_SUBSECTION_MASK; + nr_pages = end - pfn; + + start = (unsigned long) pfn_to_page(pfn); + end = start + nr_pages * sizeof(struct page); if (vmemmap_populate(start, end, nid, altmap)) return NULL; - return map; + return pfn_to_page(pfn); } diff --git a/mm/sparse.c b/mm/sparse.c index 26b48ee1a262..6b01022e23a9 100644 --- a/mm/sparse.c +++ b/mm/sparse.c @@ -439,8 +439,8 @@ static unsigned long __init section_map_size(void) return PAGE_ALIGN(sizeof(struct page) * PAGES_PER_SECTION); } -struct page __init *sparse_mem_map_populate(unsigned long pnum, int nid, - struct vmem_altmap *altmap) +struct page __init *__populate_section_memmap(unsigned long pfn, + unsigned long nr_pages, int nid, struct vmem_altmap *altmap) { unsigned long size = section_map_size(); struct page *map = sparse_buffer_alloc(size); @@ -521,10 +521,13 @@ static void __init sparse_init_nid(int nid, unsigned long pnum_begin, } sparse_buffer_init(map_count * section_map_size(), nid); for_each_present_section_nr(pnum_begin, pnum) { + unsigned long pfn = 
section_nr_to_pfn(pnum); + if (pnum >= pnum_end) break; - map = sparse_mem_map_populate(pnum, nid, NULL); + map = __populate_section_memmap(pfn, PAGES_PER_SECTION, + nid, NULL); if (!map) { pr_err("%s: node[%d] memory map backing failed. Some memory will not be available.", __func__, nid); @@ -625,17 +628,17 @@ void offline_mem_sections(unsigned long start_pfn, unsigned long end_pfn) #endif #ifdef CONFIG_SPARSEMEM_VMEMMAP -static inline struct page *kmalloc_section_memmap(unsigned long pnum, int nid, - struct vmem_altmap *altmap) +static struct page *populate_section_memmap(unsigned long pfn, + unsigned long nr_pages, int nid, struct vmem_altmap *altmap) { - /* This will make the necessary allocations eventually. */ - return sparse_mem_map_populate(pnum, nid, altmap); + return __populate_section_memmap(pfn, nr_pages, nid, altmap); } -static void __kfree_section_memmap(struct page *memmap, + +static void depopulate_section_memmap(unsigned long pfn, unsigned long nr_pages, struct vmem_altmap *altmap) { - unsigned long start = (unsigned long)memmap; - unsigned long end = (unsigned long)(memmap + PAGES_PER_SECTION); + unsigned long start = (unsigned long) pfn_to_page(pfn); + unsigned long end = start + nr_pages * sizeof(struct page); vmemmap_free(start, end, altmap); } @@ -647,7 +650,8 @@ static void free_map_bootmem(struct page *memmap) vmemmap_free(start, end, NULL); } #else -static struct page *__kmalloc_section_memmap(void) +struct page *populate_section_memmap(unsigned long pfn, + unsigned long nr_pages, int nid, struct vmem_altmap *altmap) { struct page *page, *ret; unsigned long memmap_size = sizeof(struct page) * PAGES_PER_SECTION; @@ -668,15 +672,11 @@ got_map_ptr: return ret; } -static inline struct page *kmalloc_section_memmap(unsigned long pnum, int nid, +static void depopulate_section_memmap(unsigned long pfn, unsigned long nr_pages, struct vmem_altmap *altmap) { - return __kmalloc_section_memmap(); -} + struct page *memmap = pfn_to_page(pfn); -static void 
__kfree_section_memmap(struct page *memmap, - struct vmem_altmap *altmap) -{ if (is_vmalloc_addr(memmap)) vfree(memmap); else @@ -745,12 +745,13 @@ int __meminit sparse_add_one_section(int nid, unsigned long start_pfn, if (ret < 0 && ret != -EEXIST) return ret; ret = 0; - memmap = kmalloc_section_memmap(section_nr, nid, altmap); + memmap = populate_section_memmap(start_pfn, PAGES_PER_SECTION, nid, + altmap); if (!memmap) return -ENOMEM; usage = kzalloc(mem_section_usage_size(), GFP_KERNEL); if (!usage) { - __kfree_section_memmap(memmap, altmap); + depopulate_section_memmap(start_pfn, PAGES_PER_SECTION, altmap); return -ENOMEM; } @@ -773,7 +774,7 @@ int __meminit sparse_add_one_section(int nid, unsigned long start_pfn, out: if (ret < 0) { kfree(usage); - __kfree_section_memmap(memmap, altmap); + depopulate_section_memmap(start_pfn, PAGES_PER_SECTION, altmap); } return ret; } @@ -809,7 +810,8 @@ static inline void clear_hwpoisoned_pages(struct page *memmap, int nr_pages) #endif static void free_section_usage(struct mem_section *ms, struct page *memmap, - struct mem_section_usage *usage, struct vmem_altmap *altmap) + struct mem_section_usage *usage, unsigned long pfn, + unsigned long nr_pages, struct vmem_altmap *altmap) { if (!usage) return; @@ -820,7 +822,7 @@ static void free_section_usage(struct mem_section *ms, struct page *memmap, if (!early_section(ms)) { kfree(usage); if (memmap) - __kfree_section_memmap(memmap, altmap); + depopulate_section_memmap(pfn, nr_pages, altmap); return; } @@ -849,6 +851,8 @@ void sparse_remove_one_section(struct mem_section *ms, unsigned long map_offset, clear_hwpoisoned_pages(memmap + map_offset, PAGES_PER_SECTION - map_offset); - free_section_usage(ms, memmap, usage, altmap); + free_section_usage(ms, memmap, usage, + section_nr_to_pfn(__section_nr(ms)), + PAGES_PER_SECTION, altmap); } #endif /* CONFIG_MEMORY_HOTPLUG */ From 96da4350000973ef9310a10d077d65bbc017f093 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Thu, 18 Jul 2019 
15:58:15 -0700 Subject: [PATCH 28/38] mm/hotplug: kill is_dev_zone() usage in __remove_pages() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The zone type check was a leftover from the cleanup that plumbed altmap through the memory hotplug path, i.e. commit da024512a1fa "mm: pass the vmem_altmap to arch_remove_memory and __remove_pages". Link: http://lkml.kernel.org/r/156092352642.979959.6664333788149363039.stgit@dwillia2-desk3.amr.corp.intel.com Signed-off-by: Dan Williams Reviewed-by: David Hildenbrand Reviewed-by: Oscar Salvador Tested-by: Aneesh Kumar K.V [ppc64] Cc: Michal Hocko Cc: Logan Gunthorpe Cc: Pavel Tatashin Cc: Jane Chu Cc: Jeff Moyer Cc: Jérôme Glisse Cc: Jonathan Corbet Cc: Mike Rapoport Cc: Toshi Kani Cc: Vlastimil Babka Cc: Wei Yang Cc: Jason Gunthorpe Cc: Christoph Hellwig Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory_hotplug.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 85467914ad23..11220044b01a 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -535,9 +535,7 @@ void __remove_pages(struct zone *zone, unsigned long phys_start_pfn, unsigned long map_offset = 0; int sections_to_remove; - /* In the ZONE_DEVICE case device driver owns the memory region */ - if (is_dev_zone(zone)) - map_offset = vmem_altmap_offset(altmap); + map_offset = vmem_altmap_offset(altmap); clear_zone_contiguous(zone); From 46d945aeab4d7dd837bd0724662de2caf712f047 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Thu, 18 Jul 2019 15:58:18 -0700 Subject: [PATCH 29/38] mm: kill is_dev_zone() helper MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Given there are no more usages of is_dev_zone() outside of 'ifdef CONFIG_ZONE_DEVICE' protection, kill off the compilation helper. 
Link: http://lkml.kernel.org/r/156092353211.979959.1489004866360828964.stgit@dwillia2-desk3.amr.corp.intel.com Signed-off-by: Dan Williams Reviewed-by: Oscar Salvador Reviewed-by: Pavel Tatashin Reviewed-by: Wei Yang Acked-by: David Hildenbrand Tested-by: Aneesh Kumar K.V [ppc64] Cc: Michal Hocko Cc: Logan Gunthorpe Cc: Jane Chu Cc: Jeff Moyer Cc: Jérôme Glisse Cc: Jonathan Corbet Cc: Mike Rapoport Cc: Toshi Kani Cc: Vlastimil Babka Cc: Jason Gunthorpe Cc: Christoph Hellwig Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mmzone.h | 12 ------------ mm/page_alloc.c | 2 +- 2 files changed, 1 insertion(+), 13 deletions(-) diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 7747ec9de588..8331e76677c0 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -855,18 +855,6 @@ static inline int local_memory_node(int node_id) { return node_id; }; */ #define zone_idx(zone) ((zone) - (zone)->zone_pgdat->node_zones) -#ifdef CONFIG_ZONE_DEVICE -static inline bool is_dev_zone(const struct zone *zone) -{ - return zone_idx(zone) == ZONE_DEVICE; -} -#else -static inline bool is_dev_zone(const struct zone *zone) -{ - return false; -} -#endif - /* * Returns true if a zone has pages managed by the buddy allocator. 
* All the reclaim decisions have to use this function rather than diff --git a/mm/page_alloc.c b/mm/page_alloc.c index c4cdd3954804..2c74367a8eba 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -5926,7 +5926,7 @@ void __ref memmap_init_zone_device(struct zone *zone, unsigned long start = jiffies; int nid = pgdat->node_id; - if (WARN_ON_ONCE(!pgmap || !is_dev_zone(zone))) + if (WARN_ON_ONCE(!pgmap || zone_idx(zone) != ZONE_DEVICE)) return; /* From 7ea6216049ff9cf250a6722cd766d99c8d1424e5 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Thu, 18 Jul 2019 15:58:22 -0700 Subject: [PATCH 30/38] mm/sparsemem: prepare for sub-section ranges MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Prepare the memory hot-{add,remove} paths for handling sub-section ranges by plumbing the starting page frame and number of pages being handled through arch_{add,remove}_memory() to sparse_{add,remove}_one_section(). This is simply plumbing, small cleanups, and some identifier renames. No intended functional changes. 
Link: http://lkml.kernel.org/r/156092353780.979959.9713046515562743194.stgit@dwillia2-desk3.amr.corp.intel.com Signed-off-by: Dan Williams Reviewed-by: Pavel Tatashin Tested-by: Aneesh Kumar K.V [ppc64] Reviewed-by: Oscar Salvador Cc: Michal Hocko Cc: Vlastimil Babka Cc: Logan Gunthorpe Cc: David Hildenbrand Cc: Jane Chu Cc: Jeff Moyer Cc: Jérôme Glisse Cc: Jonathan Corbet Cc: Mike Rapoport Cc: Toshi Kani Cc: Wei Yang Cc: Jason Gunthorpe Cc: Christoph Hellwig Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memory_hotplug.h | 5 +- mm/memory_hotplug.c | 114 ++++++++++++++++++++------------- mm/sparse.c | 16 ++--- 3 files changed, 81 insertions(+), 54 deletions(-) diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h index 475aff8efbf8..2d636a7491a4 100644 --- a/include/linux/memory_hotplug.h +++ b/include/linux/memory_hotplug.h @@ -346,9 +346,10 @@ extern int add_memory_resource(int nid, struct resource *resource); extern void move_pfn_range_to_zone(struct zone *zone, unsigned long start_pfn, unsigned long nr_pages, struct vmem_altmap *altmap); extern bool is_memblock_offlined(struct memory_block *mem); -extern int sparse_add_one_section(int nid, unsigned long start_pfn, - struct vmem_altmap *altmap); +extern int sparse_add_section(int nid, unsigned long pfn, + unsigned long nr_pages, struct vmem_altmap *altmap); extern void sparse_remove_one_section(struct mem_section *ms, + unsigned long pfn, unsigned long nr_pages, unsigned long map_offset, struct vmem_altmap *altmap); extern struct page *sparse_decode_mem_map(unsigned long coded_mem_map, unsigned long pnum); diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 11220044b01a..3fbb2cfab126 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -252,51 +252,84 @@ void __init register_page_bootmem_info_node(struct pglist_data *pgdat) } #endif /* CONFIG_HAVE_BOOTMEM_INFO_NODE */ -static int __meminit __add_section(int nid, unsigned long phys_start_pfn, 
- struct vmem_altmap *altmap) +static int __meminit __add_section(int nid, unsigned long pfn, + unsigned long nr_pages, struct vmem_altmap *altmap) { int ret; - if (pfn_valid(phys_start_pfn)) + if (pfn_valid(pfn)) return -EEXIST; - ret = sparse_add_one_section(nid, phys_start_pfn, altmap); + ret = sparse_add_section(nid, pfn, nr_pages, altmap); return ret < 0 ? ret : 0; } +static int check_pfn_span(unsigned long pfn, unsigned long nr_pages, + const char *reason) +{ + /* + * Disallow all operations smaller than a sub-section and only + * allow operations smaller than a section for + * SPARSEMEM_VMEMMAP. Note that check_hotplug_memory_range() + * enforces a larger memory_block_size_bytes() granularity for + * memory that will be marked online, so this check should only + * fire for direct arch_{add,remove}_memory() users outside of + * add_memory_resource(). + */ + unsigned long min_align; + + if (IS_ENABLED(CONFIG_SPARSEMEM_VMEMMAP)) + min_align = PAGES_PER_SUBSECTION; + else + min_align = PAGES_PER_SECTION; + if (!IS_ALIGNED(pfn, min_align) + || !IS_ALIGNED(nr_pages, min_align)) { + WARN(1, "Misaligned __%s_pages start: %#lx end: #%lx\n", + reason, pfn, pfn + nr_pages - 1); + return -EINVAL; + } + return 0; +} + /* * Reasonably generic function for adding memory. It is * expected that archs that support memory hotplug will * call this function after deciding the zone to which to * add the new pages. 
*/ -int __ref __add_pages(int nid, unsigned long phys_start_pfn, - unsigned long nr_pages, struct mhp_restrictions *restrictions) +int __ref __add_pages(int nid, unsigned long pfn, unsigned long nr_pages, + struct mhp_restrictions *restrictions) { unsigned long i; - int err = 0; - int start_sec, end_sec; + int start_sec, end_sec, err; struct vmem_altmap *altmap = restrictions->altmap; - /* during initialize mem_map, align hot-added range to section */ - start_sec = pfn_to_section_nr(phys_start_pfn); - end_sec = pfn_to_section_nr(phys_start_pfn + nr_pages - 1); - if (altmap) { /* * Validate altmap is within bounds of the total request */ - if (altmap->base_pfn != phys_start_pfn + if (altmap->base_pfn != pfn || vmem_altmap_offset(altmap) > nr_pages) { pr_warn_once("memory add fail, invalid altmap\n"); - err = -EINVAL; - goto out; + return -EINVAL; } altmap->alloc = 0; } + err = check_pfn_span(pfn, nr_pages, "add"); + if (err) + return err; + + start_sec = pfn_to_section_nr(pfn); + end_sec = pfn_to_section_nr(pfn + nr_pages - 1); for (i = start_sec; i <= end_sec; i++) { - err = __add_section(nid, section_nr_to_pfn(i), altmap); + unsigned long pfns; + + pfns = min(nr_pages, PAGES_PER_SECTION + - (pfn & ~PAGE_SECTION_MASK)); + err = __add_section(nid, pfn, pfns, altmap); + pfn += pfns; + nr_pages -= pfns; /* * EEXIST is finally dealt with by ioresource collision @@ -309,7 +342,6 @@ int __ref __add_pages(int nid, unsigned long phys_start_pfn, cond_resched(); } vmemmap_populate_print_last(); -out: return err; } @@ -487,10 +519,10 @@ static void shrink_pgdat_span(struct pglist_data *pgdat, pgdat->node_spanned_pages = 0; } -static void __remove_zone(struct zone *zone, unsigned long start_pfn) +static void __remove_zone(struct zone *zone, unsigned long start_pfn, + unsigned long nr_pages) { struct pglist_data *pgdat = zone->zone_pgdat; - int nr_pages = PAGES_PER_SECTION; unsigned long flags; pgdat_resize_lock(zone->zone_pgdat, &flags); @@ -499,27 +531,23 @@ static void 
__remove_zone(struct zone *zone, unsigned long start_pfn) pgdat_resize_unlock(zone->zone_pgdat, &flags); } -static void __remove_section(struct zone *zone, struct mem_section *ms, - unsigned long map_offset, - struct vmem_altmap *altmap) +static void __remove_section(struct zone *zone, unsigned long pfn, + unsigned long nr_pages, unsigned long map_offset, + struct vmem_altmap *altmap) { - unsigned long start_pfn; - int scn_nr; + struct mem_section *ms = __nr_to_section(pfn_to_section_nr(pfn)); if (WARN_ON_ONCE(!valid_section(ms))) return; - scn_nr = __section_nr(ms); - start_pfn = section_nr_to_pfn((unsigned long)scn_nr); - __remove_zone(zone, start_pfn); - - sparse_remove_one_section(ms, map_offset, altmap); + __remove_zone(zone, pfn, nr_pages); + sparse_remove_one_section(ms, pfn, nr_pages, map_offset, altmap); } /** * __remove_pages() - remove sections of pages from a zone * @zone: zone from which pages need to be removed - * @phys_start_pfn: starting pageframe (must be aligned to start of a section) + * @pfn: starting pageframe (must be aligned to start of a section) * @nr_pages: number of pages to remove (must be multiple of section size) * @altmap: alternative device page map or %NULL if default memmap is used * @@ -528,30 +556,30 @@ static void __remove_section(struct zone *zone, struct mem_section *ms, * sure that pages are marked reserved and zones are adjust properly by * calling offline_pages(). 
*/ -void __remove_pages(struct zone *zone, unsigned long phys_start_pfn, +void __remove_pages(struct zone *zone, unsigned long pfn, unsigned long nr_pages, struct vmem_altmap *altmap) { - unsigned long i; unsigned long map_offset = 0; - int sections_to_remove; + int i, start_sec, end_sec; map_offset = vmem_altmap_offset(altmap); clear_zone_contiguous(zone); - /* - * We can only remove entire sections - */ - BUG_ON(phys_start_pfn & ~PAGE_SECTION_MASK); - BUG_ON(nr_pages % PAGES_PER_SECTION); + if (check_pfn_span(pfn, nr_pages, "remove")) + return; - sections_to_remove = nr_pages / PAGES_PER_SECTION; - for (i = 0; i < sections_to_remove; i++) { - unsigned long pfn = phys_start_pfn + i*PAGES_PER_SECTION; + start_sec = pfn_to_section_nr(pfn); + end_sec = pfn_to_section_nr(pfn + nr_pages - 1); + for (i = start_sec; i <= end_sec; i++) { + unsigned long pfns; cond_resched(); - __remove_section(zone, __pfn_to_section(pfn), map_offset, - altmap); + pfns = min(nr_pages, PAGES_PER_SECTION + - (pfn & ~PAGE_SECTION_MASK)); + __remove_section(zone, pfn, pfns, map_offset, altmap); + pfn += pfns; + nr_pages -= pfns; map_offset = 0; } diff --git a/mm/sparse.c b/mm/sparse.c index 6b01022e23a9..41579b66fff1 100644 --- a/mm/sparse.c +++ b/mm/sparse.c @@ -728,8 +728,8 @@ static void free_map_bootmem(struct page *memmap) * * -EEXIST - Section has been present. * * -ENOMEM - Out of memory. 
*/ -int __meminit sparse_add_one_section(int nid, unsigned long start_pfn, - struct vmem_altmap *altmap) +int __meminit sparse_add_section(int nid, unsigned long start_pfn, + unsigned long nr_pages, struct vmem_altmap *altmap) { unsigned long section_nr = pfn_to_section_nr(start_pfn); struct mem_section_usage *usage; @@ -835,8 +835,9 @@ static void free_section_usage(struct mem_section *ms, struct page *memmap, free_map_bootmem(memmap); } -void sparse_remove_one_section(struct mem_section *ms, unsigned long map_offset, - struct vmem_altmap *altmap) +void sparse_remove_one_section(struct mem_section *ms, unsigned long pfn, + unsigned long nr_pages, unsigned long map_offset, + struct vmem_altmap *altmap) { struct page *memmap = NULL; struct mem_section_usage *usage = NULL; @@ -849,10 +850,7 @@ void sparse_remove_one_section(struct mem_section *ms, unsigned long map_offset, ms->usage = NULL; } - clear_hwpoisoned_pages(memmap + map_offset, - PAGES_PER_SECTION - map_offset); - free_section_usage(ms, memmap, usage, - section_nr_to_pfn(__section_nr(ms)), - PAGES_PER_SECTION, altmap); + clear_hwpoisoned_pages(memmap + map_offset, nr_pages - map_offset); + free_section_usage(ms, memmap, usage, pfn, nr_pages, altmap); } #endif /* CONFIG_MEMORY_HOTPLUG */ From ba72b4c8cf60e452cf6f0258ed9ee697957b7dfd Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Thu, 18 Jul 2019 15:58:26 -0700 Subject: [PATCH 31/38] mm/sparsemem: support sub-section hotplug MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The libnvdimm sub-system has suffered a series of hacks and broken workarounds for the memory-hotplug implementation's awkward section-aligned (128MB) granularity. 
For example, the following backtrace is emitted when attempting arch_add_memory() with physical address ranges that intersect 'System RAM' (RAM) with 'Persistent Memory' (PMEM) within a given section: # cat /proc/iomem | grep -A1 -B1 Persistent\ Memory 100000000-1ffffffff : System RAM 200000000-303ffffff : Persistent Memory (legacy) 304000000-43fffffff : System RAM 440000000-23ffffffff : Persistent Memory 2400000000-43bfffffff : Persistent Memory 2400000000-43bfffffff : namespace2.0 WARNING: CPU: 38 PID: 928 at arch/x86/mm/init_64.c:850 add_pages+0x5c/0x60 [..] RIP: 0010:add_pages+0x5c/0x60 [..] Call Trace: devm_memremap_pages+0x460/0x6e0 pmem_attach_disk+0x29e/0x680 [nd_pmem] ? nd_dax_probe+0xfc/0x120 [libnvdimm] nvdimm_bus_probe+0x66/0x160 [libnvdimm] It was discovered that the problem goes beyond RAM vs PMEM collisions as some platforms produce PMEM vs PMEM collisions within a given section. The libnvdimm workaround for that case revealed that the libnvdimm section-alignment-padding implementation has been broken for a long while. A fix for that long-standing breakage introduces as many problems as it solves as it would require a backward-incompatible change to the namespace metadata interpretation. Instead of that dubious route [1], address the root problem in the memory-hotplug implementation. Note that EEXIST is no longer treated as success as that is how sparse_add_section() reports subsection collisions; it was also obviated by recent changes to perform the request_region() for 'System RAM' before arch_add_memory() in the add_memory() sequence. 
[1] https://lore.kernel.org/r/155000671719.348031.2347363160141119237.stgit@dwillia2-desk3.amr.corp.intel.com [osalvador@suse.de: fix deactivate_section for early sections] Link: http://lkml.kernel.org/r/20190715081549.32577-2-osalvador@suse.de Link: http://lkml.kernel.org/r/156092354368.979959.6232443923440952359.stgit@dwillia2-desk3.amr.corp.intel.com Signed-off-by: Dan Williams Signed-off-by: Oscar Salvador Tested-by: Aneesh Kumar K.V [ppc64] Reviewed-by: Oscar Salvador Cc: Michal Hocko Cc: Vlastimil Babka Cc: Logan Gunthorpe Cc: Pavel Tatashin Cc: David Hildenbrand Cc: Jane Chu Cc: Jeff Moyer Cc: Jérôme Glisse Cc: Jonathan Corbet Cc: Mike Rapoport Cc: Toshi Kani Cc: Wei Yang Cc: Jason Gunthorpe Cc: Christoph Hellwig Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memory_hotplug.h | 2 +- mm/memory_hotplug.c | 27 +---- mm/page_alloc.c | 2 +- mm/sparse.c | 206 +++++++++++++++++++++------------ 4 files changed, 141 insertions(+), 96 deletions(-) diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h index 2d636a7491a4..f46ea71b4ffd 100644 --- a/include/linux/memory_hotplug.h +++ b/include/linux/memory_hotplug.h @@ -348,7 +348,7 @@ extern void move_pfn_range_to_zone(struct zone *zone, unsigned long start_pfn, extern bool is_memblock_offlined(struct memory_block *mem); extern int sparse_add_section(int nid, unsigned long pfn, unsigned long nr_pages, struct vmem_altmap *altmap); -extern void sparse_remove_one_section(struct mem_section *ms, +extern void sparse_remove_section(struct mem_section *ms, unsigned long pfn, unsigned long nr_pages, unsigned long map_offset, struct vmem_altmap *altmap); extern struct page *sparse_decode_mem_map(unsigned long coded_mem_map, diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 3fbb2cfab126..aafb71594ee3 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -252,18 +252,6 @@ void __init register_page_bootmem_info_node(struct pglist_data *pgdat) } #endif /* 
CONFIG_HAVE_BOOTMEM_INFO_NODE */ -static int __meminit __add_section(int nid, unsigned long pfn, - unsigned long nr_pages, struct vmem_altmap *altmap) -{ - int ret; - - if (pfn_valid(pfn)) - return -EEXIST; - - ret = sparse_add_section(nid, pfn, nr_pages, altmap); - return ret < 0 ? ret : 0; -} - static int check_pfn_span(unsigned long pfn, unsigned long nr_pages, const char *reason) { @@ -327,18 +315,11 @@ int __ref __add_pages(int nid, unsigned long pfn, unsigned long nr_pages, pfns = min(nr_pages, PAGES_PER_SECTION - (pfn & ~PAGE_SECTION_MASK)); - err = __add_section(nid, pfn, pfns, altmap); + err = sparse_add_section(nid, pfn, pfns, altmap); + if (err) + break; pfn += pfns; nr_pages -= pfns; - - /* - * EEXIST is finally dealt with by ioresource collision - * check. see add_memory() => register_memory_resource() - * Warning will be printed if there is collision. - */ - if (err && (err != -EEXIST)) - break; - err = 0; cond_resched(); } vmemmap_populate_print_last(); @@ -541,7 +522,7 @@ static void __remove_section(struct zone *zone, unsigned long pfn, return; __remove_zone(zone, pfn, nr_pages); - sparse_remove_one_section(ms, pfn, nr_pages, map_offset, altmap); + sparse_remove_section(ms, pfn, nr_pages, map_offset, altmap); } /** diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 2c74367a8eba..272c6de1bf4e 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -5974,7 +5974,7 @@ void __ref memmap_init_zone_device(struct zone *zone, * pfn out of zone. 
* * Please note that MEMMAP_HOTPLUG path doesn't clear memmap - * because this is done early in sparse_add_one_section + * because this is done early in section_activate() */ if (!(pfn & (pageblock_nr_pages - 1))) { set_pageblock_migratetype(page, MIGRATE_MOVABLE); diff --git a/mm/sparse.c b/mm/sparse.c index 41579b66fff1..a205a2ac66a4 100644 --- a/mm/sparse.c +++ b/mm/sparse.c @@ -83,8 +83,15 @@ static int __meminit sparse_index_init(unsigned long section_nr, int nid) unsigned long root = SECTION_NR_TO_ROOT(section_nr); struct mem_section *section; + /* + * An existing section is possible in the sub-section hotplug + * case. First hot-add instantiates, follow-on hot-add reuses + * the existing section. + * + * The mem_hotplug_lock resolves the apparent race below. + */ if (mem_section[root]) - return -EEXIST; + return 0; section = sparse_index_alloc(nid); if (!section) @@ -715,10 +722,120 @@ static void free_map_bootmem(struct page *memmap) } #endif /* CONFIG_SPARSEMEM_VMEMMAP */ +static void section_deactivate(unsigned long pfn, unsigned long nr_pages, + struct vmem_altmap *altmap) +{ + DECLARE_BITMAP(map, SUBSECTIONS_PER_SECTION) = { 0 }; + DECLARE_BITMAP(tmp, SUBSECTIONS_PER_SECTION) = { 0 }; + struct mem_section *ms = __pfn_to_section(pfn); + bool section_is_early = early_section(ms); + struct page *memmap = NULL; + unsigned long *subsection_map = ms->usage + ? &ms->usage->subsection_map[0] : NULL; + + subsection_mask_set(map, pfn, nr_pages); + if (subsection_map) + bitmap_and(tmp, map, subsection_map, SUBSECTIONS_PER_SECTION); + + if (WARN(!subsection_map || !bitmap_equal(tmp, map, SUBSECTIONS_PER_SECTION), + "section already deactivated (%#lx + %ld)\n", + pfn, nr_pages)) + return; + + /* + * There are 3 cases to handle across two configurations + * (SPARSEMEM_VMEMMAP={y,n}): + * + * 1/ deactivation of a partial hot-added section (only possible + * in the SPARSEMEM_VMEMMAP=y case). 
+ * a/ section was present at memory init + * b/ section was hot-added post memory init + * 2/ deactivation of a complete hot-added section + * 3/ deactivation of a complete section from memory init + * + * For 1/, when subsection_map does not empty we will not be + * freeing the usage map, but still need to free the vmemmap + * range. + * + * For 2/ and 3/ the SPARSEMEM_VMEMMAP={y,n} cases are unified + */ + bitmap_xor(subsection_map, map, subsection_map, SUBSECTIONS_PER_SECTION); + if (bitmap_empty(subsection_map, SUBSECTIONS_PER_SECTION)) { + unsigned long section_nr = pfn_to_section_nr(pfn); + + if (!section_is_early) { + kfree(ms->usage); + ms->usage = NULL; + } + memmap = sparse_decode_mem_map(ms->section_mem_map, section_nr); + ms->section_mem_map = sparse_encode_mem_map(NULL, section_nr); + } + + if (section_is_early && memmap) + free_map_bootmem(memmap); + else + depopulate_section_memmap(pfn, nr_pages, altmap); +} + +static struct page * __meminit section_activate(int nid, unsigned long pfn, + unsigned long nr_pages, struct vmem_altmap *altmap) +{ + DECLARE_BITMAP(map, SUBSECTIONS_PER_SECTION) = { 0 }; + struct mem_section *ms = __pfn_to_section(pfn); + struct mem_section_usage *usage = NULL; + unsigned long *subsection_map; + struct page *memmap; + int rc = 0; + + subsection_mask_set(map, pfn, nr_pages); + + if (!ms->usage) { + usage = kzalloc(mem_section_usage_size(), GFP_KERNEL); + if (!usage) + return ERR_PTR(-ENOMEM); + ms->usage = usage; + } + subsection_map = &ms->usage->subsection_map[0]; + + if (bitmap_empty(map, SUBSECTIONS_PER_SECTION)) + rc = -EINVAL; + else if (bitmap_intersects(map, subsection_map, SUBSECTIONS_PER_SECTION)) + rc = -EEXIST; + else + bitmap_or(subsection_map, map, subsection_map, + SUBSECTIONS_PER_SECTION); + + if (rc) { + if (usage) + ms->usage = NULL; + kfree(usage); + return ERR_PTR(rc); + } + + /* + * The early init code does not consider partially populated + * initial sections, it simply assumes that memory will never be 
+ * referenced. If we hot-add memory into such a section then we + * do not need to populate the memmap and can simply reuse what + * is already there. + */ + if (nr_pages < PAGES_PER_SECTION && early_section(ms)) + return pfn_to_page(pfn); + + memmap = populate_section_memmap(pfn, nr_pages, nid, altmap); + if (!memmap) { + section_deactivate(pfn, nr_pages, altmap); + return ERR_PTR(-ENOMEM); + } + + return memmap; +} + /** - * sparse_add_one_section - add a memory section + * sparse_add_section - add a memory section, or populate an existing one * @nid: The node to add section on * @start_pfn: start pfn of the memory range + * @nr_pages: number of pfns to add in the section * @altmap: device page map * * This is only intended for hotplug. @@ -732,51 +849,34 @@ int __meminit sparse_add_section(int nid, unsigned long start_pfn, unsigned long nr_pages, struct vmem_altmap *altmap) { unsigned long section_nr = pfn_to_section_nr(start_pfn); - struct mem_section_usage *usage; struct mem_section *ms; struct page *memmap; int ret; - /* - * no locking for this, because it does its own - * plus, it does a kmalloc - */ ret = sparse_index_init(section_nr, nid); - if (ret < 0 && ret != -EEXIST) + if (ret < 0) return ret; - ret = 0; - memmap = populate_section_memmap(start_pfn, PAGES_PER_SECTION, nid, - altmap); - if (!memmap) - return -ENOMEM; - usage = kzalloc(mem_section_usage_size(), GFP_KERNEL); - if (!usage) { - depopulate_section_memmap(start_pfn, PAGES_PER_SECTION, altmap); - return -ENOMEM; - } - ms = __pfn_to_section(start_pfn); - if (ms->section_mem_map & SECTION_MARKED_PRESENT) { - ret = -EEXIST; - goto out; - } + memmap = section_activate(nid, start_pfn, nr_pages, altmap); + if (IS_ERR(memmap)) + return PTR_ERR(memmap); /* * Poison uninitialized struct pages in order to catch invalid flags * combinations. 
*/ - page_init_poison(memmap, sizeof(struct page) * PAGES_PER_SECTION); + page_init_poison(pfn_to_page(start_pfn), sizeof(struct page) * nr_pages); + ms = __pfn_to_section(start_pfn); set_section_nid(section_nr, nid); section_mark_present(ms); - sparse_init_one_section(ms, section_nr, memmap, usage, 0); -out: - if (ret < 0) { - kfree(usage); - depopulate_section_memmap(start_pfn, PAGES_PER_SECTION, altmap); - } - return ret; + /* Align memmap to section boundary in the subsection case */ + if (section_nr_to_pfn(section_nr) != start_pfn) + memmap = pfn_to_kaddr(section_nr_to_pfn(section_nr)); + sparse_init_one_section(ms, section_nr, memmap, ms->usage, 0); + + return 0; } #ifdef CONFIG_MEMORY_FAILURE @@ -809,48 +909,12 @@ static inline void clear_hwpoisoned_pages(struct page *memmap, int nr_pages) } #endif -static void free_section_usage(struct mem_section *ms, struct page *memmap, - struct mem_section_usage *usage, unsigned long pfn, - unsigned long nr_pages, struct vmem_altmap *altmap) -{ - if (!usage) - return; - - /* - * Check to see if allocation came from hot-plug-add - */ - if (!early_section(ms)) { - kfree(usage); - if (memmap) - depopulate_section_memmap(pfn, nr_pages, altmap); - return; - } - - /* - * The usemap came from bootmem. This is packed with other usemaps - * on the section which has pgdat at boot time. Just keep it as is now. 
- */ - - if (memmap) - free_map_bootmem(memmap); -} - -void sparse_remove_one_section(struct mem_section *ms, unsigned long pfn, +void sparse_remove_section(struct mem_section *ms, unsigned long pfn, unsigned long nr_pages, unsigned long map_offset, struct vmem_altmap *altmap) { - struct page *memmap = NULL; - struct mem_section_usage *usage = NULL; - - if (ms->section_mem_map) { - usage = ms->usage; - memmap = sparse_decode_mem_map(ms->section_mem_map, - __section_nr(ms)); - ms->section_mem_map = 0; - ms->usage = NULL; - } - - clear_hwpoisoned_pages(memmap + map_offset, nr_pages - map_offset); - free_section_usage(ms, memmap, usage, pfn, nr_pages, altmap); + clear_hwpoisoned_pages(pfn_to_page(pfn) + map_offset, + nr_pages - map_offset); + section_deactivate(pfn, nr_pages, altmap); } #endif /* CONFIG_MEMORY_HOTPLUG */ From a0653406a3a671c1609d54835f0443869525ca30 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Thu, 18 Jul 2019 15:58:29 -0700 Subject: [PATCH 32/38] mm: document ZONE_DEVICE memory-model implications MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Explain the general mechanisms of 'ZONE_DEVICE' pages and list the users of 'devm_memremap_pages()'. 
[dan.j.williams@intel.com: update ZONE_DEVICE memory model documentation] Link: http://lkml.kernel.org/r/156109575458.1409767.1885676287099277666.stgit@dwillia2-desk3.amr.corp.intel.com Link: http://lkml.kernel.org/r/156092354985.979959.15763234410543451710.stgit@dwillia2-desk3.amr.corp.intel.com Signed-off-by: Dan Williams Reported-by: Mike Rapoport Reviewed-by: Mike Rapoport Tested-by: Aneesh Kumar K.V [ppc64] Cc: Jonathan Corbet Cc: David Hildenbrand Cc: Jane Chu Cc: Jeff Moyer Cc: Jérôme Glisse Cc: Logan Gunthorpe Cc: Michal Hocko Cc: Oscar Salvador Cc: Pavel Tatashin Cc: Toshi Kani Cc: Vlastimil Babka Cc: Wei Yang Cc: Jason Gunthorpe Cc: Christoph Hellwig Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/vm/memory-model.rst | 40 +++++++++++++++++++++++++++++++ 1 file changed, 40 insertions(+) diff --git a/Documentation/vm/memory-model.rst b/Documentation/vm/memory-model.rst index 382f72ace1fc..58a12376b7df 100644 --- a/Documentation/vm/memory-model.rst +++ b/Documentation/vm/memory-model.rst @@ -181,3 +181,43 @@ that is eventually passed to vmemmap_populate() through a long chain of function calls. The vmemmap_populate() implementation may use the `vmem_altmap` along with :c:func:`altmap_alloc_block_buf` helper to allocate memory map on the persistent memory device. + +ZONE_DEVICE +=========== +The `ZONE_DEVICE` facility builds upon `SPARSEMEM_VMEMMAP` to offer +`struct page` `mem_map` services for device driver identified physical +address ranges. The "device" aspect of `ZONE_DEVICE` relates to the fact +that the page objects for these address ranges are never marked online, +and that a reference must be taken against the device, not just the page +to keep the memory pinned for active use. `ZONE_DEVICE`, via +:c:func:`devm_memremap_pages`, performs just enough memory hotplug to +turn on :c:func:`pfn_to_page`, :c:func:`page_to_pfn`, and +:c:func:`get_user_pages` service for the given range of pfns. 
Since the +page reference count never drops below 1 the page is never tracked as +free memory and the page's `struct list_head lru` space is repurposed +for back referencing to the host device / driver that mapped the memory. + +While `SPARSEMEM` presents memory as a collection of sections, +optionally collected into memory blocks, `ZONE_DEVICE` users have a need +for smaller granularity of populating the `mem_map`. Given that +`ZONE_DEVICE` memory is never marked online it is subsequently never +subject to its memory ranges being exposed through the sysfs memory +hotplug api on memory block boundaries. The implementation relies on +this lack of user-api constraint to allow sub-section sized memory +ranges to be specified to :c:func:`arch_add_memory`, the top-half of +memory hotplug. Sub-section support allows for 2MB as the cross-arch +common alignment granularity for :c:func:`devm_memremap_pages`. + +The users of `ZONE_DEVICE` are: + +* pmem: Map platform persistent memory to be used as a direct-I/O target + via DAX mappings. + +* hmm: Extend `ZONE_DEVICE` with `->page_fault()` and `->page_free()` + event callbacks to allow a device-driver to coordinate memory management + events related to device-memory, typically GPU memory. See + Documentation/vm/hmm.rst. + +* p2pdma: Create `struct page` objects to allow peer devices in a + PCI/-E topology to coordinate direct-DMA operations between themselves, + i.e. bypass host memory. From 7cc7867fb06166ac113eda9cf20d3c15d95ff6f5 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Thu, 18 Jul 2019 15:58:33 -0700 Subject: [PATCH 33/38] mm/devm_memremap_pages: enable sub-section remap MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Teach devm_memremap_pages() about the new sub-section capabilities of arch_{add,remove}_memory(). Effectively, just replace all usage of align_start, align_end, and align_size with res->start, res->end, and resource_size(res). 
The existing sanity check will still make sure that the two separate remap attempts do not collide within a sub-section (2MB on x86). Link: http://lkml.kernel.org/r/156092355542.979959.10060071713397030576.stgit@dwillia2-desk3.amr.corp.intel.com Signed-off-by: Dan Williams Tested-by: Aneesh Kumar K.V [ppc64] Cc: Michal Hocko Cc: Toshi Kani Cc: Jérôme Glisse Cc: Logan Gunthorpe Cc: Oscar Salvador Cc: Pavel Tatashin Cc: David Hildenbrand Cc: Jane Chu Cc: Jeff Moyer Cc: Jonathan Corbet Cc: Mike Rapoport Cc: Vlastimil Babka Cc: Wei Yang Cc: Jason Gunthorpe Cc: Christoph Hellwig Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/memremap.c | 57 +++++++++++++++++++---------------------------- 1 file changed, 23 insertions(+), 34 deletions(-) diff --git a/kernel/memremap.c b/kernel/memremap.c index bea6f887adad..6ee03a816d67 100644 --- a/kernel/memremap.c +++ b/kernel/memremap.c @@ -54,7 +54,7 @@ static void pgmap_array_delete(struct resource *res) static unsigned long pfn_first(struct dev_pagemap *pgmap) { - return (pgmap->res.start >> PAGE_SHIFT) + + return PHYS_PFN(pgmap->res.start) + vmem_altmap_offset(pgmap_altmap(pgmap)); } @@ -98,7 +98,6 @@ static void devm_memremap_pages_release(void *data) struct dev_pagemap *pgmap = data; struct device *dev = pgmap->dev; struct resource *res = &pgmap->res; - resource_size_t align_start, align_size; unsigned long pfn; int nid; @@ -108,25 +107,21 @@ static void devm_memremap_pages_release(void *data) dev_pagemap_cleanup(pgmap); /* pages are dead and unused, undo the arch mapping */ - align_start = res->start & ~(SECTION_SIZE - 1); - align_size = ALIGN(res->start + resource_size(res), SECTION_SIZE) - - align_start; - - nid = page_to_nid(pfn_to_page(align_start >> PAGE_SHIFT)); + nid = page_to_nid(pfn_to_page(PHYS_PFN(res->start))); mem_hotplug_begin(); if (pgmap->type == MEMORY_DEVICE_PRIVATE) { - pfn = align_start >> PAGE_SHIFT; + pfn = PHYS_PFN(res->start); __remove_pages(page_zone(pfn_to_page(pfn)), pfn, - 
align_size >> PAGE_SHIFT, NULL); + PHYS_PFN(resource_size(res)), NULL); } else { - arch_remove_memory(nid, align_start, align_size, + arch_remove_memory(nid, res->start, resource_size(res), pgmap_altmap(pgmap)); - kasan_remove_zero_shadow(__va(align_start), align_size); + kasan_remove_zero_shadow(__va(res->start), resource_size(res)); } mem_hotplug_done(); - untrack_pfn(NULL, PHYS_PFN(align_start), align_size); + untrack_pfn(NULL, PHYS_PFN(res->start), resource_size(res)); pgmap_array_delete(res); dev_WARN_ONCE(dev, pgmap->altmap.alloc, "%s: failed to free all reserved pages\n", __func__); @@ -162,13 +157,12 @@ static void dev_pagemap_percpu_release(struct percpu_ref *ref) */ void *devm_memremap_pages(struct device *dev, struct dev_pagemap *pgmap) { - resource_size_t align_start, align_size, align_end; struct resource *res = &pgmap->res; struct dev_pagemap *conflict_pgmap; struct mhp_restrictions restrictions = { /* * We do not want any optional features only our own memmap - */ + */ .altmap = pgmap_altmap(pgmap), }; pgprot_t pgprot = PAGE_KERNEL; @@ -225,12 +219,7 @@ void *devm_memremap_pages(struct device *dev, struct dev_pagemap *pgmap) return ERR_PTR(error); } - align_start = res->start & ~(SECTION_SIZE - 1); - align_size = ALIGN(res->start + resource_size(res), SECTION_SIZE) - - align_start; - align_end = align_start + align_size - 1; - - conflict_pgmap = get_dev_pagemap(PHYS_PFN(align_start), NULL); + conflict_pgmap = get_dev_pagemap(PHYS_PFN(res->start), NULL); if (conflict_pgmap) { dev_WARN(dev, "Conflicting mapping in same section\n"); put_dev_pagemap(conflict_pgmap); @@ -238,7 +227,7 @@ void *devm_memremap_pages(struct device *dev, struct dev_pagemap *pgmap) goto err_array; } - conflict_pgmap = get_dev_pagemap(PHYS_PFN(align_end), NULL); + conflict_pgmap = get_dev_pagemap(PHYS_PFN(res->end), NULL); if (conflict_pgmap) { dev_WARN(dev, "Conflicting mapping in same section\n"); put_dev_pagemap(conflict_pgmap); @@ -246,7 +235,7 @@ void 
*devm_memremap_pages(struct device *dev, struct dev_pagemap *pgmap) goto err_array; } - is_ram = region_intersects(align_start, align_size, + is_ram = region_intersects(res->start, resource_size(res), IORESOURCE_SYSTEM_RAM, IORES_DESC_NONE); if (is_ram != REGION_DISJOINT) { @@ -267,8 +256,8 @@ void *devm_memremap_pages(struct device *dev, struct dev_pagemap *pgmap) if (nid < 0) nid = numa_mem_id(); - error = track_pfn_remap(NULL, &pgprot, PHYS_PFN(align_start), 0, - align_size); + error = track_pfn_remap(NULL, &pgprot, PHYS_PFN(res->start), 0, + resource_size(res)); if (error) goto err_pfn_remap; @@ -286,16 +275,16 @@ void *devm_memremap_pages(struct device *dev, struct dev_pagemap *pgmap) * arch_add_memory(). */ if (pgmap->type == MEMORY_DEVICE_PRIVATE) { - error = add_pages(nid, align_start >> PAGE_SHIFT, - align_size >> PAGE_SHIFT, &restrictions); + error = add_pages(nid, PHYS_PFN(res->start), + PHYS_PFN(resource_size(res)), &restrictions); } else { - error = kasan_add_zero_shadow(__va(align_start), align_size); + error = kasan_add_zero_shadow(__va(res->start), resource_size(res)); if (error) { mem_hotplug_done(); goto err_kasan; } - error = arch_add_memory(nid, align_start, align_size, + error = arch_add_memory(nid, res->start, resource_size(res), &restrictions); } @@ -303,8 +292,8 @@ void *devm_memremap_pages(struct device *dev, struct dev_pagemap *pgmap) struct zone *zone; zone = &NODE_DATA(nid)->node_zones[ZONE_DEVICE]; - move_pfn_range_to_zone(zone, align_start >> PAGE_SHIFT, - align_size >> PAGE_SHIFT, pgmap_altmap(pgmap)); + move_pfn_range_to_zone(zone, PHYS_PFN(res->start), + PHYS_PFN(resource_size(res)), restrictions.altmap); } mem_hotplug_done(); @@ -316,8 +305,8 @@ void *devm_memremap_pages(struct device *dev, struct dev_pagemap *pgmap) * to allow us to do the work while not holding the hotplug lock. 
*/ memmap_init_zone_device(&NODE_DATA(nid)->node_zones[ZONE_DEVICE], - align_start >> PAGE_SHIFT, - align_size >> PAGE_SHIFT, pgmap); + PHYS_PFN(res->start), + PHYS_PFN(resource_size(res)), pgmap); percpu_ref_get_many(pgmap->ref, pfn_end(pgmap) - pfn_first(pgmap)); error = devm_add_action_or_reset(dev, devm_memremap_pages_release, @@ -328,9 +317,9 @@ void *devm_memremap_pages(struct device *dev, struct dev_pagemap *pgmap) return __va(res->start); err_add_memory: - kasan_remove_zero_shadow(__va(align_start), align_size); + kasan_remove_zero_shadow(__va(res->start), resource_size(res)); err_kasan: - untrack_pfn(NULL, PHYS_PFN(align_start), align_size); + untrack_pfn(NULL, PHYS_PFN(res->start), resource_size(res)); err_pfn_remap: pgmap_array_delete(res); err_array: From 7e3e888dfc138089f4c15a81b418e88f0978f744 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Thu, 18 Jul 2019 15:58:36 -0700 Subject: [PATCH 34/38] libnvdimm/pfn: fix fsdax-mode namespace info-block zero-fields MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit At namespace creation time there is the potential for the "expected to be zero" fields of a 'pfn' info-block to be filled with indeterminate data. While the kernel buffer is zeroed on allocation it is immediately overwritten by nd_pfn_validate() filling it with the current contents of the on-media info-block location. For fields like 'flags' and the 'padding' it potentially means that future implementations can not rely on those fields being zero. In preparation to stop using the 'start_pad' and 'end_trunc' fields for section alignment, arrange for fields that are not explicitly initialized to be guaranteed zero. Bump the minor version to indicate it is safe to assume the 'padding' and 'flags' are zero. Otherwise, this corruption is expected to be benign since all other critical fields are explicitly initialized.
Note The cc: stable is about spreading this new policy to as many kernels as possible not fixing an issue in those kernels. It is not until the change titled "libnvdimm/pfn: Stop padding pmem namespaces to section alignment" where this improper initialization becomes a problem. So if someone decides to backport "libnvdimm/pfn: Stop padding pmem namespaces to section alignment" (which is not tagged for stable), make sure this pre-requisite is flagged. Link: http://lkml.kernel.org/r/156092356065.979959.6681003754765958296.stgit@dwillia2-desk3.amr.corp.intel.com Fixes: 32ab0a3f5170 ("libnvdimm, pmem: 'struct page' for pmem") Signed-off-by: Dan Williams Tested-by: Aneesh Kumar K.V [ppc64] Cc: Cc: David Hildenbrand Cc: Jane Chu Cc: Jeff Moyer Cc: Jérôme Glisse Cc: Jonathan Corbet Cc: Logan Gunthorpe Cc: Michal Hocko Cc: Mike Rapoport Cc: Oscar Salvador Cc: Pavel Tatashin Cc: Toshi Kani Cc: Vlastimil Babka Cc: Wei Yang Cc: Jason Gunthorpe Cc: Christoph Hellwig Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/nvdimm/dax_devs.c | 2 +- drivers/nvdimm/pfn.h | 1 + drivers/nvdimm/pfn_devs.c | 18 +++++++++++++++--- 3 files changed, 17 insertions(+), 4 deletions(-) diff --git a/drivers/nvdimm/dax_devs.c b/drivers/nvdimm/dax_devs.c index 49fc18ee0565..6d22b0f83b3b 100644 --- a/drivers/nvdimm/dax_devs.c +++ b/drivers/nvdimm/dax_devs.c @@ -118,7 +118,7 @@ int nd_dax_probe(struct device *dev, struct nd_namespace_common *ndns) nvdimm_bus_unlock(&ndns->dev); if (!dax_dev) return -ENOMEM; - pfn_sb = devm_kzalloc(dev, sizeof(*pfn_sb), GFP_KERNEL); + pfn_sb = devm_kmalloc(dev, sizeof(*pfn_sb), GFP_KERNEL); nd_pfn->pfn_sb = pfn_sb; rc = nd_pfn_validate(nd_pfn, DAX_SIG); dev_dbg(dev, "dax: %s\n", rc == 0 ? 
dev_name(dax_dev) : ""); diff --git a/drivers/nvdimm/pfn.h b/drivers/nvdimm/pfn.h index f58b849e455b..dfb2bcda8f5a 100644 --- a/drivers/nvdimm/pfn.h +++ b/drivers/nvdimm/pfn.h @@ -28,6 +28,7 @@ struct nd_pfn_sb { __le32 end_trunc; /* minor-version-2 record the base alignment of the mapping */ __le32 align; + /* minor-version-3 guarantee the padding and flags are zero */ u8 padding[4000]; __le64 checksum; }; diff --git a/drivers/nvdimm/pfn_devs.c b/drivers/nvdimm/pfn_devs.c index 55fb6b7433ed..06f465c0baf3 100644 --- a/drivers/nvdimm/pfn_devs.c +++ b/drivers/nvdimm/pfn_devs.c @@ -412,6 +412,15 @@ static int nd_pfn_clear_memmap_errors(struct nd_pfn *nd_pfn) return 0; } +/** + * nd_pfn_validate - read and validate info-block + * @nd_pfn: fsdax namespace runtime state / properties + * @sig: 'devdax' or 'fsdax' signature + * + * Upon return the info-block buffer contents (->pfn_sb) are + * indeterminate when validation fails, and a coherent info-block + * otherwise. + */ int nd_pfn_validate(struct nd_pfn *nd_pfn, const char *sig) { u64 checksum, offset; @@ -557,7 +566,7 @@ int nd_pfn_probe(struct device *dev, struct nd_namespace_common *ndns) nvdimm_bus_unlock(&ndns->dev); if (!pfn_dev) return -ENOMEM; - pfn_sb = devm_kzalloc(dev, sizeof(*pfn_sb), GFP_KERNEL); + pfn_sb = devm_kmalloc(dev, sizeof(*pfn_sb), GFP_KERNEL); nd_pfn = to_nd_pfn(pfn_dev); nd_pfn->pfn_sb = pfn_sb; rc = nd_pfn_validate(nd_pfn, PFN_SIG); @@ -693,7 +702,7 @@ static int nd_pfn_init(struct nd_pfn *nd_pfn) u64 checksum; int rc; - pfn_sb = devm_kzalloc(&nd_pfn->dev, sizeof(*pfn_sb), GFP_KERNEL); + pfn_sb = devm_kmalloc(&nd_pfn->dev, sizeof(*pfn_sb), GFP_KERNEL); if (!pfn_sb) return -ENOMEM; @@ -702,11 +711,14 @@ static int nd_pfn_init(struct nd_pfn *nd_pfn) sig = DAX_SIG; else sig = PFN_SIG; + rc = nd_pfn_validate(nd_pfn, sig); if (rc != -ENODEV) return rc; /* no info block, do init */; + memset(pfn_sb, 0, sizeof(*pfn_sb)); + nd_region = to_nd_region(nd_pfn->dev.parent); if (nd_region->ro) { 
dev_info(&nd_pfn->dev, @@ -759,7 +771,7 @@ static int nd_pfn_init(struct nd_pfn *nd_pfn) memcpy(pfn_sb->uuid, nd_pfn->uuid, 16); memcpy(pfn_sb->parent_uuid, nd_dev_to_uuid(&ndns->dev), 16); pfn_sb->version_major = cpu_to_le16(1); - pfn_sb->version_minor = cpu_to_le16(2); + pfn_sb->version_minor = cpu_to_le16(3); pfn_sb->start_pad = cpu_to_le32(start_pad); pfn_sb->end_trunc = cpu_to_le32(end_trunc); pfn_sb->align = cpu_to_le32(nd_pfn->align); From a3619190d62ed9d66416891be2416f6bea2b3ca4 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Thu, 18 Jul 2019 15:58:40 -0700 Subject: [PATCH 35/38] libnvdimm/pfn: stop padding pmem namespaces to section alignment MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Now that the mm core supports section-unaligned hotplug of ZONE_DEVICE memory, we no longer need to add padding at pfn/dax device creation time. The kernel will still honor padding established by older kernels. Link: http://lkml.kernel.org/r/156092356588.979959.6793371748950931916.stgit@dwillia2-desk3.amr.corp.intel.com Signed-off-by: Dan Williams Reported-by: Jeff Moyer Tested-by: Aneesh Kumar K.V [ppc64] Cc: David Hildenbrand Cc: Jane Chu Cc: Jérôme Glisse Cc: Jonathan Corbet Cc: Logan Gunthorpe Cc: Michal Hocko Cc: Mike Rapoport Cc: Oscar Salvador Cc: Pavel Tatashin Cc: Toshi Kani Cc: Vlastimil Babka Cc: Wei Yang Cc: Jason Gunthorpe Cc: Christoph Hellwig Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/nvdimm/pfn.h | 14 ------- drivers/nvdimm/pfn_devs.c | 77 +++++++-------------------------------- include/linux/mmzone.h | 3 ++ 3 files changed, 16 insertions(+), 78 deletions(-) diff --git a/drivers/nvdimm/pfn.h b/drivers/nvdimm/pfn.h index dfb2bcda8f5a..7381673b7b70 100644 --- a/drivers/nvdimm/pfn.h +++ b/drivers/nvdimm/pfn.h @@ -33,18 +33,4 @@ struct nd_pfn_sb { __le64 checksum; }; -#ifdef CONFIG_SPARSEMEM -#define PFN_SECTION_ALIGN_DOWN(x) SECTION_ALIGN_DOWN(x) -#define PFN_SECTION_ALIGN_UP(x) 
SECTION_ALIGN_UP(x) -#else -/* - * In this case ZONE_DEVICE=n and we will disable 'pfn' device support, - * but we still want pmem to compile. - */ -#define PFN_SECTION_ALIGN_DOWN(x) (x) -#define PFN_SECTION_ALIGN_UP(x) (x) -#endif - -#define PHYS_SECTION_ALIGN_DOWN(x) PFN_PHYS(PFN_SECTION_ALIGN_DOWN(PHYS_PFN(x))) -#define PHYS_SECTION_ALIGN_UP(x) PFN_PHYS(PFN_SECTION_ALIGN_UP(PHYS_PFN(x))) #endif /* __NVDIMM_PFN_H */ diff --git a/drivers/nvdimm/pfn_devs.c b/drivers/nvdimm/pfn_devs.c index 06f465c0baf3..df2bdbd22450 100644 --- a/drivers/nvdimm/pfn_devs.c +++ b/drivers/nvdimm/pfn_devs.c @@ -587,14 +587,14 @@ static u32 info_block_reserve(void) } /* - * We hotplug memory at section granularity, pad the reserved area from - * the previous section base to the namespace base address. + * We hotplug memory at sub-section granularity, pad the reserved area + * from the previous section base to the namespace base address. */ static unsigned long init_altmap_base(resource_size_t base) { unsigned long base_pfn = PHYS_PFN(base); - return PFN_SECTION_ALIGN_DOWN(base_pfn); + return SUBSECTION_ALIGN_DOWN(base_pfn); } static unsigned long init_altmap_reserve(resource_size_t base) @@ -602,7 +602,7 @@ static unsigned long init_altmap_reserve(resource_size_t base) unsigned long reserve = info_block_reserve() >> PAGE_SHIFT; unsigned long base_pfn = PHYS_PFN(base); - reserve += base_pfn - PFN_SECTION_ALIGN_DOWN(base_pfn); + reserve += base_pfn - SUBSECTION_ALIGN_DOWN(base_pfn); return reserve; } @@ -632,8 +632,7 @@ static int __nvdimm_setup_pfn(struct nd_pfn *nd_pfn, struct dev_pagemap *pgmap) return -EINVAL; nd_pfn->npfns = le64_to_cpu(pfn_sb->npfns); } else if (nd_pfn->mode == PFN_MODE_PMEM) { - nd_pfn->npfns = PFN_SECTION_ALIGN_UP((resource_size(res) - - offset) / PAGE_SIZE); + nd_pfn->npfns = PHYS_PFN((resource_size(res) - offset)); if (le64_to_cpu(nd_pfn->pfn_sb->npfns) > nd_pfn->npfns) dev_info(&nd_pfn->dev, "number of pfns truncated from %lld to %ld\n", @@ -649,54 +648,14 @@ 
static int __nvdimm_setup_pfn(struct nd_pfn *nd_pfn, struct dev_pagemap *pgmap) return 0; } -static u64 phys_pmem_align_down(struct nd_pfn *nd_pfn, u64 phys) -{ - return min_t(u64, PHYS_SECTION_ALIGN_DOWN(phys), - ALIGN_DOWN(phys, nd_pfn->align)); -} - -/* - * Check if pmem collides with 'System RAM', or other regions when - * section aligned. Trim it accordingly. - */ -static void trim_pfn_device(struct nd_pfn *nd_pfn, u32 *start_pad, u32 *end_trunc) -{ - struct nd_namespace_common *ndns = nd_pfn->ndns; - struct nd_namespace_io *nsio = to_nd_namespace_io(&ndns->dev); - struct nd_region *nd_region = to_nd_region(nd_pfn->dev.parent); - const resource_size_t start = nsio->res.start; - const resource_size_t end = start + resource_size(&nsio->res); - resource_size_t adjust, size; - - *start_pad = 0; - *end_trunc = 0; - - adjust = start - PHYS_SECTION_ALIGN_DOWN(start); - size = resource_size(&nsio->res) + adjust; - if (region_intersects(start - adjust, size, IORESOURCE_SYSTEM_RAM, - IORES_DESC_NONE) == REGION_MIXED - || nd_region_conflict(nd_region, start - adjust, size)) - *start_pad = PHYS_SECTION_ALIGN_UP(start) - start; - - /* Now check that end of the range does not collide. 
*/ - adjust = PHYS_SECTION_ALIGN_UP(end) - end; - size = resource_size(&nsio->res) + adjust; - if (region_intersects(start, size, IORESOURCE_SYSTEM_RAM, - IORES_DESC_NONE) == REGION_MIXED - || !IS_ALIGNED(end, nd_pfn->align) - || nd_region_conflict(nd_region, start, size)) - *end_trunc = end - phys_pmem_align_down(nd_pfn, end); -} - static int nd_pfn_init(struct nd_pfn *nd_pfn) { struct nd_namespace_common *ndns = nd_pfn->ndns; struct nd_namespace_io *nsio = to_nd_namespace_io(&ndns->dev); - u32 start_pad, end_trunc, reserve = info_block_reserve(); resource_size_t start, size; struct nd_region *nd_region; + unsigned long npfns, align; struct nd_pfn_sb *pfn_sb; - unsigned long npfns; phys_addr_t offset; const char *sig; u64 checksum; @@ -727,43 +686,35 @@ static int nd_pfn_init(struct nd_pfn *nd_pfn) return -ENXIO; } - memset(pfn_sb, 0, sizeof(*pfn_sb)); - - trim_pfn_device(nd_pfn, &start_pad, &end_trunc); - if (start_pad + end_trunc) - dev_info(&nd_pfn->dev, "%s alignment collision, truncate %d bytes\n", - dev_name(&ndns->dev), start_pad + end_trunc); - /* * Note, we use 64 here for the standard size of struct page, * debugging options may cause it to be larger in which case the * implementation will limit the pfns advertised through * ->direct_access() to those that are included in the memmap. */ - start = nsio->res.start + start_pad; + start = nsio->res.start; size = resource_size(&nsio->res); - npfns = PFN_SECTION_ALIGN_UP((size - start_pad - end_trunc - reserve) - / PAGE_SIZE); + npfns = PHYS_PFN(size - SZ_8K); + align = max(nd_pfn->align, (1UL << SUBSECTION_SHIFT)); if (nd_pfn->mode == PFN_MODE_PMEM) { /* * The altmap should be padded out to the block size used * when populating the vmemmap. This *should* be equal to * PMD_SIZE for most architectures. 
*/ - offset = ALIGN(start + reserve + 64 * npfns, - max(nd_pfn->align, PMD_SIZE)) - start; + offset = ALIGN(start + SZ_8K + 64 * npfns, align) - start; } else if (nd_pfn->mode == PFN_MODE_RAM) - offset = ALIGN(start + reserve, nd_pfn->align) - start; + offset = ALIGN(start + SZ_8K, align) - start; else return -ENXIO; - if (offset + start_pad + end_trunc >= size) { + if (offset >= size) { dev_err(&nd_pfn->dev, "%s unable to satisfy requested alignment\n", dev_name(&ndns->dev)); return -ENXIO; } - npfns = (size - offset - start_pad - end_trunc) / SZ_4K; + npfns = PHYS_PFN(size - offset); pfn_sb->mode = cpu_to_le32(nd_pfn->mode); pfn_sb->dataoff = cpu_to_le64(offset); pfn_sb->npfns = cpu_to_le64(npfns); @@ -772,8 +723,6 @@ static int nd_pfn_init(struct nd_pfn *nd_pfn) memcpy(pfn_sb->parent_uuid, nd_dev_to_uuid(&ndns->dev), 16); pfn_sb->version_major = cpu_to_le16(1); pfn_sb->version_minor = cpu_to_le16(3); - pfn_sb->start_pad = cpu_to_le32(start_pad); - pfn_sb->end_trunc = cpu_to_le32(end_trunc); pfn_sb->align = cpu_to_le32(nd_pfn->align); checksum = nd_sb_checksum((struct nd_gen_sb *) pfn_sb); pfn_sb->checksum = cpu_to_le64(checksum); diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 8331e76677c0..d77d717c620c 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -1160,6 +1160,9 @@ static inline unsigned long section_nr_to_pfn(unsigned long sec) #define SUBSECTIONS_PER_SECTION (1UL << (SECTION_SIZE_BITS - SUBSECTION_SHIFT)) #endif +#define SUBSECTION_ALIGN_UP(pfn) ALIGN((pfn), PAGES_PER_SUBSECTION) +#define SUBSECTION_ALIGN_DOWN(pfn) ((pfn) & PAGE_SUBSECTION_MASK) + struct mem_section_usage { DECLARE_BITMAP(subsection_map, SUBSECTIONS_PER_SECTION); /* See declaration of similar field in struct zone */ From 9a845030427c7a2879a7d635cc7c0e5f79ec962d Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Thu, 18 Jul 2019 15:58:43 -0700 Subject: [PATCH 36/38] mm/sparsemem: cleanup 'section number' data types David points out that there is a 
mixture of 'int' and 'unsigned long' usage for section number data types. Update the memory hotplug path to use 'unsigned long' consistently for section numbers. [akpm@linux-foundation.org: fix printk format] Link: http://lkml.kernel.org/r/156107543656.1329419.11505835211949439815.stgit@dwillia2-desk3.amr.corp.intel.com Signed-off-by: Dan Williams Reported-by: David Hildenbrand Reviewed-by: David Hildenbrand Cc: Michal Hocko Cc: Oscar Salvador Cc: Jason Gunthorpe Cc: Christoph Hellwig Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory_hotplug.c | 10 +++++----- mm/sparse.c | 8 ++++---- 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index aafb71594ee3..2a9bbddb0e55 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -288,8 +288,8 @@ static int check_pfn_span(unsigned long pfn, unsigned long nr_pages, int __ref __add_pages(int nid, unsigned long pfn, unsigned long nr_pages, struct mhp_restrictions *restrictions) { - unsigned long i; - int start_sec, end_sec, err; + int err; + unsigned long nr, start_sec, end_sec; struct vmem_altmap *altmap = restrictions->altmap; if (altmap) { @@ -310,7 +310,7 @@ int __ref __add_pages(int nid, unsigned long pfn, unsigned long nr_pages, start_sec = pfn_to_section_nr(pfn); end_sec = pfn_to_section_nr(pfn + nr_pages - 1); - for (i = start_sec; i <= end_sec; i++) { + for (nr = start_sec; nr <= end_sec; nr++) { unsigned long pfns; pfns = min(nr_pages, PAGES_PER_SECTION @@ -541,7 +541,7 @@ void __remove_pages(struct zone *zone, unsigned long pfn, unsigned long nr_pages, struct vmem_altmap *altmap) { unsigned long map_offset = 0; - int i, start_sec, end_sec; + unsigned long nr, start_sec, end_sec; map_offset = vmem_altmap_offset(altmap); @@ -552,7 +552,7 @@ void __remove_pages(struct zone *zone, unsigned long pfn, start_sec = pfn_to_section_nr(pfn); end_sec = pfn_to_section_nr(pfn + nr_pages - 1); - for (i = start_sec; i <= end_sec; i++) { + for (nr = 
start_sec; nr <= end_sec; nr++) { unsigned long pfns; cond_resched(); diff --git a/mm/sparse.c b/mm/sparse.c index a205a2ac66a4..72f010d9bff5 100644 --- a/mm/sparse.c +++ b/mm/sparse.c @@ -229,21 +229,21 @@ void subsection_mask_set(unsigned long *map, unsigned long pfn, void __init subsection_map_init(unsigned long pfn, unsigned long nr_pages) { int end_sec = pfn_to_section_nr(pfn + nr_pages - 1); - int i, start_sec = pfn_to_section_nr(pfn); + unsigned long nr, start_sec = pfn_to_section_nr(pfn); if (!nr_pages) return; - for (i = start_sec; i <= end_sec; i++) { + for (nr = start_sec; nr <= end_sec; nr++) { struct mem_section *ms; unsigned long pfns; pfns = min(nr_pages, PAGES_PER_SECTION - (pfn & ~PAGE_SECTION_MASK)); - ms = __nr_to_section(i); + ms = __nr_to_section(nr); subsection_mask_set(ms->usage->subsection_map, pfn, pfns); - pr_debug("%s: sec: %d pfns: %ld set(%d, %d)\n", __func__, i, + pr_debug("%s: sec: %lu pfns: %lu set(%d, %d)\n", __func__, nr, pfns, subsection_map_index(pfn), subsection_map_index(pfn + pfns - 1)); From 371096949f0ad3950b06729989bd27de51b8c5f5 Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Thu, 18 Jul 2019 15:58:46 -0700 Subject: [PATCH 37/38] mm: migrate: remove unused mode argument migrate_page_move_mapping() doesn't use the mode argument. Remove it and update callers accordingly. 
Link: http://lkml.kernel.org/r/20190508210301.8472-1-keith.busch@intel.com Signed-off-by: Keith Busch Reviewed-by: Zi Yan Cc: Mel Gorman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/aio.c | 2 +- fs/f2fs/data.c | 2 +- fs/iomap.c | 2 +- fs/ubifs/file.c | 2 +- include/linux/migrate.h | 3 +-- mm/migrate.c | 7 +++---- 6 files changed, 8 insertions(+), 10 deletions(-) diff --git a/fs/aio.c b/fs/aio.c index 8327db0c8e08..8b3aa2739906 100644 --- a/fs/aio.c +++ b/fs/aio.c @@ -425,7 +425,7 @@ static int aio_migratepage(struct address_space *mapping, struct page *new, BUG_ON(PageWriteback(old)); get_page(new); - rc = migrate_page_move_mapping(mapping, new, old, mode, 1); + rc = migrate_page_move_mapping(mapping, new, old, 1); if (rc != MIGRATEPAGE_SUCCESS) { put_page(new); goto out_unlock; diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 4eb2f3920140..abbf14e9bd72 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -2919,7 +2919,7 @@ int f2fs_migrate_page(struct address_space *mapping, /* one extra reference was held for atomic_write page */ extra_count = atomic_written ? 
1 : 0; rc = migrate_page_move_mapping(mapping, newpage, - page, mode, extra_count); + page, extra_count); if (rc != MIGRATEPAGE_SUCCESS) { if (atomic_written) mutex_unlock(&fi->inmem_lock); diff --git a/fs/iomap.c b/fs/iomap.c index 217c3e5a13d6..3e7f16a05653 100644 --- a/fs/iomap.c +++ b/fs/iomap.c @@ -566,7 +566,7 @@ iomap_migrate_page(struct address_space *mapping, struct page *newpage, { int ret; - ret = migrate_page_move_mapping(mapping, newpage, page, mode, 0); + ret = migrate_page_move_mapping(mapping, newpage, page, 0); if (ret != MIGRATEPAGE_SUCCESS) return ret; diff --git a/fs/ubifs/file.c b/fs/ubifs/file.c index e5f8de62fc51..400970d740bb 100644 --- a/fs/ubifs/file.c +++ b/fs/ubifs/file.c @@ -1470,7 +1470,7 @@ static int ubifs_migrate_page(struct address_space *mapping, { int rc; - rc = migrate_page_move_mapping(mapping, newpage, page, mode, 0); + rc = migrate_page_move_mapping(mapping, newpage, page, 0); if (rc != MIGRATEPAGE_SUCCESS) return rc; diff --git a/include/linux/migrate.h b/include/linux/migrate.h index e13d9bf2f9a5..7f04754c7f2b 100644 --- a/include/linux/migrate.h +++ b/include/linux/migrate.h @@ -77,8 +77,7 @@ extern void migrate_page_copy(struct page *newpage, struct page *page); extern int migrate_huge_page_move_mapping(struct address_space *mapping, struct page *newpage, struct page *page); extern int migrate_page_move_mapping(struct address_space *mapping, - struct page *newpage, struct page *page, enum migrate_mode mode, - int extra_count); + struct page *newpage, struct page *page, int extra_count); #else static inline void putback_movable_pages(struct list_head *l) {} diff --git a/mm/migrate.c b/mm/migrate.c index 3445747e229d..8992741f10aa 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -394,8 +394,7 @@ static int expected_page_refs(struct address_space *mapping, struct page *page) * 3 for pages with a mapping and PagePrivate/PagePrivate2 set. 
*/ int migrate_page_move_mapping(struct address_space *mapping, - struct page *newpage, struct page *page, enum migrate_mode mode, - int extra_count) + struct page *newpage, struct page *page, int extra_count) { XA_STATE(xas, &mapping->i_pages, page_index(page)); struct zone *oldzone, *newzone; @@ -681,7 +680,7 @@ int migrate_page(struct address_space *mapping, BUG_ON(PageWriteback(page)); /* Writeback must be complete */ - rc = migrate_page_move_mapping(mapping, newpage, page, mode, 0); + rc = migrate_page_move_mapping(mapping, newpage, page, 0); if (rc != MIGRATEPAGE_SUCCESS) return rc; @@ -780,7 +779,7 @@ recheck_buffers: } } - rc = migrate_page_move_mapping(mapping, newpage, page, mode, 0); + rc = migrate_page_move_mapping(mapping, newpage, page, 0); if (rc != MIGRATEPAGE_SUCCESS) goto unlock_buffers; From eec4844fae7c033a0c1fc1eb3b8517aeb8b6cc49 Mon Sep 17 00:00:00 2001 From: Matteo Croce Date: Thu, 18 Jul 2019 15:58:50 -0700 Subject: [PATCH 38/38] proc/sysctl: add shared variables for range check In the sysctl code the proc_dointvec_minmax() function is often used to validate the user supplied value between an allowed range. This function uses the extra1 and extra2 members from struct ctl_table as minimum and maximum allowed value. On sysctl handler declaration, in every source file there are some readonly variables containing just an integer whose address is assigned to the extra1 and extra2 members, so the sysctl range is enforced. The special values 0, 1 and INT_MAX are very often used as range boundaries, leading to duplication of variables like zero=0, one=1, int_max=INT_MAX in different source files: $ git grep -E '\.extra[12].*&(zero|one|int_max)' |wc -l 248 Add a const int array containing the most commonly used values, some macros to refer more easily to the correct array member, and use them instead of creating a local one for every object file.
This is the bloat-o-meter output comparing the old and new binary compiled with the default Fedora config: # scripts/bloat-o-meter -d vmlinux.o.old vmlinux.o add/remove: 2/2 grow/shrink: 0/2 up/down: 24/-188 (-164) Data old new delta sysctl_vals - 12 +12 __kstrtab_sysctl_vals - 12 +12 max 14 10 -4 int_max 16 - -16 one 68 - -68 zero 128 28 -100 Total: Before=20583249, After=20583085, chg -0.00% [mcroce@redhat.com: tipc: remove two unused variables] Link: http://lkml.kernel.org/r/20190530091952.4108-1-mcroce@redhat.com [akpm@linux-foundation.org: fix net/ipv6/sysctl_net_ipv6.c] [arnd@arndb.de: proc/sysctl: make firmware loader table conditional] Link: http://lkml.kernel.org/r/20190617130014.1713870-1-arnd@arndb.de [akpm@linux-foundation.org: fix fs/eventpoll.c] Link: http://lkml.kernel.org/r/20190430180111.10688-1-mcroce@redhat.com Signed-off-by: Matteo Croce Signed-off-by: Arnd Bergmann Acked-by: Kees Cook Reviewed-by: Aaron Tomlin Cc: Matthew Wilcox Cc: Stephen Rothwell Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/s390/appldata/appldata_base.c | 15 +- arch/s390/kernel/topology.c | 6 +- arch/x86/entry/vdso/vdso32-setup.c | 7 +- arch/x86/kernel/itmt.c | 6 +- drivers/base/firmware_loader/fallback_table.c | 13 +- drivers/gpu/drm/i915/i915_perf.c | 8 +- drivers/hv/vmbus_drv.c | 6 +- drivers/tty/tty_ldisc.c | 6 +- drivers/xen/balloon.c | 7 +- fs/eventpoll.c | 4 +- fs/notify/inotify/inotify_user.c | 8 +- fs/proc/proc_sysctl.c | 4 + include/linux/sysctl.h | 7 + ipc/ipc_sysctl.c | 35 ++-- kernel/pid_namespace.c | 3 +- kernel/sysctl.c | 197 +++++++++--------- kernel/ucount.c | 6 +- net/core/neighbour.c | 20 +- net/core/sysctl_net_core.c | 34 ++- net/dccp/sysctl.c | 16 +- net/ipv4/sysctl_net_ipv4.c | 60 +++--- net/ipv6/addrconf.c | 6 +- net/ipv6/route.c | 7 +- net/ipv6/sysctl_net_ipv6.c | 10 +- net/mpls/af_mpls.c | 10 +- net/netfilter/ipvs/ip_vs_ctl.c | 3 +- net/rxrpc/sysctl.c | 9 +- net/sctp/sysctl.c | 35 ++-- net/sunrpc/xprtrdma/transport.c | 3 +- 
net/tipc/sysctl.c | 6 +- security/keys/sysctl.c | 26 ++- security/loadpin/loadpin.c | 6 +- security/yama/yama_lsm.c | 3 +- 33 files changed, 270 insertions(+), 322 deletions(-) diff --git a/arch/s390/appldata/appldata_base.c b/arch/s390/appldata/appldata_base.c index e4b58240ec53..aa738cad1338 100644 --- a/arch/s390/appldata/appldata_base.c +++ b/arch/s390/appldata/appldata_base.c @@ -220,15 +220,13 @@ appldata_timer_handler(struct ctl_table *ctl, int write, void __user *buffer, size_t *lenp, loff_t *ppos) { int timer_active = appldata_timer_active; - int zero = 0; - int one = 1; int rc; struct ctl_table ctl_entry = { .procname = ctl->procname, .data = &timer_active, .maxlen = sizeof(int), - .extra1 = &zero, - .extra2 = &one, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, }; rc = proc_douintvec_minmax(&ctl_entry, write, buffer, lenp, ppos); @@ -255,13 +253,12 @@ appldata_interval_handler(struct ctl_table *ctl, int write, void __user *buffer, size_t *lenp, loff_t *ppos) { int interval = appldata_interval; - int one = 1; int rc; struct ctl_table ctl_entry = { .procname = ctl->procname, .data = &interval, .maxlen = sizeof(int), - .extra1 = &one, + .extra1 = SYSCTL_ONE, }; rc = proc_dointvec_minmax(&ctl_entry, write, buffer, lenp, ppos); @@ -289,13 +286,11 @@ appldata_generic_handler(struct ctl_table *ctl, int write, struct list_head *lh; int rc, found; int active; - int zero = 0; - int one = 1; struct ctl_table ctl_entry = { .data = &active, .maxlen = sizeof(int), - .extra1 = &zero, - .extra2 = &one, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, }; found = 0; diff --git a/arch/s390/kernel/topology.c b/arch/s390/kernel/topology.c index 8964a3f60aad..2db6fb405a9a 100644 --- a/arch/s390/kernel/topology.c +++ b/arch/s390/kernel/topology.c @@ -587,15 +587,13 @@ static int topology_ctl_handler(struct ctl_table *ctl, int write, { int enabled = topology_is_enabled(); int new_mode; - int zero = 0; - int one = 1; int rc; struct ctl_table ctl_entry = { .procname = 
ctl->procname, .data = &enabled, .maxlen = sizeof(int), - .extra1 = &zero, - .extra2 = &one, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, }; rc = proc_douintvec_minmax(&ctl_entry, write, buffer, lenp, ppos); diff --git a/arch/x86/entry/vdso/vdso32-setup.c b/arch/x86/entry/vdso/vdso32-setup.c index 42d4c89f990e..240626e7f55a 100644 --- a/arch/x86/entry/vdso/vdso32-setup.c +++ b/arch/x86/entry/vdso/vdso32-setup.c @@ -65,9 +65,6 @@ subsys_initcall(sysenter_setup); /* Register vsyscall32 into the ABI table */ #include -static const int zero; -static const int one = 1; - static struct ctl_table abi_table2[] = { { .procname = "vsyscall32", @@ -75,8 +72,8 @@ static struct ctl_table abi_table2[] = { .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_minmax, - .extra1 = (int *)&zero, - .extra2 = (int *)&one, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, }, {} }; diff --git a/arch/x86/kernel/itmt.c b/arch/x86/kernel/itmt.c index 838cf8a32c49..1cb3ca9bba49 100644 --- a/arch/x86/kernel/itmt.c +++ b/arch/x86/kernel/itmt.c @@ -65,8 +65,6 @@ static int sched_itmt_update_handler(struct ctl_table *table, int write, return ret; } -static unsigned int zero; -static unsigned int one = 1; static struct ctl_table itmt_kern_table[] = { { .procname = "sched_itmt_enabled", @@ -74,8 +72,8 @@ static struct ctl_table itmt_kern_table[] = { .maxlen = sizeof(unsigned int), .mode = 0644, .proc_handler = sched_itmt_update_handler, - .extra1 = &zero, - .extra2 = &one, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, }, {} }; diff --git a/drivers/base/firmware_loader/fallback_table.c b/drivers/base/firmware_loader/fallback_table.c index 776dd69cf5be..ba9d30b28edc 100644 --- a/drivers/base/firmware_loader/fallback_table.c +++ b/drivers/base/firmware_loader/fallback_table.c @@ -16,9 +16,6 @@ * firmware fallback configuration table */ -static unsigned int zero; -static unsigned int one = 1; - struct firmware_fallback_config fw_fallback_config = { .force_sysfs_fallback = 
IS_ENABLED(CONFIG_FW_LOADER_USER_HELPER_FALLBACK), .loading_timeout = 60, @@ -26,6 +23,7 @@ struct firmware_fallback_config fw_fallback_config = { }; EXPORT_SYMBOL_GPL(fw_fallback_config); +#ifdef CONFIG_SYSCTL struct ctl_table firmware_config_table[] = { { .procname = "force_sysfs_fallback", @@ -33,8 +31,8 @@ struct ctl_table firmware_config_table[] = { .maxlen = sizeof(unsigned int), .mode = 0644, .proc_handler = proc_douintvec_minmax, - .extra1 = &zero, - .extra2 = &one, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, }, { .procname = "ignore_sysfs_fallback", @@ -42,9 +40,10 @@ struct ctl_table firmware_config_table[] = { .maxlen = sizeof(unsigned int), .mode = 0644, .proc_handler = proc_douintvec_minmax, - .extra1 = &zero, - .extra2 = &one, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, }, { } }; EXPORT_SYMBOL_GPL(firmware_config_table); +#endif diff --git a/drivers/gpu/drm/i915/i915_perf.c b/drivers/gpu/drm/i915/i915_perf.c index 3d8162d28730..a700c5c3d167 100644 --- a/drivers/gpu/drm/i915/i915_perf.c +++ b/drivers/gpu/drm/i915/i915_perf.c @@ -274,8 +274,6 @@ #define POLL_PERIOD (NSEC_PER_SEC / POLL_FREQUENCY) /* for sysctl proc_dointvec_minmax of dev.i915.perf_stream_paranoid */ -static int zero; -static int one = 1; static u32 i915_perf_stream_paranoid = true; /* The maximum exponent the hardware accepts is 63 (essentially it selects one @@ -3366,8 +3364,8 @@ static struct ctl_table oa_table[] = { .maxlen = sizeof(i915_perf_stream_paranoid), .mode = 0644, .proc_handler = proc_dointvec_minmax, - .extra1 = &zero, - .extra2 = &one, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, }, { .procname = "oa_max_sample_rate", @@ -3375,7 +3373,7 @@ static struct ctl_table oa_table[] = { .maxlen = sizeof(i915_oa_max_sample_rate), .mode = 0644, .proc_handler = proc_dointvec_minmax, - .extra1 = &zero, + .extra1 = SYSCTL_ZERO, .extra2 = &oa_sample_rate_hard_limit, }, {} diff --git a/drivers/hv/vmbus_drv.c b/drivers/hv/vmbus_drv.c index 894da5abdc55..ebd35fc35290 100644 
--- a/drivers/hv/vmbus_drv.c +++ b/drivers/hv/vmbus_drv.c @@ -1197,8 +1197,6 @@ static struct kmsg_dumper hv_kmsg_dumper = { }; static struct ctl_table_header *hv_ctl_table_hdr; -static int zero; -static int one = 1; /* * sysctl option to allow the user to control whether kmsg data should be @@ -1211,8 +1209,8 @@ static struct ctl_table hv_ctl_table[] = { .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_minmax, - .extra1 = &zero, - .extra2 = &one + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE }, {} }; diff --git a/drivers/tty/tty_ldisc.c b/drivers/tty/tty_ldisc.c index fde8d4073e74..4c49f53afa3e 100644 --- a/drivers/tty/tty_ldisc.c +++ b/drivers/tty/tty_ldisc.c @@ -855,8 +855,6 @@ void tty_ldisc_deinit(struct tty_struct *tty) tty->ldisc = NULL; } -static int zero; -static int one = 1; static struct ctl_table tty_table[] = { { .procname = "ldisc_autoload", @@ -864,8 +862,8 @@ static struct ctl_table tty_table[] = { .maxlen = sizeof(tty_ldisc_autoload), .mode = 0644, .proc_handler = proc_dointvec, - .extra1 = &zero, - .extra2 = &one, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, }, { } }; diff --git a/drivers/xen/balloon.c b/drivers/xen/balloon.c index d37dd5bb7a8f..37a36c6b9f93 100644 --- a/drivers/xen/balloon.c +++ b/drivers/xen/balloon.c @@ -77,9 +77,6 @@ static int xen_hotplug_unpopulated; #ifdef CONFIG_XEN_BALLOON_MEMORY_HOTPLUG -static int zero; -static int one = 1; - static struct ctl_table balloon_table[] = { { .procname = "hotplug_unpopulated", @@ -87,8 +84,8 @@ static struct ctl_table balloon_table[] = { .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_minmax, - .extra1 = &zero, - .extra2 = &one, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, }, { } }; diff --git a/fs/eventpoll.c b/fs/eventpoll.c index 0f9c073d78d5..d7f1f5011fac 100644 --- a/fs/eventpoll.c +++ b/fs/eventpoll.c @@ -291,7 +291,7 @@ static LIST_HEAD(tfile_check_list); #include -static long zero; +static long long_zero; static long long_max = LONG_MAX; 
struct ctl_table epoll_table[] = { @@ -301,7 +301,7 @@ struct ctl_table epoll_table[] = { .maxlen = sizeof(max_user_watches), .mode = 0644, .proc_handler = proc_doulongvec_minmax, - .extra1 = &zero, + .extra1 = &long_zero, .extra2 = &long_max, }, { } diff --git a/fs/notify/inotify/inotify_user.c b/fs/notify/inotify/inotify_user.c index cce8de32779f..0b815178126e 100644 --- a/fs/notify/inotify/inotify_user.c +++ b/fs/notify/inotify/inotify_user.c @@ -45,8 +45,6 @@ struct kmem_cache *inotify_inode_mark_cachep __read_mostly; #include -static int zero; - struct ctl_table inotify_table[] = { { .procname = "max_user_instances", @@ -54,7 +52,7 @@ struct ctl_table inotify_table[] = { .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_minmax, - .extra1 = &zero, + .extra1 = SYSCTL_ZERO, }, { .procname = "max_user_watches", @@ -62,7 +60,7 @@ struct ctl_table inotify_table[] = { .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_minmax, - .extra1 = &zero, + .extra1 = SYSCTL_ZERO, }, { .procname = "max_queued_events", @@ -70,7 +68,7 @@ struct ctl_table inotify_table[] = { .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_minmax, - .extra1 = &zero + .extra1 = SYSCTL_ZERO }, { } }; diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c index 36ad1b0d6259..d80989b6c344 100644 --- a/fs/proc/proc_sysctl.c +++ b/fs/proc/proc_sysctl.c @@ -22,6 +22,10 @@ static const struct inode_operations proc_sys_inode_operations; static const struct file_operations proc_sys_dir_file_operations; static const struct inode_operations proc_sys_dir_operations; +/* shared constants to be used in various sysctls */ +const int sysctl_vals[] = { 0, 1, INT_MAX }; +EXPORT_SYMBOL(sysctl_vals); + /* Support for permanently empty directories */ struct ctl_table sysctl_mount_point[] = { diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h index aadd310769d0..6df477329b76 100644 --- a/include/linux/sysctl.h +++ b/include/linux/sysctl.h @@ -37,6 +37,13 @@ 
struct ctl_table_root; struct ctl_table_header; struct ctl_dir; +/* Keep the same order as in fs/proc/proc_sysctl.c */ +#define SYSCTL_ZERO ((void *)&sysctl_vals[0]) +#define SYSCTL_ONE ((void *)&sysctl_vals[1]) +#define SYSCTL_INT_MAX ((void *)&sysctl_vals[2]) + +extern const int sysctl_vals[]; + typedef int proc_handler (struct ctl_table *ctl, int write, void __user *buffer, size_t *lenp, loff_t *ppos); diff --git a/ipc/ipc_sysctl.c b/ipc/ipc_sysctl.c index 2b14ce8ce73f..affd66537e87 100644 --- a/ipc/ipc_sysctl.c +++ b/ipc/ipc_sysctl.c @@ -113,9 +113,6 @@ static int proc_ipc_sem_dointvec(struct ctl_table *table, int write, #define proc_ipc_sem_dointvec NULL #endif -static int zero; -static int one = 1; -static int int_max = INT_MAX; int ipc_mni = IPCMNI; int ipc_mni_shift = IPCMNI_SHIFT; int ipc_min_cycle = RADIX_TREE_MAP_SIZE; @@ -141,7 +138,7 @@ static struct ctl_table ipc_kern_table[] = { .maxlen = sizeof(init_ipc_ns.shm_ctlmni), .mode = 0644, .proc_handler = proc_ipc_dointvec_minmax, - .extra1 = &zero, + .extra1 = SYSCTL_ZERO, .extra2 = &ipc_mni, }, { @@ -150,8 +147,8 @@ static struct ctl_table ipc_kern_table[] = { .maxlen = sizeof(init_ipc_ns.shm_rmid_forced), .mode = 0644, .proc_handler = proc_ipc_dointvec_minmax_orphans, - .extra1 = &zero, - .extra2 = &one, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, }, { .procname = "msgmax", @@ -159,8 +156,8 @@ static struct ctl_table ipc_kern_table[] = { .maxlen = sizeof(init_ipc_ns.msg_ctlmax), .mode = 0644, .proc_handler = proc_ipc_dointvec_minmax, - .extra1 = &zero, - .extra2 = &int_max, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_INT_MAX, }, { .procname = "msgmni", @@ -168,7 +165,7 @@ static struct ctl_table ipc_kern_table[] = { .maxlen = sizeof(init_ipc_ns.msg_ctlmni), .mode = 0644, .proc_handler = proc_ipc_dointvec_minmax, - .extra1 = &zero, + .extra1 = SYSCTL_ZERO, .extra2 = &ipc_mni, }, { @@ -177,8 +174,8 @@ static struct ctl_table ipc_kern_table[] = { .maxlen = sizeof(int), .mode = 0644, .proc_handler = 
proc_ipc_auto_msgmni, - .extra1 = &zero, - .extra2 = &one, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, }, { .procname = "msgmnb", @@ -186,8 +183,8 @@ static struct ctl_table ipc_kern_table[] = { .maxlen = sizeof(init_ipc_ns.msg_ctlmnb), .mode = 0644, .proc_handler = proc_ipc_dointvec_minmax, - .extra1 = &zero, - .extra2 = &int_max, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_INT_MAX, }, { .procname = "sem", @@ -203,8 +200,8 @@ static struct ctl_table ipc_kern_table[] = { .maxlen = sizeof(init_ipc_ns.ids[IPC_SEM_IDS].next_id), .mode = 0644, .proc_handler = proc_ipc_dointvec_minmax, - .extra1 = &zero, - .extra2 = &int_max, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_INT_MAX, }, { .procname = "msg_next_id", @@ -212,8 +209,8 @@ static struct ctl_table ipc_kern_table[] = { .maxlen = sizeof(init_ipc_ns.ids[IPC_MSG_IDS].next_id), .mode = 0644, .proc_handler = proc_ipc_dointvec_minmax, - .extra1 = &zero, - .extra2 = &int_max, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_INT_MAX, }, { .procname = "shm_next_id", @@ -221,8 +218,8 @@ static struct ctl_table ipc_kern_table[] = { .maxlen = sizeof(init_ipc_ns.ids[IPC_SHM_IDS].next_id), .mode = 0644, .proc_handler = proc_ipc_dointvec_minmax, - .extra1 = &zero, - .extra2 = &int_max, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_INT_MAX, }, #endif {} diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c index 6d726cef241c..a6a79f85c81a 100644 --- a/kernel/pid_namespace.c +++ b/kernel/pid_namespace.c @@ -291,14 +291,13 @@ static int pid_ns_ctl_handler(struct ctl_table *table, int write, } extern int pid_max; -static int zero = 0; static struct ctl_table pid_ns_ctl_table[] = { { .procname = "ns_last_pid", .maxlen = sizeof(int), .mode = 0666, /* permissions are checked in the handler */ .proc_handler = pid_ns_ctl_handler, - .extra1 = &zero, + .extra1 = SYSCTL_ZERO, .extra2 = &pid_max, }, { } diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 43186ccfa139..078950d9605b 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ 
-125,9 +125,6 @@ static int sixty = 60; #endif static int __maybe_unused neg_one = -1; - -static int zero; -static int __maybe_unused one = 1; static int __maybe_unused two = 2; static int __maybe_unused four = 4; static unsigned long zero_ul; @@ -385,8 +382,8 @@ static struct ctl_table kern_table[] = { .maxlen = sizeof(unsigned int), .mode = 0644, .proc_handler = sysctl_schedstats, - .extra1 = &zero, - .extra2 = &one, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, }, #endif /* CONFIG_SCHEDSTATS */ #endif /* CONFIG_SMP */ @@ -418,7 +415,7 @@ static struct ctl_table kern_table[] = { .maxlen = sizeof(unsigned int), .mode = 0644, .proc_handler = proc_dointvec_minmax, - .extra1 = &one, + .extra1 = SYSCTL_ONE, }, { .procname = "numa_balancing", @@ -426,8 +423,8 @@ static struct ctl_table kern_table[] = { .maxlen = sizeof(unsigned int), .mode = 0644, .proc_handler = sysctl_numa_balancing, - .extra1 = &zero, - .extra2 = &one, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, }, #endif /* CONFIG_NUMA_BALANCING */ #endif /* CONFIG_SCHED_DEBUG */ @@ -475,8 +472,8 @@ static struct ctl_table kern_table[] = { .maxlen = sizeof(unsigned int), .mode = 0644, .proc_handler = proc_dointvec_minmax, - .extra1 = &zero, - .extra2 = &one, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, }, #endif #ifdef CONFIG_CFS_BANDWIDTH @@ -486,7 +483,7 @@ static struct ctl_table kern_table[] = { .maxlen = sizeof(unsigned int), .mode = 0644, .proc_handler = proc_dointvec_minmax, - .extra1 = &one, + .extra1 = SYSCTL_ONE, }, #endif #if defined(CONFIG_ENERGY_MODEL) && defined(CONFIG_CPU_FREQ_GOV_SCHEDUTIL) @@ -496,8 +493,8 @@ static struct ctl_table kern_table[] = { .maxlen = sizeof(unsigned int), .mode = 0644, .proc_handler = sched_energy_aware_handler, - .extra1 = &zero, - .extra2 = &one, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, }, #endif #ifdef CONFIG_PROVE_LOCKING @@ -562,7 +559,7 @@ static struct ctl_table kern_table[] = { .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = 
&neg_one, - .extra2 = &one, + .extra2 = SYSCTL_ONE, }, #endif #ifdef CONFIG_LATENCYTOP @@ -696,8 +693,8 @@ static struct ctl_table kern_table[] = { .mode = 0644, /* only handle a transition from default "0" to "1" */ .proc_handler = proc_dointvec_minmax, - .extra1 = &one, - .extra2 = &one, + .extra1 = SYSCTL_ONE, + .extra2 = SYSCTL_ONE, }, #endif #ifdef CONFIG_MODULES @@ -715,8 +712,8 @@ static struct ctl_table kern_table[] = { .mode = 0644, /* only handle a transition from default "0" to "1" */ .proc_handler = proc_dointvec_minmax, - .extra1 = &one, - .extra2 = &one, + .extra1 = SYSCTL_ONE, + .extra2 = SYSCTL_ONE, }, #endif #ifdef CONFIG_UEVENT_HELPER @@ -875,7 +872,7 @@ static struct ctl_table kern_table[] = { .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_minmax, - .extra1 = &zero, + .extra1 = SYSCTL_ZERO, .extra2 = &ten_thousand, }, { @@ -891,8 +888,8 @@ static struct ctl_table kern_table[] = { .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_minmax_sysadmin, - .extra1 = &zero, - .extra2 = &one, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, }, { .procname = "kptr_restrict", @@ -900,7 +897,7 @@ static struct ctl_table kern_table[] = { .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_minmax_sysadmin, - .extra1 = &zero, + .extra1 = SYSCTL_ZERO, .extra2 = &two, }, #endif @@ -925,8 +922,8 @@ static struct ctl_table kern_table[] = { .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_watchdog, - .extra1 = &zero, - .extra2 = &one, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, }, { .procname = "watchdog_thresh", @@ -934,7 +931,7 @@ static struct ctl_table kern_table[] = { .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_watchdog_thresh, - .extra1 = &zero, + .extra1 = SYSCTL_ZERO, .extra2 = &sixty, }, { @@ -943,8 +940,8 @@ static struct ctl_table kern_table[] = { .maxlen = sizeof(int), .mode = NMI_WATCHDOG_SYSCTL_PERM, .proc_handler = proc_nmi_watchdog, - .extra1 = &zero, - .extra2 = &one, + 
.extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, }, { .procname = "watchdog_cpumask", @@ -960,8 +957,8 @@ static struct ctl_table kern_table[] = { .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_soft_watchdog, - .extra1 = &zero, - .extra2 = &one, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, }, { .procname = "softlockup_panic", @@ -969,8 +966,8 @@ static struct ctl_table kern_table[] = { .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_minmax, - .extra1 = &zero, - .extra2 = &one, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, }, #ifdef CONFIG_SMP { @@ -979,8 +976,8 @@ static struct ctl_table kern_table[] = { .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_minmax, - .extra1 = &zero, - .extra2 = &one, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, }, #endif /* CONFIG_SMP */ #endif @@ -991,8 +988,8 @@ static struct ctl_table kern_table[] = { .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_minmax, - .extra1 = &zero, - .extra2 = &one, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, }, #ifdef CONFIG_SMP { @@ -1001,8 +998,8 @@ static struct ctl_table kern_table[] = { .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_minmax, - .extra1 = &zero, - .extra2 = &one, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, }, #endif /* CONFIG_SMP */ #endif @@ -1115,8 +1112,8 @@ static struct ctl_table kern_table[] = { .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_minmax, - .extra1 = &zero, - .extra2 = &one, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, }, { .procname = "hung_task_check_count", @@ -1124,7 +1121,7 @@ static struct ctl_table kern_table[] = { .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_minmax, - .extra1 = &zero, + .extra1 = SYSCTL_ZERO, }, { .procname = "hung_task_timeout_secs", @@ -1201,7 +1198,7 @@ static struct ctl_table kern_table[] = { .maxlen = sizeof(sysctl_perf_event_sample_rate), .mode = 0644, .proc_handler = 
perf_proc_update_handler, - .extra1 = &one, + .extra1 = SYSCTL_ONE, }, { .procname = "perf_cpu_time_max_percent", @@ -1209,7 +1206,7 @@ static struct ctl_table kern_table[] = { .maxlen = sizeof(sysctl_perf_cpu_time_max_percent), .mode = 0644, .proc_handler = perf_cpu_time_max_percent_handler, - .extra1 = &zero, + .extra1 = SYSCTL_ZERO, .extra2 = &one_hundred, }, { @@ -1218,7 +1215,7 @@ static struct ctl_table kern_table[] = { .maxlen = sizeof(sysctl_perf_event_max_stack), .mode = 0644, .proc_handler = perf_event_max_stack_handler, - .extra1 = &zero, + .extra1 = SYSCTL_ZERO, .extra2 = &six_hundred_forty_kb, }, { @@ -1227,7 +1224,7 @@ static struct ctl_table kern_table[] = { .maxlen = sizeof(sysctl_perf_event_max_contexts_per_stack), .mode = 0644, .proc_handler = perf_event_max_stack_handler, - .extra1 = &zero, + .extra1 = SYSCTL_ZERO, .extra2 = &one_thousand, }, #endif @@ -1237,8 +1234,8 @@ static struct ctl_table kern_table[] = { .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_minmax, - .extra1 = &zero, - .extra2 = &one, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, }, #if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ_COMMON) { @@ -1247,8 +1244,8 @@ static struct ctl_table kern_table[] = { .maxlen = sizeof(unsigned int), .mode = 0644, .proc_handler = timer_migration_handler, - .extra1 = &zero, - .extra2 = &one, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, }, #endif #ifdef CONFIG_BPF_SYSCALL @@ -1259,8 +1256,8 @@ static struct ctl_table kern_table[] = { .mode = 0644, /* only handle a transition from default "0" to "1" */ .proc_handler = proc_dointvec_minmax, - .extra1 = &one, - .extra2 = &one, + .extra1 = SYSCTL_ONE, + .extra2 = SYSCTL_ONE, }, { .procname = "bpf_stats_enabled", @@ -1277,8 +1274,8 @@ static struct ctl_table kern_table[] = { .maxlen = sizeof(sysctl_panic_on_rcu_stall), .mode = 0644, .proc_handler = proc_dointvec_minmax, - .extra1 = &zero, - .extra2 = &one, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, }, #endif #ifdef 
CONFIG_STACKLEAK_RUNTIME_DISABLE @@ -1288,8 +1285,8 @@ static struct ctl_table kern_table[] = { .maxlen = sizeof(int), .mode = 0600, .proc_handler = stack_erasing_sysctl, - .extra1 = &zero, - .extra2 = &one, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, }, #endif { } @@ -1302,7 +1299,7 @@ static struct ctl_table vm_table[] = { .maxlen = sizeof(sysctl_overcommit_memory), .mode = 0644, .proc_handler = proc_dointvec_minmax, - .extra1 = &zero, + .extra1 = SYSCTL_ZERO, .extra2 = &two, }, { @@ -1311,7 +1308,7 @@ static struct ctl_table vm_table[] = { .maxlen = sizeof(sysctl_panic_on_oom), .mode = 0644, .proc_handler = proc_dointvec_minmax, - .extra1 = &zero, + .extra1 = SYSCTL_ZERO, .extra2 = &two, }, { @@ -1348,7 +1345,7 @@ static struct ctl_table vm_table[] = { .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_minmax, - .extra1 = &zero, + .extra1 = SYSCTL_ZERO, }, { .procname = "dirty_background_ratio", @@ -1356,7 +1353,7 @@ static struct ctl_table vm_table[] = { .maxlen = sizeof(dirty_background_ratio), .mode = 0644, .proc_handler = dirty_background_ratio_handler, - .extra1 = &zero, + .extra1 = SYSCTL_ZERO, .extra2 = &one_hundred, }, { @@ -1373,7 +1370,7 @@ static struct ctl_table vm_table[] = { .maxlen = sizeof(vm_dirty_ratio), .mode = 0644, .proc_handler = dirty_ratio_handler, - .extra1 = &zero, + .extra1 = SYSCTL_ZERO, .extra2 = &one_hundred, }, { @@ -1397,7 +1394,7 @@ static struct ctl_table vm_table[] = { .maxlen = sizeof(dirty_expire_interval), .mode = 0644, .proc_handler = proc_dointvec_minmax, - .extra1 = &zero, + .extra1 = SYSCTL_ZERO, }, { .procname = "dirtytime_expire_seconds", @@ -1405,7 +1402,7 @@ static struct ctl_table vm_table[] = { .maxlen = sizeof(dirtytime_expire_interval), .mode = 0644, .proc_handler = dirtytime_interval_handler, - .extra1 = &zero, + .extra1 = SYSCTL_ZERO, }, { .procname = "swappiness", @@ -1413,7 +1410,7 @@ static struct ctl_table vm_table[] = { .maxlen = sizeof(vm_swappiness), .mode = 0644, .proc_handler = 
proc_dointvec_minmax, - .extra1 = &zero, + .extra1 = SYSCTL_ZERO, .extra2 = &one_hundred, }, #ifdef CONFIG_HUGETLB_PAGE @@ -1438,8 +1435,8 @@ static struct ctl_table vm_table[] = { .maxlen = sizeof(int), .mode = 0644, .proc_handler = sysctl_vm_numa_stat_handler, - .extra1 = &zero, - .extra2 = &one, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, }, #endif { @@ -1470,7 +1467,7 @@ static struct ctl_table vm_table[] = { .maxlen = sizeof(int), .mode = 0644, .proc_handler = drop_caches_sysctl_handler, - .extra1 = &one, + .extra1 = SYSCTL_ONE, .extra2 = &four, }, #ifdef CONFIG_COMPACTION @@ -1496,8 +1493,8 @@ static struct ctl_table vm_table[] = { .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec, - .extra1 = &zero, - .extra2 = &one, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, }, #endif /* CONFIG_COMPACTION */ @@ -1507,7 +1504,7 @@ static struct ctl_table vm_table[] = { .maxlen = sizeof(min_free_kbytes), .mode = 0644, .proc_handler = min_free_kbytes_sysctl_handler, - .extra1 = &zero, + .extra1 = SYSCTL_ZERO, }, { .procname = "watermark_boost_factor", @@ -1515,7 +1512,7 @@ static struct ctl_table vm_table[] = { .maxlen = sizeof(watermark_boost_factor), .mode = 0644, .proc_handler = watermark_boost_factor_sysctl_handler, - .extra1 = &zero, + .extra1 = SYSCTL_ZERO, }, { .procname = "watermark_scale_factor", @@ -1523,7 +1520,7 @@ static struct ctl_table vm_table[] = { .maxlen = sizeof(watermark_scale_factor), .mode = 0644, .proc_handler = watermark_scale_factor_sysctl_handler, - .extra1 = &one, + .extra1 = SYSCTL_ONE, .extra2 = &one_thousand, }, { @@ -1532,7 +1529,7 @@ static struct ctl_table vm_table[] = { .maxlen = sizeof(percpu_pagelist_fraction), .mode = 0644, .proc_handler = percpu_pagelist_fraction_sysctl_handler, - .extra1 = &zero, + .extra1 = SYSCTL_ZERO, }, #ifdef CONFIG_MMU { @@ -1541,7 +1538,7 @@ static struct ctl_table vm_table[] = { .maxlen = sizeof(sysctl_max_map_count), .mode = 0644, .proc_handler = proc_dointvec_minmax, - .extra1 = 
&zero, + .extra1 = SYSCTL_ZERO, }, #else { @@ -1550,7 +1547,7 @@ static struct ctl_table vm_table[] = { .maxlen = sizeof(sysctl_nr_trim_pages), .mode = 0644, .proc_handler = proc_dointvec_minmax, - .extra1 = &zero, + .extra1 = SYSCTL_ZERO, }, #endif { @@ -1566,7 +1563,7 @@ static struct ctl_table vm_table[] = { .maxlen = sizeof(block_dump), .mode = 0644, .proc_handler = proc_dointvec, - .extra1 = &zero, + .extra1 = SYSCTL_ZERO, }, { .procname = "vfs_cache_pressure", @@ -1574,7 +1571,7 @@ static struct ctl_table vm_table[] = { .maxlen = sizeof(sysctl_vfs_cache_pressure), .mode = 0644, .proc_handler = proc_dointvec, - .extra1 = &zero, + .extra1 = SYSCTL_ZERO, }, #ifdef HAVE_ARCH_PICK_MMAP_LAYOUT { @@ -1583,7 +1580,7 @@ static struct ctl_table vm_table[] = { .maxlen = sizeof(sysctl_legacy_va_layout), .mode = 0644, .proc_handler = proc_dointvec, - .extra1 = &zero, + .extra1 = SYSCTL_ZERO, }, #endif #ifdef CONFIG_NUMA @@ -1593,7 +1590,7 @@ static struct ctl_table vm_table[] = { .maxlen = sizeof(node_reclaim_mode), .mode = 0644, .proc_handler = proc_dointvec, - .extra1 = &zero, + .extra1 = SYSCTL_ZERO, }, { .procname = "min_unmapped_ratio", @@ -1601,7 +1598,7 @@ static struct ctl_table vm_table[] = { .maxlen = sizeof(sysctl_min_unmapped_ratio), .mode = 0644, .proc_handler = sysctl_min_unmapped_ratio_sysctl_handler, - .extra1 = &zero, + .extra1 = SYSCTL_ZERO, .extra2 = &one_hundred, }, { @@ -1610,7 +1607,7 @@ static struct ctl_table vm_table[] = { .maxlen = sizeof(sysctl_min_slab_ratio), .mode = 0644, .proc_handler = sysctl_min_slab_ratio_sysctl_handler, - .extra1 = &zero, + .extra1 = SYSCTL_ZERO, .extra2 = &one_hundred, }, #endif @@ -1661,7 +1658,7 @@ static struct ctl_table vm_table[] = { #endif .mode = 0644, .proc_handler = proc_dointvec, - .extra1 = &zero, + .extra1 = SYSCTL_ZERO, }, #endif #ifdef CONFIG_HIGHMEM @@ -1671,8 +1668,8 @@ static struct ctl_table vm_table[] = { .maxlen = sizeof(vm_highmem_is_dirtyable), .mode = 0644, .proc_handler = proc_dointvec_minmax, - 
.extra1 = &zero, - .extra2 = &one, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, }, #endif #ifdef CONFIG_MEMORY_FAILURE @@ -1682,8 +1679,8 @@ static struct ctl_table vm_table[] = { .maxlen = sizeof(sysctl_memory_failure_early_kill), .mode = 0644, .proc_handler = proc_dointvec_minmax, - .extra1 = &zero, - .extra2 = &one, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, }, { .procname = "memory_failure_recovery", @@ -1691,8 +1688,8 @@ static struct ctl_table vm_table[] = { .maxlen = sizeof(sysctl_memory_failure_recovery), .mode = 0644, .proc_handler = proc_dointvec_minmax, - .extra1 = &zero, - .extra2 = &one, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, }, #endif { @@ -1738,8 +1735,8 @@ static struct ctl_table vm_table[] = { .maxlen = sizeof(sysctl_unprivileged_userfaultfd), .mode = 0644, .proc_handler = proc_dointvec_minmax, - .extra1 = &zero, - .extra2 = &one, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, }, #endif { } @@ -1875,8 +1872,8 @@ static struct ctl_table fs_table[] = { .maxlen = sizeof(int), .mode = 0600, .proc_handler = proc_dointvec_minmax, - .extra1 = &zero, - .extra2 = &one, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, }, { .procname = "protected_hardlinks", @@ -1884,8 +1881,8 @@ static struct ctl_table fs_table[] = { .maxlen = sizeof(int), .mode = 0600, .proc_handler = proc_dointvec_minmax, - .extra1 = &zero, - .extra2 = &one, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, }, { .procname = "protected_fifos", @@ -1893,7 +1890,7 @@ static struct ctl_table fs_table[] = { .maxlen = sizeof(int), .mode = 0600, .proc_handler = proc_dointvec_minmax, - .extra1 = &zero, + .extra1 = SYSCTL_ZERO, .extra2 = &two, }, { @@ -1902,7 +1899,7 @@ static struct ctl_table fs_table[] = { .maxlen = sizeof(int), .mode = 0600, .proc_handler = proc_dointvec_minmax, - .extra1 = &zero, + .extra1 = SYSCTL_ZERO, .extra2 = &two, }, { @@ -1911,7 +1908,7 @@ static struct ctl_table fs_table[] = { .maxlen = sizeof(int), .mode = 0644, .proc_handler = 
proc_dointvec_minmax_coredump, - .extra1 = &zero, + .extra1 = SYSCTL_ZERO, .extra2 = &two, }, #if defined(CONFIG_BINFMT_MISC) || defined(CONFIG_BINFMT_MISC_MODULE) @@ -1948,7 +1945,7 @@ static struct ctl_table fs_table[] = { .maxlen = sizeof(unsigned int), .mode = 0644, .proc_handler = proc_dointvec_minmax, - .extra1 = &one, + .extra1 = SYSCTL_ONE, }, { } }; @@ -1970,8 +1967,8 @@ static struct ctl_table debug_table[] = { .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_kprobes_optimization_handler, - .extra1 = &zero, - .extra2 = &one, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, }, #endif { } @@ -3395,8 +3392,8 @@ int proc_do_static_key(struct ctl_table *table, int write, .data = &val, .maxlen = sizeof(val), .mode = table->mode, - .extra1 = &zero, - .extra2 = &one, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, }; if (write && !capable(CAP_SYS_ADMIN)) diff --git a/kernel/ucount.c b/kernel/ucount.c index feb128c7b5d9..a53cc2b4179c 100644 --- a/kernel/ucount.c +++ b/kernel/ucount.c @@ -52,16 +52,14 @@ static struct ctl_table_root set_root = { .permissions = set_permissions, }; -static int zero = 0; -static int int_max = INT_MAX; #define UCOUNT_ENTRY(name) \ { \ .procname = name, \ .maxlen = sizeof(int), \ .mode = 0644, \ .proc_handler = proc_dointvec_minmax, \ - .extra1 = &zero, \ - .extra2 = &int_max, \ + .extra1 = SYSCTL_ZERO, \ + .extra2 = SYSCTL_INT_MAX, \ } static struct ctl_table user_table[] = { UCOUNT_ENTRY("max_user_namespaces"), diff --git a/net/core/neighbour.c b/net/core/neighbour.c index 742cea4ce72e..26da97359d5b 100644 --- a/net/core/neighbour.c +++ b/net/core/neighbour.c @@ -3374,8 +3374,6 @@ void neigh_app_ns(struct neighbour *n) EXPORT_SYMBOL(neigh_app_ns); #ifdef CONFIG_SYSCTL -static int zero; -static int int_max = INT_MAX; static int unres_qlen_max = INT_MAX / SKB_TRUESIZE(ETH_FRAME_LEN); static int proc_unres_qlen(struct ctl_table *ctl, int write, @@ -3384,7 +3382,7 @@ static int proc_unres_qlen(struct ctl_table *ctl, int write, 
int size, ret; struct ctl_table tmp = *ctl; - tmp.extra1 = &zero; + tmp.extra1 = SYSCTL_ZERO; tmp.extra2 = &unres_qlen_max; tmp.data = &size; @@ -3449,8 +3447,8 @@ static int neigh_proc_dointvec_zero_intmax(struct ctl_table *ctl, int write, struct ctl_table tmp = *ctl; int ret; - tmp.extra1 = &zero; - tmp.extra2 = &int_max; + tmp.extra1 = SYSCTL_ZERO; + tmp.extra2 = SYSCTL_INT_MAX; ret = proc_dointvec_minmax(&tmp, write, buffer, lenp, ppos); neigh_proc_update(ctl, write); @@ -3595,24 +3593,24 @@ static struct neigh_sysctl_table { .procname = "gc_thresh1", .maxlen = sizeof(int), .mode = 0644, - .extra1 = &zero, - .extra2 = &int_max, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_INT_MAX, .proc_handler = proc_dointvec_minmax, }, [NEIGH_VAR_GC_THRESH2] = { .procname = "gc_thresh2", .maxlen = sizeof(int), .mode = 0644, - .extra1 = &zero, - .extra2 = &int_max, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_INT_MAX, .proc_handler = proc_dointvec_minmax, }, [NEIGH_VAR_GC_THRESH3] = { .procname = "gc_thresh3", .maxlen = sizeof(int), .mode = 0644, - .extra1 = &zero, - .extra2 = &int_max, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_INT_MAX, .proc_handler = proc_dointvec_minmax, }, {}, diff --git a/net/core/sysctl_net_core.c b/net/core/sysctl_net_core.c index f9204719aeee..8da5b3a54dac 100644 --- a/net/core/sysctl_net_core.c +++ b/net/core/sysctl_net_core.c @@ -22,8 +22,6 @@ #include #include -static int zero = 0; -static int one = 1; static int two __maybe_unused = 2; static int min_sndbuf = SOCK_MIN_SNDBUF; static int min_rcvbuf = SOCK_MIN_RCVBUF; @@ -390,10 +388,10 @@ static struct ctl_table net_core_table[] = { .mode = 0644, .proc_handler = proc_dointvec_minmax_bpf_enable, # ifdef CONFIG_BPF_JIT_ALWAYS_ON - .extra1 = &one, - .extra2 = &one, + .extra1 = SYSCTL_ONE, + .extra2 = SYSCTL_ONE, # else - .extra1 = &zero, + .extra1 = SYSCTL_ZERO, .extra2 = &two, # endif }, @@ -404,7 +402,7 @@ static struct ctl_table net_core_table[] = { .maxlen = sizeof(int), .mode = 0600, .proc_handler 
= proc_dointvec_minmax_bpf_restricted, - .extra1 = &zero, + .extra1 = SYSCTL_ZERO, .extra2 = &two, }, { @@ -413,8 +411,8 @@ static struct ctl_table net_core_table[] = { .maxlen = sizeof(int), .mode = 0600, .proc_handler = proc_dointvec_minmax_bpf_restricted, - .extra1 = &zero, - .extra2 = &one, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, }, # endif { @@ -461,8 +459,8 @@ static struct ctl_table net_core_table[] = { .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_minmax, - .extra1 = &zero, - .extra2 = &one + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE }, #ifdef CONFIG_RPS { @@ -493,7 +491,7 @@ static struct ctl_table net_core_table[] = { .maxlen = sizeof(unsigned int), .mode = 0644, .proc_handler = proc_dointvec_minmax, - .extra1 = &zero, + .extra1 = SYSCTL_ZERO, }, { .procname = "busy_read", @@ -501,7 +499,7 @@ static struct ctl_table net_core_table[] = { .maxlen = sizeof(unsigned int), .mode = 0644, .proc_handler = proc_dointvec_minmax, - .extra1 = &zero, + .extra1 = SYSCTL_ZERO, }, #endif #ifdef CONFIG_NET_SCHED @@ -533,7 +531,7 @@ static struct ctl_table net_core_table[] = { .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_minmax, - .extra1 = &one, + .extra1 = SYSCTL_ONE, .extra2 = &max_skb_frags, }, { @@ -542,7 +540,7 @@ static struct ctl_table net_core_table[] = { .maxlen = sizeof(unsigned int), .mode = 0644, .proc_handler = proc_dointvec_minmax, - .extra1 = &zero, + .extra1 = SYSCTL_ZERO, }, { .procname = "fb_tunnels_only_for_init_net", @@ -550,8 +548,8 @@ static struct ctl_table net_core_table[] = { .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_minmax, - .extra1 = &zero, - .extra2 = &one, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, }, { .procname = "devconf_inherit_init_net", @@ -559,7 +557,7 @@ static struct ctl_table net_core_table[] = { .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_minmax, - .extra1 = &zero, + .extra1 = SYSCTL_ZERO, .extra2 = &two, }, { @@ -578,7 
+576,7 @@ static struct ctl_table netns_core_table[] = { .data = &init_net.core.sysctl_somaxconn, .maxlen = sizeof(int), .mode = 0644, - .extra1 = &zero, + .extra1 = SYSCTL_ZERO, .proc_handler = proc_dointvec_minmax }, { } diff --git a/net/dccp/sysctl.c b/net/dccp/sysctl.c index b59040f268a9..ee8d4f5afa72 100644 --- a/net/dccp/sysctl.c +++ b/net/dccp/sysctl.c @@ -16,9 +16,7 @@ #endif /* Boundary values */ -static int zero = 0, - one = 1, - u8_max = 0xFF; +static int u8_max = 0xFF; static unsigned long seqw_min = DCCPF_SEQ_WMIN, seqw_max = 0xFFFFFFFF; /* maximum on 32 bit */ @@ -38,7 +36,7 @@ static struct ctl_table dccp_default_table[] = { .maxlen = sizeof(sysctl_dccp_rx_ccid), .mode = 0644, .proc_handler = proc_dointvec_minmax, - .extra1 = &zero, + .extra1 = SYSCTL_ZERO, .extra2 = &u8_max, /* RFC 4340, 10. */ }, { @@ -47,7 +45,7 @@ static struct ctl_table dccp_default_table[] = { .maxlen = sizeof(sysctl_dccp_tx_ccid), .mode = 0644, .proc_handler = proc_dointvec_minmax, - .extra1 = &zero, + .extra1 = SYSCTL_ZERO, .extra2 = &u8_max, /* RFC 4340, 10. 
*/ }, { @@ -56,7 +54,7 @@ static struct ctl_table dccp_default_table[] = { .maxlen = sizeof(sysctl_dccp_request_retries), .mode = 0644, .proc_handler = proc_dointvec_minmax, - .extra1 = &one, + .extra1 = SYSCTL_ONE, .extra2 = &u8_max, }, { @@ -65,7 +63,7 @@ static struct ctl_table dccp_default_table[] = { .maxlen = sizeof(sysctl_dccp_retries1), .mode = 0644, .proc_handler = proc_dointvec_minmax, - .extra1 = &zero, + .extra1 = SYSCTL_ZERO, .extra2 = &u8_max, }, { @@ -74,7 +72,7 @@ static struct ctl_table dccp_default_table[] = { .maxlen = sizeof(sysctl_dccp_retries2), .mode = 0644, .proc_handler = proc_dointvec_minmax, - .extra1 = &zero, + .extra1 = SYSCTL_ZERO, .extra2 = &u8_max, }, { @@ -83,7 +81,7 @@ static struct ctl_table dccp_default_table[] = { .maxlen = sizeof(sysctl_dccp_tx_qlen), .mode = 0644, .proc_handler = proc_dointvec_minmax, - .extra1 = &zero, + .extra1 = SYSCTL_ZERO, }, { .procname = "sync_ratelimit", diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index 7d66306b5f39..0b980e841927 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -28,8 +28,6 @@ #include #include -static int zero; -static int one = 1; static int two = 2; static int four = 4; static int thousand = 1000; @@ -576,7 +574,7 @@ static struct ctl_table ipv4_table[] = { .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_minmax, - .extra1 = &zero, + .extra1 = SYSCTL_ZERO, }, { .procname = "icmp_msgs_burst", @@ -584,7 +582,7 @@ static struct ctl_table ipv4_table[] = { .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_minmax, - .extra1 = &zero, + .extra1 = SYSCTL_ZERO, }, { .procname = "udp_mem", @@ -674,8 +672,8 @@ static struct ctl_table ipv4_net_table[] = { .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_minmax, - .extra1 = &zero, - .extra2 = &one, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, }, #endif { @@ -763,8 +761,8 @@ static struct ctl_table ipv4_net_table[] = { .maxlen = sizeof(int), 
.mode = 0644, .proc_handler = ipv4_fwd_update_priority, - .extra1 = &zero, - .extra2 = &one, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, }, { .procname = "ip_nonlocal_bind", @@ -794,8 +792,8 @@ static struct ctl_table ipv4_net_table[] = { .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_minmax, - .extra1 = &zero, - .extra2 = &one, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, }, #endif { @@ -864,7 +862,7 @@ static struct ctl_table ipv4_net_table[] = { .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_minmax, - .extra1 = &one + .extra1 = SYSCTL_ONE }, #endif { @@ -969,7 +967,7 @@ static struct ctl_table ipv4_net_table[] = { .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_minmax, - .extra1 = &zero, + .extra1 = SYSCTL_ZERO, .extra2 = &two, }, { @@ -1011,7 +1009,7 @@ static struct ctl_table ipv4_net_table[] = { .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_tfo_blackhole_detect_timeout, - .extra1 = &zero, + .extra1 = SYSCTL_ZERO, }, #ifdef CONFIG_IP_ROUTE_MULTIPATH { @@ -1020,8 +1018,8 @@ static struct ctl_table ipv4_net_table[] = { .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_minmax, - .extra1 = &zero, - .extra2 = &one, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, }, { .procname = "fib_multipath_hash_policy", @@ -1029,8 +1027,8 @@ static struct ctl_table ipv4_net_table[] = { .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_fib_multipath_hash_policy, - .extra1 = &zero, - .extra2 = &two, + .extra1 = SYSCTL_ZERO, + .extra2 = &two, }, #endif { @@ -1047,8 +1045,8 @@ static struct ctl_table ipv4_net_table[] = { .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_minmax, - .extra1 = &zero, - .extra2 = &one, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, }, #endif { @@ -1078,7 +1076,7 @@ static struct ctl_table ipv4_net_table[] = { .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_minmax, - .extra1 = &zero, + .extra1 =
SYSCTL_ZERO, .extra2 = &four, }, { @@ -1222,7 +1220,7 @@ static struct ctl_table ipv4_net_table[] = { .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_minmax, - .extra1 = &one, + .extra1 = SYSCTL_ONE, .extra2 = &gso_max_segs, }, { @@ -1231,7 +1229,7 @@ static struct ctl_table ipv4_net_table[] = { .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_minmax, - .extra1 = &zero, + .extra1 = SYSCTL_ZERO, .extra2 = &one_day_secs }, { @@ -1240,8 +1238,8 @@ static struct ctl_table ipv4_net_table[] = { .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_minmax, - .extra1 = &zero, - .extra2 = &one, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, }, { .procname = "tcp_invalid_ratelimit", @@ -1256,7 +1254,7 @@ static struct ctl_table ipv4_net_table[] = { .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_minmax, - .extra1 = &zero, + .extra1 = SYSCTL_ZERO, .extra2 = &thousand, }, { @@ -1265,7 +1263,7 @@ static struct ctl_table ipv4_net_table[] = { .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_minmax, - .extra1 = &zero, + .extra1 = SYSCTL_ZERO, .extra2 = &thousand, }, { @@ -1274,7 +1272,7 @@ static struct ctl_table ipv4_net_table[] = { .maxlen = sizeof(init_net.ipv4.sysctl_tcp_wmem), .mode = 0644, .proc_handler = proc_dointvec_minmax, - .extra1 = &one, + .extra1 = SYSCTL_ONE, }, { .procname = "tcp_rmem", @@ -1282,7 +1280,7 @@ static struct ctl_table ipv4_net_table[] = { .maxlen = sizeof(init_net.ipv4.sysctl_tcp_rmem), .mode = 0644, .proc_handler = proc_dointvec_minmax, - .extra1 = &one, + .extra1 = SYSCTL_ONE, }, { .procname = "tcp_comp_sack_delay_ns", @@ -1297,7 +1295,7 @@ static struct ctl_table ipv4_net_table[] = { .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_minmax, - .extra1 = &zero, + .extra1 = SYSCTL_ZERO, .extra2 = &comp_sack_nr_max, }, { @@ -1306,7 +1304,7 @@ static struct ctl_table ipv4_net_table[] = { .maxlen = sizeof(init_net.ipv4.sysctl_udp_rmem_min), .mode = 
0644, .proc_handler = proc_dointvec_minmax, - .extra1 = &one + .extra1 = SYSCTL_ONE }, { .procname = "udp_wmem_min", @@ -1314,7 +1312,7 @@ static struct ctl_table ipv4_net_table[] = { .maxlen = sizeof(init_net.ipv4.sysctl_udp_wmem_min), .mode = 0644, .proc_handler = proc_dointvec_minmax, - .extra1 = &one + .extra1 = SYSCTL_ONE }, { } }; diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index 521e3203e83a..dc73888c7859 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -6432,8 +6432,6 @@ int addrconf_sysctl_disable_policy(struct ctl_table *ctl, int write, } static int minus_one = -1; -static const int zero = 0; -static const int one = 1; static const int two_five_five = 255; static const struct ctl_table addrconf_sysctl[] = { @@ -6450,7 +6448,7 @@ static const struct ctl_table addrconf_sysctl[] = { .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_minmax, - .extra1 = (void *)&one, + .extra1 = (void *)SYSCTL_ONE, .extra2 = (void *)&two_five_five, }, { @@ -6809,7 +6807,7 @@ static const struct ctl_table addrconf_sysctl[] = { .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_minmax, - .extra1 = (void *)&zero, + .extra1 = (void *)SYSCTL_ZERO, .extra2 = (void *)&two_five_five, }, { diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 4d2e6b31a8d6..8b0c33fb19a2 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -6031,9 +6031,6 @@ int ipv6_sysctl_rtcache_flush(struct ctl_table *ctl, int write, return 0; } -static int zero; -static int one = 1; - static struct ctl_table ipv6_route_table_template[] = { { .procname = "flush", @@ -6111,8 +6108,8 @@ static struct ctl_table ipv6_route_table_template[] = { .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_minmax, - .extra1 = &zero, - .extra2 = &one, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, }, { } }; diff --git a/net/ipv6/sysctl_net_ipv6.c b/net/ipv6/sysctl_net_ipv6.c index dc4c91e0bfb8..ec8fcfc60a27 100644 --- a/net/ipv6/sysctl_net_ipv6.c +++ 
b/net/ipv6/sysctl_net_ipv6.c @@ -21,8 +21,6 @@ #include #endif -static int zero; -static int one = 1; static int flowlabel_reflect_max = 0x7; static int auto_flowlabels_min; static int auto_flowlabels_max = IP6_AUTO_FLOW_LABEL_MAX; @@ -115,7 +113,7 @@ static struct ctl_table ipv6_table_template[] = { .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_minmax, - .extra1 = &zero, + .extra1 = SYSCTL_ZERO, .extra2 = &flowlabel_reflect_max, }, { @@ -152,8 +150,8 @@ static struct ctl_table ipv6_table_template[] = { .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_rt6_multipath_hash_policy, - .extra1 = &zero, - .extra2 = &one, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, }, { .procname = "seg6_flowlabel", @@ -179,7 +177,7 @@ static struct ctl_table ipv6_rotable[] = { .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_minmax, - .extra1 = &one + .extra1 = SYSCTL_ONE }, #ifdef CONFIG_NETLABEL { diff --git a/net/mpls/af_mpls.c b/net/mpls/af_mpls.c index 198ec4fe4148..c312741df2ce 100644 --- a/net/mpls/af_mpls.c +++ b/net/mpls/af_mpls.c @@ -37,8 +37,6 @@ #define MPLS_NEIGH_TABLE_UNSPEC (NEIGH_LINK_TABLE + 1) -static int zero = 0; -static int one = 1; static int label_limit = (1 << 20) - 1; static int ttl_max = 255; @@ -2607,7 +2605,7 @@ static int mpls_platform_labels(struct ctl_table *table, int write, .data = &platform_labels, .maxlen = sizeof(int), .mode = table->mode, - .extra1 = &zero, + .extra1 = SYSCTL_ZERO, .extra2 = &label_limit, }; @@ -2636,8 +2634,8 @@ static const struct ctl_table mpls_table[] = { .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_minmax, - .extra1 = &zero, - .extra2 = &one, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, }, { .procname = "default_ttl", @@ -2645,7 +2643,7 @@ static const struct ctl_table mpls_table[] = { .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_minmax, - .extra1 = &one, + .extra1 = SYSCTL_ONE, .extra2 = &ttl_max, }, { } diff --git 
a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c index 07e0967bf129..060565e7d227 100644 --- a/net/netfilter/ipvs/ip_vs_ctl.c +++ b/net/netfilter/ipvs/ip_vs_ctl.c @@ -1726,7 +1726,6 @@ static int ip_vs_zero_all(struct netns_ipvs *ipvs) #ifdef CONFIG_SYSCTL -static int zero; static int three = 3; static int @@ -1935,7 +1934,7 @@ static struct ctl_table vs_vars[] = { .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_minmax, - .extra1 = &zero, + .extra1 = SYSCTL_ZERO, .extra2 = &three, }, { diff --git a/net/rxrpc/sysctl.c b/net/rxrpc/sysctl.c index 1e3fa67d91aa..2bbb38161851 100644 --- a/net/rxrpc/sysctl.c +++ b/net/rxrpc/sysctl.c @@ -11,7 +11,6 @@ #include "ar-internal.h" static struct ctl_table_header *rxrpc_sysctl_reg_table; -static const unsigned int one = 1; static const unsigned int four = 4; static const unsigned int thirtytwo = 32; static const unsigned int n_65535 = 65535; @@ -97,7 +96,7 @@ static struct ctl_table rxrpc_sysctl_table[] = { .maxlen = sizeof(unsigned int), .mode = 0644, .proc_handler = proc_dointvec_minmax, - .extra1 = (void *)&one, + .extra1 = (void *)SYSCTL_ONE, .extra2 = (void *)&rxrpc_max_client_connections, }, { @@ -115,7 +114,7 @@ static struct ctl_table rxrpc_sysctl_table[] = { .maxlen = sizeof(unsigned int), .mode = 0644, .proc_handler = proc_dointvec_minmax, - .extra1 = (void *)&one, + .extra1 = (void *)SYSCTL_ONE, .extra2 = (void *)&n_max_acks, }, { @@ -124,7 +123,7 @@ static struct ctl_table rxrpc_sysctl_table[] = { .maxlen = sizeof(unsigned int), .mode = 0644, .proc_handler = proc_dointvec_minmax, - .extra1 = (void *)&one, + .extra1 = (void *)SYSCTL_ONE, .extra2 = (void *)&n_65535, }, { @@ -133,7 +132,7 @@ static struct ctl_table rxrpc_sysctl_table[] = { .maxlen = sizeof(unsigned int), .mode = 0644, .proc_handler = proc_dointvec_minmax, - .extra1 = (void *)&one, + .extra1 = (void *)SYSCTL_ONE, .extra2 = (void *)&four, }, diff --git a/net/sctp/sysctl.c b/net/sctp/sysctl.c index 
9a19147902f1..1250751bca1b 100644 --- a/net/sctp/sysctl.c +++ b/net/sctp/sysctl.c @@ -25,10 +25,7 @@ #include #include -static int zero = 0; -static int one = 1; static int timer_max = 86400000; /* ms in one day */ -static int int_max = INT_MAX; static int sack_timer_min = 1; static int sack_timer_max = 500; static int addr_scope_max = SCTP_SCOPE_POLICY_MAX; @@ -92,7 +89,7 @@ static struct ctl_table sctp_net_table[] = { .maxlen = sizeof(unsigned int), .mode = 0644, .proc_handler = proc_dointvec_minmax, - .extra1 = &one, + .extra1 = SYSCTL_ONE, .extra2 = &timer_max }, { @@ -101,7 +98,7 @@ static struct ctl_table sctp_net_table[] = { .maxlen = sizeof(unsigned int), .mode = 0644, .proc_handler = proc_sctp_do_rto_min, - .extra1 = &one, + .extra1 = SYSCTL_ONE, .extra2 = &init_net.sctp.rto_max }, { @@ -137,8 +134,8 @@ static struct ctl_table sctp_net_table[] = { .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_minmax, - .extra1 = &zero, - .extra2 = &int_max + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_INT_MAX, }, { .procname = "cookie_preserve_enable", @@ -160,7 +157,7 @@ static struct ctl_table sctp_net_table[] = { .maxlen = sizeof(unsigned int), .mode = 0644, .proc_handler = proc_dointvec_minmax, - .extra1 = &one, + .extra1 = SYSCTL_ONE, .extra2 = &timer_max }, { @@ -178,7 +175,7 @@ static struct ctl_table sctp_net_table[] = { .maxlen = sizeof(unsigned int), .mode = 0644, .proc_handler = proc_dointvec_minmax, - .extra1 = &one, + .extra1 = SYSCTL_ONE, .extra2 = &timer_max }, { @@ -187,8 +184,8 @@ static struct ctl_table sctp_net_table[] = { .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_minmax, - .extra1 = &one, - .extra2 = &int_max + .extra1 = SYSCTL_ONE, + .extra2 = SYSCTL_INT_MAX, }, { .procname = "path_max_retrans", @@ -196,8 +193,8 @@ static struct ctl_table sctp_net_table[] = { .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_minmax, - .extra1 = &one, - .extra2 = &int_max + .extra1 = SYSCTL_ONE, + .extra2 = 
SYSCTL_INT_MAX, }, { .procname = "max_init_retransmits", @@ -205,8 +202,8 @@ static struct ctl_table sctp_net_table[] = { .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_minmax, - .extra1 = &one, - .extra2 = &int_max + .extra1 = SYSCTL_ONE, + .extra2 = SYSCTL_INT_MAX, }, { .procname = "pf_retrans", @@ -214,8 +211,8 @@ static struct ctl_table sctp_net_table[] = { .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_minmax, - .extra1 = &zero, - .extra2 = &int_max + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_INT_MAX, }, { .procname = "sndbuf_policy", @@ -286,7 +283,7 @@ static struct ctl_table sctp_net_table[] = { .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_minmax, - .extra1 = &zero, + .extra1 = SYSCTL_ZERO, .extra2 = &addr_scope_max, }, { @@ -295,7 +292,7 @@ static struct ctl_table sctp_net_table[] = { .maxlen = sizeof(int), .mode = 0644, .proc_handler = &proc_dointvec_minmax, - .extra1 = &one, + .extra1 = SYSCTL_ONE, .extra2 = &rwnd_scale_max, }, { diff --git a/net/sunrpc/xprtrdma/transport.c b/net/sunrpc/xprtrdma/transport.c index 1f73a6a7e43c..ffb1684c4573 100644 --- a/net/sunrpc/xprtrdma/transport.c +++ b/net/sunrpc/xprtrdma/transport.c @@ -80,7 +80,6 @@ static unsigned int min_slot_table_size = RPCRDMA_MIN_SLOT_TABLE; static unsigned int max_slot_table_size = RPCRDMA_MAX_SLOT_TABLE; static unsigned int min_inline_size = RPCRDMA_MIN_INLINE; static unsigned int max_inline_size = RPCRDMA_MAX_INLINE; -static unsigned int zero; static unsigned int max_padding = PAGE_SIZE; static unsigned int min_memreg = RPCRDMA_BOUNCEBUFFERS; static unsigned int max_memreg = RPCRDMA_LAST - 1; @@ -122,7 +121,7 @@ static struct ctl_table xr_tunables_table[] = { .maxlen = sizeof(unsigned int), .mode = 0644, .proc_handler = proc_dointvec_minmax, - .extra1 = &zero, + .extra1 = SYSCTL_ZERO, .extra2 = &max_padding, }, { diff --git a/net/tipc/sysctl.c b/net/tipc/sysctl.c index 9df82a573aa7..6159d327db76 100644 --- a/net/tipc/sysctl.c 
+++ b/net/tipc/sysctl.c @@ -38,8 +38,6 @@ #include -static int zero; -static int one = 1; static struct ctl_table_header *tipc_ctl_hdr; static struct ctl_table tipc_table[] = { @@ -49,7 +47,7 @@ static struct ctl_table tipc_table[] = { .maxlen = sizeof(sysctl_tipc_rmem), .mode = 0644, .proc_handler = proc_dointvec_minmax, - .extra1 = &one, + .extra1 = SYSCTL_ONE, }, { .procname = "named_timeout", @@ -57,7 +55,7 @@ static struct ctl_table tipc_table[] = { .maxlen = sizeof(sysctl_tipc_named_timeout), .mode = 0644, .proc_handler = proc_dointvec_minmax, - .extra1 = &zero, + .extra1 = SYSCTL_ZERO, }, { .procname = "sk_filter", diff --git a/security/keys/sysctl.c b/security/keys/sysctl.c index dd1e21fab827..b46b651b3c4c 100644 --- a/security/keys/sysctl.c +++ b/security/keys/sysctl.c @@ -9,8 +9,6 @@ #include #include "internal.h" -static const int zero, one = 1, max = INT_MAX; - struct ctl_table key_sysctls[] = { { .procname = "maxkeys", @@ -18,8 +16,8 @@ struct ctl_table key_sysctls[] = { .maxlen = sizeof(unsigned), .mode = 0644, .proc_handler = proc_dointvec_minmax, - .extra1 = (void *) &one, - .extra2 = (void *) &max, + .extra1 = (void *) SYSCTL_ONE, + .extra2 = (void *) SYSCTL_INT_MAX, }, { .procname = "maxbytes", @@ -27,8 +25,8 @@ struct ctl_table key_sysctls[] = { .maxlen = sizeof(unsigned), .mode = 0644, .proc_handler = proc_dointvec_minmax, - .extra1 = (void *) &one, - .extra2 = (void *) &max, + .extra1 = (void *) SYSCTL_ONE, + .extra2 = (void *) SYSCTL_INT_MAX, }, { .procname = "root_maxkeys", @@ -36,8 +34,8 @@ struct ctl_table key_sysctls[] = { .maxlen = sizeof(unsigned), .mode = 0644, .proc_handler = proc_dointvec_minmax, - .extra1 = (void *) &one, - .extra2 = (void *) &max, + .extra1 = (void *) SYSCTL_ONE, + .extra2 = (void *) SYSCTL_INT_MAX, }, { .procname = "root_maxbytes", @@ -45,8 +43,8 @@ struct ctl_table key_sysctls[] = { .maxlen = sizeof(unsigned), .mode = 0644, .proc_handler = proc_dointvec_minmax, - .extra1 = (void *) &one, - .extra2 = (void *) &max, 
+ .extra1 = (void *) SYSCTL_ONE, + .extra2 = (void *) SYSCTL_INT_MAX, }, { .procname = "gc_delay", @@ -54,8 +52,8 @@ struct ctl_table key_sysctls[] = { .maxlen = sizeof(unsigned), .mode = 0644, .proc_handler = proc_dointvec_minmax, - .extra1 = (void *) &zero, - .extra2 = (void *) &max, + .extra1 = (void *) SYSCTL_ZERO, + .extra2 = (void *) SYSCTL_INT_MAX, }, #ifdef CONFIG_PERSISTENT_KEYRINGS { @@ -64,8 +62,8 @@ struct ctl_table key_sysctls[] = { .maxlen = sizeof(unsigned), .mode = 0644, .proc_handler = proc_dointvec_minmax, - .extra1 = (void *) &zero, - .extra2 = (void *) &max, + .extra1 = (void *) SYSCTL_ZERO, + .extra2 = (void *) SYSCTL_INT_MAX, }, #endif { } diff --git a/security/loadpin/loadpin.c b/security/loadpin/loadpin.c index 81519c804888..ee5cb944f4ad 100644 --- a/security/loadpin/loadpin.c +++ b/security/loadpin/loadpin.c @@ -43,8 +43,6 @@ static struct super_block *pinned_root; static DEFINE_SPINLOCK(pinned_root_spinlock); #ifdef CONFIG_SYSCTL -static int zero; -static int one = 1; static struct ctl_path loadpin_sysctl_path[] = { { .procname = "kernel", }, @@ -59,8 +57,8 @@ static struct ctl_table loadpin_sysctl_table[] = { .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_minmax, - .extra1 = &zero, - .extra2 = &one, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, }, { } }; diff --git a/security/yama/yama_lsm.c b/security/yama/yama_lsm.c index 01c6239c4493..94dc346370b1 100644 --- a/security/yama/yama_lsm.c +++ b/security/yama/yama_lsm.c @@ -445,7 +445,6 @@ static int yama_dointvec_minmax(struct ctl_table *table, int write, return proc_dointvec_minmax(&table_copy, write, buffer, lenp, ppos); } -static int zero; static int max_scope = YAMA_SCOPE_NO_ATTACH; static struct ctl_path yama_sysctl_path[] = { @@ -461,7 +460,7 @@ static struct ctl_table yama_sysctl_table[] = { .maxlen = sizeof(int), .mode = 0644, .proc_handler = yama_dointvec_minmax, - .extra1 = &zero, + .extra1 = SYSCTL_ZERO, .extra2 = &max_scope, }, { }