From c68ed7945701a38f2121ed74e23ff19c2052b4c2 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 8 Sep 2021 15:17:57 -0700 Subject: [PATCH 01/18] mm/vmstat: protect per cpu variables with preempt disable on RT Disable preemption on -RT for the vmstat code. On vanila the code runs in IRQ-off regions while on -RT it may not when stats are updated under a local_lock. "preempt_disable" ensures that the same resources is not updated in parallel due to preemption. This patch differs from the preempt-rt version where __count_vm_event and __count_vm_events are also protected. The counters are explicitly "allowed to be to be racy" so there is no need to protect them from preemption. Only the accurate page stats that are updated by a read-modify-write need protection. This patch also differs in that a preempt_[en|dis]able_rt helper is not used. As vmstat is the only user of the helper, it was suggested that it be open-coded in vmstat.c instead of risking the helper being used in unnecessary contexts. Link: https://lkml.kernel.org/r/20210805160019.1137-2-mgorman@techsingularity.net Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner Signed-off-by: Mel Gorman Acked-by: Vlastimil Babka Cc: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmstat.c | 48 ++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 48 insertions(+) diff --git a/mm/vmstat.c b/mm/vmstat.c index 0885a34197b7..8ce2620344b2 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -319,6 +319,16 @@ void __mod_zone_page_state(struct zone *zone, enum zone_stat_item item, long x; long t; + /* + * Accurate vmstat updates require a RMW. On !PREEMPT_RT kernels, + * atomicity is provided by IRQs being disabled -- either explicitly + * or via local_lock_irq. On PREEMPT_RT, local_lock_irq only disables + * CPU migrations and preemption potentially corrupts a counter so + * disable preemption. + */ + if (IS_ENABLED(CONFIG_PREEMPT_RT)) + preempt_disable(); + x = delta + __this_cpu_read(*p); t = __this_cpu_read(pcp->stat_threshold); @@ -328,6 +338,9 @@ void __mod_zone_page_state(struct zone *zone, enum zone_stat_item item, x = 0; } __this_cpu_write(*p, x); + + if (IS_ENABLED(CONFIG_PREEMPT_RT)) + preempt_enable(); } EXPORT_SYMBOL(__mod_zone_page_state); @@ -350,6 +363,10 @@ void __mod_node_page_state(struct pglist_data *pgdat, enum node_stat_item item, delta >>= PAGE_SHIFT; } + /* See __mod_node_page_state */ + if (IS_ENABLED(CONFIG_PREEMPT_RT)) + preempt_disable(); + x = delta + __this_cpu_read(*p); t = __this_cpu_read(pcp->stat_threshold); @@ -359,6 +376,9 @@ void __mod_node_page_state(struct pglist_data *pgdat, enum node_stat_item item, x = 0; } __this_cpu_write(*p, x); + + if (IS_ENABLED(CONFIG_PREEMPT_RT)) + preempt_enable(); } EXPORT_SYMBOL(__mod_node_page_state); @@ -391,6 +411,10 @@ void __inc_zone_state(struct zone *zone, enum zone_stat_item item) s8 __percpu *p = pcp->vm_stat_diff + item; s8 v, t; + /* See __mod_node_page_state */ + if (IS_ENABLED(CONFIG_PREEMPT_RT)) + preempt_disable(); + v = __this_cpu_inc_return(*p); t = __this_cpu_read(pcp->stat_threshold); if (unlikely(v > t)) { @@ -399,6 +423,9 @@ void __inc_zone_state(struct zone *zone, enum zone_stat_item item) zone_page_state_add(v + overstep, zone, item); __this_cpu_write(*p, -overstep); } + + if (IS_ENABLED(CONFIG_PREEMPT_RT)) + preempt_enable(); } void __inc_node_state(struct pglist_data *pgdat, enum node_stat_item item) @@ -409,6 +436,10 @@ void __inc_node_state(struct pglist_data *pgdat, enum node_stat_item item) VM_WARN_ON_ONCE(vmstat_item_in_bytes(item)); + /* See __mod_node_page_state */ + if (IS_ENABLED(CONFIG_PREEMPT_RT)) + preempt_disable(); + v = __this_cpu_inc_return(*p); t = __this_cpu_read(pcp->stat_threshold); if (unlikely(v > t)) { @@ -417,6 +448,9 @@ void __inc_node_state(struct pglist_data *pgdat, enum node_stat_item item) node_page_state_add(v + overstep, pgdat, item); __this_cpu_write(*p, -overstep); } + + if (IS_ENABLED(CONFIG_PREEMPT_RT)) + preempt_enable(); } void __inc_zone_page_state(struct page *page, enum zone_stat_item item) @@ -437,6 +471,10 @@ void __dec_zone_state(struct zone *zone, enum zone_stat_item item) s8 __percpu *p = pcp->vm_stat_diff + item; s8 v, t; + /* See __mod_node_page_state */ + if (IS_ENABLED(CONFIG_PREEMPT_RT)) + preempt_disable(); + v = __this_cpu_dec_return(*p); t = __this_cpu_read(pcp->stat_threshold); if (unlikely(v < - t)) { @@ -445,6 +483,9 @@ void __dec_zone_state(struct zone *zone, enum zone_stat_item item) zone_page_state_add(v - overstep, zone, item); __this_cpu_write(*p, overstep); } + + if (IS_ENABLED(CONFIG_PREEMPT_RT)) + preempt_enable(); } void __dec_node_state(struct pglist_data *pgdat, enum node_stat_item item) @@ -455,6 +496,10 @@ void __dec_node_state(struct pglist_data *pgdat, enum node_stat_item item) VM_WARN_ON_ONCE(vmstat_item_in_bytes(item)); + /* See __mod_node_page_state */ + if (IS_ENABLED(CONFIG_PREEMPT_RT)) + preempt_disable(); + v = __this_cpu_dec_return(*p); t = __this_cpu_read(pcp->stat_threshold); if (unlikely(v < - t)) { @@ -463,6 +508,9 @@ void __dec_node_state(struct pglist_data *pgdat, enum node_stat_item item) node_page_state_add(v - overstep, pgdat, item); __this_cpu_write(*p, overstep); } + + if (IS_ENABLED(CONFIG_PREEMPT_RT)) + preempt_enable(); } void __dec_zone_page_state(struct page *page, enum zone_stat_item item) From 2b9b624f5aef6af608edf541fed973948e27004c Mon Sep 17 00:00:00 2001 From: Baolin Wang Date: Wed, 8 Sep 2021 15:18:01 -0700 Subject: [PATCH 02/18] mm: migrate: introduce a local variable to get the number of pages Use thp_nr_pages() instead of compound_nr() to get the number of pages for THP page, meanwhile introducing a local variable 'nr_pages' to avoid getting the number of pages repeatedly. Link: https://lkml.kernel.org/r/a8e331ac04392ee230c79186330fb05e86a2aa77.1629447552.git.baolin.wang@linux.alibaba.com Signed-off-by: Baolin Wang Reviewed-by: Yang Shi Cc: Alistair Popple Cc: Matthew Wilcox Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/migrate.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/mm/migrate.c b/mm/migrate.c index a0aeb3fe46a7..d6617f8e546d 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -2107,6 +2107,7 @@ out: static int numamigrate_isolate_page(pg_data_t *pgdat, struct page *page) { int page_lru; + int nr_pages = thp_nr_pages(page); VM_BUG_ON_PAGE(compound_order(page) && !PageTransHuge(page), page); @@ -2115,7 +2116,7 @@ static int numamigrate_isolate_page(pg_data_t *pgdat, struct page *page) return 0; /* Avoid migrating to a node that is nearly full */ - if (!migrate_balanced_pgdat(pgdat, compound_nr(page))) + if (!migrate_balanced_pgdat(pgdat, nr_pages)) return 0; if (isolate_lru_page(page)) @@ -2123,7 +2124,7 @@ static int numamigrate_isolate_page(pg_data_t *pgdat, struct page *page) page_lru = page_is_file_lru(page); mod_node_page_state(page_pgdat(page), NR_ISOLATED_ANON + page_lru, - thp_nr_pages(page)); + nr_pages); /* * Isolating the page has taken another reference, so the From 68a9843f14b6b0d1ce023721814403253d8e9153 Mon Sep 17 00:00:00 2001 From: Baolin Wang Date: Wed, 8 Sep 2021 15:18:03 -0700 Subject: [PATCH 03/18] mm: migrate: fix the incorrect function name in comments since commit a98a2f0c8ce1 ("mm/rmap: split migration into its own function"), the migration ptes establishment has been split into a separate try_to_migrate() function, thus update the related comments. Link: https://lkml.kernel.org/r/5b824bad6183259c916ae6cf42f81d14c6118b06.1629447552.git.baolin.wang@linux.alibaba.com Signed-off-by: Baolin Wang Reviewed-by: Yang Shi Reviewed-by: Alistair Popple Cc: Matthew Wilcox Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/migrate.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/migrate.c b/mm/migrate.c index d6617f8e546d..d5a1d38c993c 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -1008,7 +1008,7 @@ static int __unmap_and_move(struct page *page, struct page *newpage, } /* - * By try_to_unmap(), page->mapcount goes down to 0 here. In this case, + * By try_to_migrate(), page->mapcount goes down to 0 here. In this case, * we cannot notice that anon_vma is freed while we migrates a page. * This get_anon_vma() delays freeing anon_vma pointer until the end * of migration. File cache pages are no problem because of page_lock() From 213ecb3157514486a9ae6848a298b91a79cc2e2a Mon Sep 17 00:00:00 2001 From: Baolin Wang Date: Wed, 8 Sep 2021 15:18:06 -0700 Subject: [PATCH 04/18] mm: migrate: change to use bool type for 'page_was_mapped' Change to use bool type for 'page_was_mapped' variable making it more readable. Link: https://lkml.kernel.org/r/ce1279df18d2c163998c403e0b5ec6d3f6f90f7a.1629447552.git.baolin.wang@linux.alibaba.com Signed-off-by: Baolin Wang Reviewed-by: Yang Shi Cc: Alistair Popple Cc: Matthew Wilcox Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/migrate.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mm/migrate.c b/mm/migrate.c index d5a1d38c993c..24c2ff8fe4c0 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -960,7 +960,7 @@ static int __unmap_and_move(struct page *page, struct page *newpage, int force, enum migrate_mode mode) { int rc = -EAGAIN; - int page_was_mapped = 0; + bool page_was_mapped = false; struct anon_vma *anon_vma = NULL; bool is_lru = !__PageMovable(page); @@ -1063,7 +1063,7 @@ static int __unmap_and_move(struct page *page, struct page *newpage, VM_BUG_ON_PAGE(PageAnon(page) && !PageKsm(page) && !anon_vma, page); try_to_migrate(page, 0); - page_was_mapped = 1; + page_was_mapped = true; } if (!page_mapped(page)) From 4b692e861619353ce069e547a67c8d0e32d9ef3d Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Wed, 8 Sep 2021 15:18:10 -0700 Subject: [PATCH 05/18] kexec: move locking into do_kexec_load Patch series "compat: remove compat_alloc_user_space", v5. Going through compat_alloc_user_space() to convert indirect system call arguments tends to add complexity compared to handling the native and compat logic in the same code. This patch (of 6): The locking is the same between the native and compat version of sys_kexec_load(), so it can be done in the common implementation to reduce duplication. Link: https://lkml.kernel.org/r/20210727144859.4150043-1-arnd@kernel.org Link: https://lkml.kernel.org/r/20210727144859.4150043-2-arnd@kernel.org Signed-off-by: Arnd Bergmann Co-developed-by: Eric Biederman Co-developed-by: Christoph Hellwig Acked-by: "Eric W. Biederman" Cc: Catalin Marinas Cc: Will Deacon Cc: Thomas Bogendoerfer Cc: "James E.J. Bottomley" Cc: Helge Deller Cc: Michael Ellerman Cc: Benjamin Herrenschmidt Cc: Paul Mackerras Cc: Heiko Carstens Cc: Vasily Gorbik Cc: Christian Borntraeger Cc: "David S. Miller" Cc: Thomas Gleixner Cc: Ingo Molnar Cc: Borislav Petkov Cc: "H. Peter Anvin" Cc: Al Viro Cc: Feng Tang Cc: Christoph Hellwig Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/kexec.c | 44 ++++++++++++++++---------------------------- 1 file changed, 16 insertions(+), 28 deletions(-) diff --git a/kernel/kexec.c b/kernel/kexec.c index c82c6c06f051..9c7aef8f4bb6 100644 --- a/kernel/kexec.c +++ b/kernel/kexec.c @@ -110,6 +110,17 @@ static int do_kexec_load(unsigned long entry, unsigned long nr_segments, unsigned long i; int ret; + /* + * Because we write directly to the reserved memory region when loading + * crash kernels we need a mutex here to prevent multiple crash kernels + * from attempting to load simultaneously, and to prevent a crash kernel + * from loading over the top of a in use crash kernel. + * + * KISS: always take the mutex. + */ + if (!mutex_trylock(&kexec_mutex)) + return -EBUSY; + if (flags & KEXEC_ON_CRASH) { dest_image = &kexec_crash_image; if (kexec_crash_image) @@ -121,7 +132,8 @@ static int do_kexec_load(unsigned long entry, unsigned long nr_segments, if (nr_segments == 0) { /* Uninstall image */ kimage_free(xchg(dest_image, NULL)); - return 0; + ret = 0; + goto out_unlock; } if (flags & KEXEC_ON_CRASH) { /* @@ -134,7 +146,7 @@ static int do_kexec_load(unsigned long entry, unsigned long nr_segments, ret = kimage_alloc_init(&image, entry, nr_segments, segments, flags); if (ret) - return ret; + goto out_unlock; if (flags & KEXEC_PRESERVE_CONTEXT) image->preserve_context = 1; @@ -171,6 +183,8 @@ out: arch_kexec_protect_crashkres(); kimage_free(image); +out_unlock: + mutex_unlock(&kexec_mutex); return ret; } @@ -247,21 +261,8 @@ SYSCALL_DEFINE4(kexec_load, unsigned long, entry, unsigned long, nr_segments, ((flags & KEXEC_ARCH_MASK) != KEXEC_ARCH_DEFAULT)) return -EINVAL; - /* Because we write directly to the reserved memory - * region when loading crash kernels we need a mutex here to - * prevent multiple crash kernels from attempting to load - * simultaneously, and to prevent a crash kernel from loading - * over the top of a in use crash kernel. - * - * KISS: always take the mutex. - */ - if (!mutex_trylock(&kexec_mutex)) - return -EBUSY; - result = do_kexec_load(entry, nr_segments, segments, flags); - mutex_unlock(&kexec_mutex); - return result; } @@ -301,21 +302,8 @@ COMPAT_SYSCALL_DEFINE4(kexec_load, compat_ulong_t, entry, return -EFAULT; } - /* Because we write directly to the reserved memory - * region when loading crash kernels we need a mutex here to - * prevent multiple crash kernels from attempting to load - * simultaneously, and to prevent a crash kernel from loading - * over the top of a in use crash kernel. - * - * KISS: always take the mutex. - */ - if (!mutex_trylock(&kexec_mutex)) - return -EBUSY; - result = do_kexec_load(entry, nr_segments, ksegments, flags); - mutex_unlock(&kexec_mutex); - return result; } #endif From 5d700a0fd71ded7096b97f01d276efc1a6579613 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Wed, 8 Sep 2021 15:18:13 -0700 Subject: [PATCH 06/18] kexec: avoid compat_alloc_user_space kimage_alloc_init() expects a __user pointer, so compat_sys_kexec_load() uses compat_alloc_user_space() to convert the layout and put it back onto the user space caller stack. Moving the user space access into the syscall handler directly actually makes the code simpler, as the conversion for compat mode can now be done on kernel memory. Link: https://lkml.kernel.org/r/20210727144859.4150043-3-arnd@kernel.org Link: https://lore.kernel.org/lkml/YPbtsU4GX6PL7%2F42@infradead.org/ Link: https://lore.kernel.org/lkml/m1y2cbzmnw.fsf@fess.ebiederm.org/ Signed-off-by: Arnd Bergmann Co-developed-by: Eric Biederman Co-developed-by: Christoph Hellwig Acked-by: "Eric W. Biederman" Cc: Al Viro Cc: Benjamin Herrenschmidt Cc: Borislav Petkov Cc: Catalin Marinas Cc: Christian Borntraeger Cc: Christoph Hellwig Cc: "David S. Miller" Cc: Feng Tang Cc: Heiko Carstens Cc: Helge Deller Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: "James E.J. Bottomley" Cc: Michael Ellerman Cc: Paul Mackerras Cc: Thomas Bogendoerfer Cc: Thomas Gleixner Cc: Vasily Gorbik Cc: Will Deacon Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/kexec.c | 61 +++++++++++++++++++++----------------------------- 1 file changed, 25 insertions(+), 36 deletions(-) diff --git a/kernel/kexec.c b/kernel/kexec.c index 9c7aef8f4bb6..b5e40f069768 100644 --- a/kernel/kexec.c +++ b/kernel/kexec.c @@ -19,26 +19,9 @@ #include "kexec_internal.h" -static int copy_user_segment_list(struct kimage *image, - unsigned long nr_segments, - struct kexec_segment __user *segments) -{ - int ret; - size_t segment_bytes; - - /* Read in the segments */ - image->nr_segments = nr_segments; - segment_bytes = nr_segments * sizeof(*segments); - ret = copy_from_user(image->segment, segments, segment_bytes); - if (ret) - ret = -EFAULT; - - return ret; -} - static int kimage_alloc_init(struct kimage **rimage, unsigned long entry, unsigned long nr_segments, - struct kexec_segment __user *segments, + struct kexec_segment *segments, unsigned long flags) { int ret; @@ -58,10 +41,8 @@ static int kimage_alloc_init(struct kimage **rimage, unsigned long entry, return -ENOMEM; image->start = entry; - - ret = copy_user_segment_list(image, nr_segments, segments); - if (ret) - goto out_free_image; + image->nr_segments = nr_segments; + memcpy(image->segment, segments, nr_segments * sizeof(*segments)); if (kexec_on_panic) { /* Enable special crash kernel control page alloc policy. */ @@ -104,7 +85,7 @@ out_free_image: } static int do_kexec_load(unsigned long entry, unsigned long nr_segments, - struct kexec_segment __user *segments, unsigned long flags) + struct kexec_segment *segments, unsigned long flags) { struct kimage **dest_image, *image; unsigned long i; @@ -250,7 +231,8 @@ static inline int kexec_load_check(unsigned long nr_segments, SYSCALL_DEFINE4(kexec_load, unsigned long, entry, unsigned long, nr_segments, struct kexec_segment __user *, segments, unsigned long, flags) { - int result; + struct kexec_segment *ksegments; + unsigned long result; result = kexec_load_check(nr_segments, flags); if (result) @@ -261,7 +243,12 @@ SYSCALL_DEFINE4(kexec_load, unsigned long, entry, unsigned long, nr_segments, ((flags & KEXEC_ARCH_MASK) != KEXEC_ARCH_DEFAULT)) return -EINVAL; - result = do_kexec_load(entry, nr_segments, segments, flags); + ksegments = memdup_user(segments, nr_segments * sizeof(ksegments[0])); + if (IS_ERR(ksegments)) + return PTR_ERR(ksegments); + + result = do_kexec_load(entry, nr_segments, ksegments, flags); + kfree(ksegments); return result; } @@ -273,7 +260,7 @@ COMPAT_SYSCALL_DEFINE4(kexec_load, compat_ulong_t, entry, compat_ulong_t, flags) { struct compat_kexec_segment in; - struct kexec_segment out, __user *ksegments; + struct kexec_segment *ksegments; unsigned long i, result; result = kexec_load_check(nr_segments, flags); @@ -286,24 +273,26 @@ COMPAT_SYSCALL_DEFINE4(kexec_load, compat_ulong_t, entry, if ((flags & KEXEC_ARCH_MASK) == KEXEC_ARCH_DEFAULT) return -EINVAL; - ksegments = compat_alloc_user_space(nr_segments * sizeof(out)); + ksegments = kmalloc_array(nr_segments, sizeof(ksegments[0]), + GFP_KERNEL); + if (!ksegments) + return -ENOMEM; + for (i = 0; i < nr_segments; i++) { result = copy_from_user(&in, &segments[i], sizeof(in)); if (result) - return -EFAULT; + goto fail; - out.buf = compat_ptr(in.buf); - out.bufsz = in.bufsz; - out.mem = in.mem; - out.memsz = in.memsz; - - result = copy_to_user(&ksegments[i], &out, sizeof(out)); - if (result) - return -EFAULT; + ksegments[i].buf = compat_ptr(in.buf); + ksegments[i].bufsz = in.bufsz; + ksegments[i].mem = in.mem; + ksegments[i].memsz = in.memsz; } result = do_kexec_load(entry, nr_segments, ksegments, flags); +fail: + kfree(ksegments); return result; } #endif From 5b1b561ba73c8ab9c98e5dfd14dc7ee47efb6530 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Wed, 8 Sep 2021 15:18:17 -0700 Subject: [PATCH 07/18] mm: simplify compat_sys_move_pages The compat move_pages() implementation uses compat_alloc_user_space() for converting the pointer array. Moving the compat handling into the function itself is a bit simpler and lets us avoid the compat_alloc_user_space() call. Link: https://lkml.kernel.org/r/20210727144859.4150043-4-arnd@kernel.org Signed-off-by: Arnd Bergmann Reviewed-by: Christoph Hellwig Cc: Al Viro Cc: Benjamin Herrenschmidt Cc: Borislav Petkov Cc: Catalin Marinas Cc: Christian Borntraeger Cc: Christoph Hellwig Cc: "David S. Miller" Cc: Eric Biederman Cc: Feng Tang Cc: Heiko Carstens Cc: Helge Deller Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: "James E.J. Bottomley" Cc: Michael Ellerman Cc: Paul Mackerras Cc: Thomas Bogendoerfer Cc: Thomas Gleixner Cc: Vasily Gorbik Cc: Will Deacon Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/migrate.c | 45 ++++++++++++++++++++++++++++++--------------- 1 file changed, 30 insertions(+), 15 deletions(-) diff --git a/mm/migrate.c b/mm/migrate.c index 24c2ff8fe4c0..2bc494875cea 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -1900,6 +1900,23 @@ set_status: mmap_read_unlock(mm); } +static int get_compat_pages_array(const void __user *chunk_pages[], + const void __user * __user *pages, + unsigned long chunk_nr) +{ + compat_uptr_t __user *pages32 = (compat_uptr_t __user *)pages; + compat_uptr_t p; + int i; + + for (i = 0; i < chunk_nr; i++) { + if (get_user(p, pages32 + i)) + return -EFAULT; + chunk_pages[i] = compat_ptr(p); + } + + return 0; +} + /* * Determine the nodes of a user array of pages and store it in * a user array of status. @@ -1919,8 +1936,15 @@ static int do_pages_stat(struct mm_struct *mm, unsigned long nr_pages, if (chunk_nr > DO_PAGES_STAT_CHUNK_NR) chunk_nr = DO_PAGES_STAT_CHUNK_NR; - if (copy_from_user(chunk_pages, pages, chunk_nr * sizeof(*chunk_pages))) - break; + if (in_compat_syscall()) { + if (get_compat_pages_array(chunk_pages, pages, + chunk_nr)) + break; + } else { + if (copy_from_user(chunk_pages, pages, + chunk_nr * sizeof(*chunk_pages))) + break; + } do_pages_stat_array(mm, chunk_nr, chunk_pages, chunk_status); @@ -2025,23 +2049,14 @@ SYSCALL_DEFINE6(move_pages, pid_t, pid, unsigned long, nr_pages, #ifdef CONFIG_COMPAT COMPAT_SYSCALL_DEFINE6(move_pages, pid_t, pid, compat_ulong_t, nr_pages, - compat_uptr_t __user *, pages32, + compat_uptr_t __user *, pages, const int __user *, nodes, int __user *, status, int, flags) { - const void __user * __user *pages; - int i; - - pages = compat_alloc_user_space(nr_pages * sizeof(void *)); - for (i = 0; i < nr_pages; i++) { - compat_uptr_t p; - - if (get_user(p, pages32 + i) || - put_user(compat_ptr(p), pages + i)) - return -EFAULT; - } - return kernel_move_pages(pid, nr_pages, pages, nodes, status, flags); + return kernel_move_pages(pid, nr_pages, + (const void __user *__user *)pages, + nodes, status, flags); } #endif /* CONFIG_COMPAT */ From e130242dc351f1cfa2bbeb6766a1486ce936ef88 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Wed, 8 Sep 2021 15:18:21 -0700 Subject: [PATCH 08/18] mm: simplify compat numa syscalls The compat implementations for mbind, get_mempolicy, set_mempolicy and migrate_pages are just there to handle the subtly different layout of bitmaps on 32-bit hosts. The compat implementation however lacks some of the checks that are present in the native one, in particular for checking that the extra bits are all zero when user space has a larger mask size than the kernel. Worse, those extra bits do not get cleared when copying in or out of the kernel, which can lead to incorrect data as well. Unify the implementation to handle the compat bitmap layout directly in the get_nodes() and copy_nodes_to_user() helpers. Splitting out the get_bitmap() helper from get_nodes() also helps readability of the native case. On x86, two additional problems are addressed by this: compat tasks can pass a bitmap at the end of a mapping, causing a fault when reading across the page boundary for a 64-bit word. x32 tasks might also run into problems with get_mempolicy corrupting data when an odd number of 32-bit words gets passed. On parisc the migrate_pages() system call apparently had the wrong calling convention, as big-endian architectures expect the words inside of a bitmap to be swapped. This is not a problem though since parisc has no NUMA support. [arnd@arndb.de: fix mempolicy crash] Link: https://lkml.kernel.org/r/20210730143417.3700653-1-arnd@kernel.org Link: https://lore.kernel.org/lkml/YQPLG20V3dmOfq3a@osiris/ Link: https://lkml.kernel.org/r/20210727144859.4150043-5-arnd@kernel.org Signed-off-by: Arnd Bergmann Reviewed-by: Christoph Hellwig Cc: Al Viro Cc: Benjamin Herrenschmidt Cc: Borislav Petkov Cc: Catalin Marinas Cc: Christian Borntraeger Cc: Christoph Hellwig Cc: "David S. Miller" Cc: Eric Biederman Cc: Feng Tang Cc: Heiko Carstens Cc: Helge Deller Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: "James E.J. Bottomley" Cc: Michael Ellerman Cc: Paul Mackerras Cc: Thomas Bogendoerfer Cc: Thomas Gleixner Cc: Vasily Gorbik Cc: Will Deacon Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/compat.h | 17 ++-- mm/mempolicy.c | 176 +++++++++++++---------------------------- 2 files changed, 64 insertions(+), 129 deletions(-) diff --git a/include/linux/compat.h b/include/linux/compat.h index 8e0598c7d1d1..3a2ac5afee30 100644 --- a/include/linux/compat.h +++ b/include/linux/compat.h @@ -395,14 +395,6 @@ struct compat_kexec_segment; struct compat_mq_attr; struct compat_msgbuf; -#define BITS_PER_COMPAT_LONG (8*sizeof(compat_long_t)) - -#define BITS_TO_COMPAT_LONGS(bits) DIV_ROUND_UP(bits, BITS_PER_COMPAT_LONG) - -long compat_get_bitmap(unsigned long *mask, const compat_ulong_t __user *umask, - unsigned long bitmap_size); -long compat_put_bitmap(compat_ulong_t __user *umask, unsigned long *mask, - unsigned long bitmap_size); void copy_siginfo_to_external32(struct compat_siginfo *to, const struct kernel_siginfo *from); int copy_siginfo_from_user32(kernel_siginfo_t *to, @@ -976,6 +968,15 @@ static inline bool in_compat_syscall(void) { return false; } #endif /* CONFIG_COMPAT */ +#define BITS_PER_COMPAT_LONG (8*sizeof(compat_long_t)) + +#define BITS_TO_COMPAT_LONGS(bits) DIV_ROUND_UP(bits, BITS_PER_COMPAT_LONG) + +long compat_get_bitmap(unsigned long *mask, const compat_ulong_t __user *umask, + unsigned long bitmap_size); +long compat_put_bitmap(compat_ulong_t __user *umask, unsigned long *mask, + unsigned long bitmap_size); + /* * Some legacy ABIs like the i386 one use less than natural alignment for 64-bit * types, and will need special compat treatment for that. Most architectures diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 5e90b3fb7794..eb95578f5997 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -1362,16 +1362,33 @@ mpol_out: /* * User space interface with variable sized bitmaps for nodelists. */ +static int get_bitmap(unsigned long *mask, const unsigned long __user *nmask, + unsigned long maxnode) +{ + unsigned long nlongs = BITS_TO_LONGS(maxnode); + int ret; + + if (in_compat_syscall()) + ret = compat_get_bitmap(mask, + (const compat_ulong_t __user *)nmask, + maxnode); + else + ret = copy_from_user(mask, nmask, + nlongs * sizeof(unsigned long)); + + if (ret) + return -EFAULT; + + if (maxnode % BITS_PER_LONG) + mask[nlongs - 1] &= (1UL << (maxnode % BITS_PER_LONG)) - 1; + + return 0; +} /* Copy a node mask from user space. */ static int get_nodes(nodemask_t *nodes, const unsigned long __user *nmask, unsigned long maxnode) { - unsigned long k; - unsigned long t; - unsigned long nlongs; - unsigned long endmask; - --maxnode; nodes_clear(*nodes); if (maxnode == 0 || !nmask) @@ -1379,49 +1396,29 @@ static int get_nodes(nodemask_t *nodes, const unsigned long __user *nmask, if (maxnode > PAGE_SIZE*BITS_PER_BYTE) return -EINVAL; - nlongs = BITS_TO_LONGS(maxnode); - if ((maxnode % BITS_PER_LONG) == 0) - endmask = ~0UL; - else - endmask = (1UL << (maxnode % BITS_PER_LONG)) - 1; - /* * When the user specified more nodes than supported just check - * if the non supported part is all zero. - * - * If maxnode have more longs than MAX_NUMNODES, check - * the bits in that area first. And then go through to - * check the rest bits which equal or bigger than MAX_NUMNODES. - * Otherwise, just check bits [MAX_NUMNODES, maxnode). + * if the non supported part is all zero, one word at a time, + * starting at the end. */ - if (nlongs > BITS_TO_LONGS(MAX_NUMNODES)) { - for (k = BITS_TO_LONGS(MAX_NUMNODES); k < nlongs; k++) { - if (get_user(t, nmask + k)) - return -EFAULT; - if (k == nlongs - 1) { - if (t & endmask) - return -EINVAL; - } else if (t) - return -EINVAL; - } - nlongs = BITS_TO_LONGS(MAX_NUMNODES); - endmask = ~0UL; - } + while (maxnode > MAX_NUMNODES) { + unsigned long bits = min_t(unsigned long, maxnode, BITS_PER_LONG); + unsigned long t; - if (maxnode > MAX_NUMNODES && MAX_NUMNODES % BITS_PER_LONG != 0) { - unsigned long valid_mask = endmask; - - valid_mask &= ~((1UL << (MAX_NUMNODES % BITS_PER_LONG)) - 1); - if (get_user(t, nmask + nlongs - 1)) + if (get_bitmap(&t, &nmask[maxnode / BITS_PER_LONG], bits)) return -EFAULT; - if (t & valid_mask) + + if (maxnode - bits >= MAX_NUMNODES) { + maxnode -= bits; + } else { + maxnode = MAX_NUMNODES; + t &= ~((1UL << (MAX_NUMNODES % BITS_PER_LONG)) - 1); + } + if (t) return -EINVAL; } - if (copy_from_user(nodes_addr(*nodes), nmask, nlongs*sizeof(unsigned long))) - return -EFAULT; - nodes_addr(*nodes)[nlongs-1] &= endmask; - return 0; + return get_bitmap(nodes_addr(*nodes), nmask, maxnode); } /* Copy a kernel node mask to user space */ @@ -1430,6 +1427,10 @@ static int copy_nodes_to_user(unsigned long __user *mask, unsigned long maxnode, { unsigned long copy = ALIGN(maxnode-1, 64) / 8; unsigned int nbytes = BITS_TO_LONGS(nr_node_ids) * sizeof(long); + bool compat = in_compat_syscall(); + + if (compat) + nbytes = BITS_TO_COMPAT_LONGS(nr_node_ids) * sizeof(compat_long_t); if (copy > nbytes) { if (copy > PAGE_SIZE) @@ -1437,7 +1438,13 @@ static int copy_nodes_to_user(unsigned long __user *mask, unsigned long maxnode, if (clear_user((char __user *)mask + nbytes, copy - nbytes)) return -EFAULT; copy = nbytes; + maxnode = nr_node_ids; } + + if (compat) + return compat_put_bitmap((compat_ulong_t __user *)mask, + nodes_addr(*nodes), maxnode); + return copy_to_user(mask, nodes_addr(*nodes), copy) ? -EFAULT : 0; } @@ -1649,72 +1656,22 @@ COMPAT_SYSCALL_DEFINE5(get_mempolicy, int __user *, policy, compat_ulong_t, maxnode, compat_ulong_t, addr, compat_ulong_t, flags) { - long err; - unsigned long __user *nm = NULL; - unsigned long nr_bits, alloc_size; - DECLARE_BITMAP(bm, MAX_NUMNODES); - - nr_bits = min_t(unsigned long, maxnode-1, nr_node_ids); - alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8; - - if (nmask) - nm = compat_alloc_user_space(alloc_size); - - err = kernel_get_mempolicy(policy, nm, nr_bits+1, addr, flags); - - if (!err && nmask) { - unsigned long copy_size; - copy_size = min_t(unsigned long, sizeof(bm), alloc_size); - err = copy_from_user(bm, nm, copy_size); - /* ensure entire bitmap is zeroed */ - err |= clear_user(nmask, ALIGN(maxnode-1, 8) / 8); - err |= compat_put_bitmap(nmask, bm, nr_bits); - } - - return err; + return kernel_get_mempolicy(policy, (unsigned long __user *)nmask, + maxnode, addr, flags); } COMPAT_SYSCALL_DEFINE3(set_mempolicy, int, mode, compat_ulong_t __user *, nmask, compat_ulong_t, maxnode) { - unsigned long __user *nm = NULL; - unsigned long nr_bits, alloc_size; - DECLARE_BITMAP(bm, MAX_NUMNODES); - - nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES); - alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8; - - if (nmask) { - if (compat_get_bitmap(bm, nmask, nr_bits)) - return -EFAULT; - nm = compat_alloc_user_space(alloc_size); - if (copy_to_user(nm, bm, alloc_size)) - return -EFAULT; - } - - return kernel_set_mempolicy(mode, nm, nr_bits+1); + return kernel_set_mempolicy(mode, (unsigned long __user *)nmask, maxnode); } COMPAT_SYSCALL_DEFINE6(mbind, compat_ulong_t, start, compat_ulong_t, len, compat_ulong_t, mode, compat_ulong_t __user *, nmask, compat_ulong_t, maxnode, compat_ulong_t, flags) { - unsigned long __user *nm = NULL; - unsigned long nr_bits, alloc_size; - nodemask_t bm; - - nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES); - alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8; - - if (nmask) { - if (compat_get_bitmap(nodes_addr(bm), nmask, nr_bits)) - return -EFAULT; - nm = compat_alloc_user_space(alloc_size); - if (copy_to_user(nm, nodes_addr(bm), alloc_size)) - return -EFAULT; - } - - return kernel_mbind(start, len, mode, nm, nr_bits+1, flags); + return kernel_mbind(start, len, mode, (unsigned long __user *)nmask, + maxnode, flags); } COMPAT_SYSCALL_DEFINE4(migrate_pages, compat_pid_t, pid, @@ -1722,32 +1679,9 @@ COMPAT_SYSCALL_DEFINE4(migrate_pages, compat_pid_t, pid, const compat_ulong_t __user *, old_nodes, const compat_ulong_t __user *, new_nodes) { - unsigned long __user *old = NULL; - unsigned long __user *new = NULL; - nodemask_t tmp_mask; - unsigned long nr_bits; - unsigned long size; - - nr_bits = min_t(unsigned long, maxnode - 1, MAX_NUMNODES); - size = ALIGN(nr_bits, BITS_PER_LONG) / 8; - if (old_nodes) { - if (compat_get_bitmap(nodes_addr(tmp_mask), old_nodes, nr_bits)) - return -EFAULT; - old = compat_alloc_user_space(new_nodes ? size * 2 : size); - if (new_nodes) - new = old + size / sizeof(unsigned long); - if (copy_to_user(old, nodes_addr(tmp_mask), size)) - return -EFAULT; - } - if (new_nodes) { - if (compat_get_bitmap(nodes_addr(tmp_mask), new_nodes, nr_bits)) - return -EFAULT; - if (new == NULL) - new = compat_alloc_user_space(size); - if (copy_to_user(new, nodes_addr(tmp_mask), size)) - return -EFAULT; - } - return kernel_migrate_pages(pid, nr_bits + 1, old, new); + return kernel_migrate_pages(pid, maxnode, + (const unsigned long __user *)old_nodes, + (const unsigned long __user *)new_nodes); } #endif /* CONFIG_COMPAT */ From 59ab844eed9c6b01d32dcb27b57accc23771b324 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Wed, 8 Sep 2021 15:18:25 -0700 Subject: [PATCH 09/18] compat: remove some compat entry points These are all handled correctly when calling the native system call entry point, so remove the special cases. Link: https://lkml.kernel.org/r/20210727144859.4150043-6-arnd@kernel.org Signed-off-by: Arnd Bergmann Reviewed-by: Christoph Hellwig Cc: Al Viro Cc: Benjamin Herrenschmidt Cc: Borislav Petkov Cc: Catalin Marinas Cc: Christian Borntraeger Cc: Christoph Hellwig Cc: "David S. Miller" Cc: Eric Biederman Cc: Feng Tang Cc: Heiko Carstens Cc: Helge Deller Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: "James E.J. Bottomley" Cc: Michael Ellerman Cc: Paul Mackerras Cc: Thomas Bogendoerfer Cc: Thomas Gleixner Cc: Vasily Gorbik Cc: Will Deacon Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/arm64/include/asm/unistd32.h | 10 +++--- arch/mips/kernel/syscalls/syscall_n32.tbl | 10 +++--- arch/mips/kernel/syscalls/syscall_o32.tbl | 10 +++--- arch/parisc/kernel/syscalls/syscall.tbl | 8 ++--- arch/powerpc/kernel/syscalls/syscall.tbl | 10 +++--- arch/s390/kernel/syscalls/syscall.tbl | 10 +++--- arch/sparc/kernel/syscalls/syscall.tbl | 10 +++--- arch/x86/entry/syscalls/syscall_32.tbl | 4 +-- arch/x86/entry/syscalls/syscall_64.tbl | 2 +- include/linux/compat.h | 20 ------------ include/uapi/asm-generic/unistd.h | 10 +++--- kernel/sys_ni.c | 5 --- mm/mempolicy.c | 37 ----------------------- mm/migrate.c | 13 -------- 14 files changed, 42 insertions(+), 117 deletions(-) diff --git a/arch/arm64/include/asm/unistd32.h b/arch/arm64/include/asm/unistd32.h index 4e99e4b912ef..844f6ae58662 100644 --- a/arch/arm64/include/asm/unistd32.h +++ b/arch/arm64/include/asm/unistd32.h @@ -649,11 +649,11 @@ __SYSCALL(__NR_inotify_add_watch, sys_inotify_add_watch) #define __NR_inotify_rm_watch 318 __SYSCALL(__NR_inotify_rm_watch, sys_inotify_rm_watch) #define __NR_mbind 319 -__SYSCALL(__NR_mbind, compat_sys_mbind) +__SYSCALL(__NR_mbind, sys_mbind) #define __NR_get_mempolicy 320 -__SYSCALL(__NR_get_mempolicy, compat_sys_get_mempolicy) +__SYSCALL(__NR_get_mempolicy, sys_get_mempolicy) #define __NR_set_mempolicy 321 -__SYSCALL(__NR_set_mempolicy, compat_sys_set_mempolicy) +__SYSCALL(__NR_set_mempolicy, sys_set_mempolicy) #define __NR_openat 322 __SYSCALL(__NR_openat, compat_sys_openat) #define __NR_mkdirat 323 @@ -699,7 +699,7 @@ __SYSCALL(__NR_tee, sys_tee) #define __NR_vmsplice 343 __SYSCALL(__NR_vmsplice, sys_vmsplice) #define __NR_move_pages 344 -__SYSCALL(__NR_move_pages, compat_sys_move_pages) +__SYSCALL(__NR_move_pages, sys_move_pages) #define __NR_getcpu 345 __SYSCALL(__NR_getcpu, sys_getcpu) #define __NR_epoll_pwait 346 @@ -811,7 +811,7 @@ __SYSCALL(__NR_rseq, sys_rseq) #define __NR_io_pgetevents 399 __SYSCALL(__NR_io_pgetevents, compat_sys_io_pgetevents) #define __NR_migrate_pages 400 -__SYSCALL(__NR_migrate_pages, compat_sys_migrate_pages) +__SYSCALL(__NR_migrate_pages, sys_migrate_pages) #define __NR_kexec_file_load 401 __SYSCALL(__NR_kexec_file_load, sys_kexec_file_load) /* 402 is unused */ diff --git a/arch/mips/kernel/syscalls/syscall_n32.tbl b/arch/mips/kernel/syscalls/syscall_n32.tbl index 56c8d3cf42ed..70e32de2bcaa 100644 --- a/arch/mips/kernel/syscalls/syscall_n32.tbl +++ b/arch/mips/kernel/syscalls/syscall_n32.tbl @@ -239,9 +239,9 @@ 228 n32 clock_nanosleep sys_clock_nanosleep_time32 229 n32 tgkill sys_tgkill 230 n32 utimes sys_utimes_time32 -231 n32 mbind compat_sys_mbind -232 n32 get_mempolicy compat_sys_get_mempolicy -233 n32 set_mempolicy compat_sys_set_mempolicy +231 n32 mbind sys_mbind +232 n32 get_mempolicy sys_get_mempolicy +233 n32 set_mempolicy sys_set_mempolicy 234 n32 mq_open compat_sys_mq_open 235 n32 mq_unlink sys_mq_unlink 236 n32 mq_timedsend sys_mq_timedsend_time32 @@ -258,7 +258,7 @@ 247 n32 inotify_init sys_inotify_init 248 n32 inotify_add_watch sys_inotify_add_watch 249 n32 inotify_rm_watch sys_inotify_rm_watch -250 n32 migrate_pages compat_sys_migrate_pages +250 n32 migrate_pages sys_migrate_pages 251 n32 openat sys_openat 252 n32 mkdirat sys_mkdirat 253 n32 mknodat sys_mknodat @@ -279,7 +279,7 @@ 268 n32 sync_file_range sys_sync_file_range 269 n32 tee sys_tee 270 n32 vmsplice sys_vmsplice -271 n32 move_pages compat_sys_move_pages +271 n32 move_pages sys_move_pages 272 n32 set_robust_list compat_sys_set_robust_list 273 n32 get_robust_list compat_sys_get_robust_list 274 n32 kexec_load compat_sys_kexec_load diff --git a/arch/mips/kernel/syscalls/syscall_o32.tbl b/arch/mips/kernel/syscalls/syscall_o32.tbl index 201237fd0f43..a61c35edaa74 100644 --- a/arch/mips/kernel/syscalls/syscall_o32.tbl +++ b/arch/mips/kernel/syscalls/syscall_o32.tbl @@ -279,9 +279,9 @@ 265 o32 clock_nanosleep sys_clock_nanosleep_time32 266 o32 tgkill sys_tgkill 267 o32 utimes sys_utimes_time32 -268 o32 mbind sys_mbind compat_sys_mbind -269 o32 get_mempolicy sys_get_mempolicy compat_sys_get_mempolicy -270 o32 set_mempolicy sys_set_mempolicy compat_sys_set_mempolicy +268 o32 mbind sys_mbind +269 o32 get_mempolicy sys_get_mempolicy +270 o32 set_mempolicy sys_set_mempolicy 271 o32 mq_open sys_mq_open compat_sys_mq_open 272 o32 mq_unlink sys_mq_unlink 273 o32 mq_timedsend sys_mq_timedsend_time32 @@ -298,7 +298,7 @@ 284 o32 inotify_init sys_inotify_init 285 o32 inotify_add_watch sys_inotify_add_watch 286 o32 inotify_rm_watch sys_inotify_rm_watch -287 o32 migrate_pages sys_migrate_pages compat_sys_migrate_pages +287 o32 migrate_pages sys_migrate_pages 288 o32 openat sys_openat compat_sys_openat 289 o32 mkdirat sys_mkdirat 290 o32 mknodat sys_mknodat @@ -319,7 +319,7 @@ 305 o32 sync_file_range sys_sync_file_range sys32_sync_file_range 306 o32 tee sys_tee 307 o32 vmsplice sys_vmsplice -308 o32 move_pages sys_move_pages compat_sys_move_pages +308 o32 move_pages sys_move_pages 309 o32 set_robust_list sys_set_robust_list compat_sys_set_robust_list 310 o32 get_robust_list sys_get_robust_list compat_sys_get_robust_list 311 o32 kexec_load sys_kexec_load compat_sys_kexec_load diff --git a/arch/parisc/kernel/syscalls/syscall.tbl b/arch/parisc/kernel/syscalls/syscall.tbl index 0bf854b70612..bf751e0732b7 100644 --- a/arch/parisc/kernel/syscalls/syscall.tbl +++ b/arch/parisc/kernel/syscalls/syscall.tbl @@ -292,9 +292,9 @@ 258 32 clock_nanosleep sys_clock_nanosleep_time32 258 64 clock_nanosleep sys_clock_nanosleep 259 common tgkill sys_tgkill -260 common mbind sys_mbind compat_sys_mbind -261 common get_mempolicy sys_get_mempolicy compat_sys_get_mempolicy -262 common set_mempolicy sys_set_mempolicy compat_sys_set_mempolicy +260 common mbind sys_mbind +261 common get_mempolicy sys_get_mempolicy +262 common set_mempolicy sys_set_mempolicy # 263 was vserver 264 common add_key sys_add_key 265 common request_key sys_request_key @@ -331,7 +331,7 @@ 292 64 sync_file_range sys_sync_file_range 293 common tee sys_tee 294 common vmsplice sys_vmsplice -295 common move_pages sys_move_pages compat_sys_move_pages +295 common move_pages sys_move_pages 296 common getcpu sys_getcpu 297 common epoll_pwait sys_epoll_pwait compat_sys_epoll_pwait 298 common statfs64 sys_statfs64 compat_sys_statfs64 diff --git a/arch/powerpc/kernel/syscalls/syscall.tbl b/arch/powerpc/kernel/syscalls/syscall.tbl index 29b55e2e035c..7bef917cc84e 100644 --- a/arch/powerpc/kernel/syscalls/syscall.tbl +++ b/arch/powerpc/kernel/syscalls/syscall.tbl @@ -330,10 +330,10 @@ 256 64 sys_debug_setcontext sys_ni_syscall 256 spu sys_debug_setcontext sys_ni_syscall # 257 reserved for vserver -258 nospu migrate_pages sys_migrate_pages compat_sys_migrate_pages -259 nospu mbind sys_mbind compat_sys_mbind -260 nospu get_mempolicy sys_get_mempolicy compat_sys_get_mempolicy -261 nospu set_mempolicy sys_set_mempolicy compat_sys_set_mempolicy +258 nospu migrate_pages sys_migrate_pages +259 nospu mbind sys_mbind +260 nospu get_mempolicy sys_get_mempolicy +261 nospu set_mempolicy sys_set_mempolicy 262 nospu mq_open sys_mq_open compat_sys_mq_open 263 nospu mq_unlink sys_mq_unlink 264 32 mq_timedsend sys_mq_timedsend_time32 @@ -381,7 +381,7 @@ 298 common faccessat sys_faccessat 299 common get_robust_list sys_get_robust_list compat_sys_get_robust_list 300 common set_robust_list sys_set_robust_list compat_sys_set_robust_list -301 common move_pages sys_move_pages compat_sys_move_pages +301 common move_pages sys_move_pages 302 common getcpu sys_getcpu 303 nospu epoll_pwait sys_epoll_pwait compat_sys_epoll_pwait 304 32 utimensat sys_utimensat_time32 diff --git a/arch/s390/kernel/syscalls/syscall.tbl b/arch/s390/kernel/syscalls/syscall.tbl index aa9d68b8ee14..df5261e5cfe1 100644 --- a/arch/s390/kernel/syscalls/syscall.tbl +++ b/arch/s390/kernel/syscalls/syscall.tbl @@ -274,9 +274,9 @@ 265 common statfs64 sys_statfs64 compat_sys_statfs64 266 common fstatfs64 sys_fstatfs64 compat_sys_fstatfs64 267 common remap_file_pages sys_remap_file_pages sys_remap_file_pages -268 common mbind sys_mbind compat_sys_mbind -269 common get_mempolicy sys_get_mempolicy compat_sys_get_mempolicy -270 common set_mempolicy sys_set_mempolicy compat_sys_set_mempolicy +268 common mbind sys_mbind sys_mbind +269 common get_mempolicy sys_get_mempolicy sys_get_mempolicy +270 common set_mempolicy sys_set_mempolicy sys_set_mempolicy 271 common mq_open sys_mq_open compat_sys_mq_open 272 common mq_unlink sys_mq_unlink sys_mq_unlink 273 common mq_timedsend sys_mq_timedsend sys_mq_timedsend_time32 @@ -293,7 +293,7 @@ 284 common inotify_init sys_inotify_init sys_inotify_init 285 common inotify_add_watch sys_inotify_add_watch sys_inotify_add_watch 286 common inotify_rm_watch sys_inotify_rm_watch sys_inotify_rm_watch -287 common migrate_pages sys_migrate_pages compat_sys_migrate_pages +287 common migrate_pages sys_migrate_pages sys_migrate_pages 288 common openat sys_openat compat_sys_openat 289 common mkdirat sys_mkdirat sys_mkdirat 290 common mknodat sys_mknodat sys_mknodat @@ -317,7 +317,7 @@ 307 common sync_file_range sys_sync_file_range compat_sys_s390_sync_file_range 308 common tee sys_tee sys_tee 309 common vmsplice sys_vmsplice sys_vmsplice -310 common move_pages sys_move_pages compat_sys_move_pages +310 common move_pages sys_move_pages sys_move_pages 311 common getcpu sys_getcpu sys_getcpu 312 common epoll_pwait sys_epoll_pwait compat_sys_epoll_pwait 313 common utimes sys_utimes sys_utimes_time32 diff --git a/arch/sparc/kernel/syscalls/syscall.tbl b/arch/sparc/kernel/syscalls/syscall.tbl index 7893104718c2..c37764dc764d 100644 --- a/arch/sparc/kernel/syscalls/syscall.tbl +++ b/arch/sparc/kernel/syscalls/syscall.tbl @@ -365,12 +365,12 @@ 299 common unshare sys_unshare 300 common set_robust_list sys_set_robust_list compat_sys_set_robust_list 301 common get_robust_list sys_get_robust_list compat_sys_get_robust_list -302 common migrate_pages sys_migrate_pages compat_sys_migrate_pages -303 common mbind sys_mbind compat_sys_mbind -304 common get_mempolicy sys_get_mempolicy compat_sys_get_mempolicy -305 common set_mempolicy sys_set_mempolicy compat_sys_set_mempolicy +302 common migrate_pages sys_migrate_pages +303 common mbind sys_mbind +304 common get_mempolicy sys_get_mempolicy +305 common set_mempolicy sys_set_mempolicy 306 common kexec_load sys_kexec_load compat_sys_kexec_load -307 common move_pages sys_move_pages compat_sys_move_pages +307 common move_pages sys_move_pages 308 common getcpu sys_getcpu 309 common epoll_pwait sys_epoll_pwait compat_sys_epoll_pwait 310 32 utimensat sys_utimensat_time32 diff --git a/arch/x86/entry/syscalls/syscall_32.tbl b/arch/x86/entry/syscalls/syscall_32.tbl index 61f18b72552b..960a021d543e 100644 --- a/arch/x86/entry/syscalls/syscall_32.tbl +++ b/arch/x86/entry/syscalls/syscall_32.tbl @@ -286,7 +286,7 @@ 272 i386 fadvise64_64 sys_ia32_fadvise64_64 273 i386 vserver 274 i386 mbind sys_mbind -275 i386 get_mempolicy sys_get_mempolicy compat_sys_get_mempolicy +275 i386 get_mempolicy sys_get_mempolicy 276 i386 set_mempolicy sys_set_mempolicy 277 i386 mq_open sys_mq_open compat_sys_mq_open 278 i386 mq_unlink sys_mq_unlink @@ -328,7 +328,7 @@ 314 i386 sync_file_range sys_ia32_sync_file_range 315 i386 tee sys_tee 316 i386 vmsplice sys_vmsplice -317 i386 move_pages sys_move_pages compat_sys_move_pages +317 i386 move_pages sys_move_pages 318 i386 getcpu sys_getcpu 319 i386 epoll_pwait sys_epoll_pwait 320 i386 utimensat sys_utimensat_time32 diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl index 807b6a1de8e8..18b5500ea8bf 100644 --- a/arch/x86/entry/syscalls/syscall_64.tbl +++ b/arch/x86/entry/syscalls/syscall_64.tbl @@ -398,7 +398,7 @@ 530 x32 set_robust_list compat_sys_set_robust_list 531 x32 get_robust_list compat_sys_get_robust_list 532 x32 vmsplice sys_vmsplice -533 x32 move_pages compat_sys_move_pages +533 x32 move_pages sys_move_pages 534 x32 preadv compat_sys_preadv64 535 x32 pwritev compat_sys_pwritev64 536 x32 rt_tgsigqueueinfo compat_sys_rt_tgsigqueueinfo diff --git a/include/linux/compat.h b/include/linux/compat.h index 3a2ac5afee30..2d42cebd1fb8 100644 --- a/include/linux/compat.h +++ b/include/linux/compat.h @@ -799,26 +799,6 @@ asmlinkage long compat_sys_execve(const char __user *filename, const compat_uptr /* mm/fadvise.c: No generic prototype for fadvise64_64 */ /* mm/, CONFIG_MMU only */ -asmlinkage long compat_sys_mbind(compat_ulong_t start, compat_ulong_t len, - compat_ulong_t mode, - compat_ulong_t __user *nmask, - compat_ulong_t maxnode, compat_ulong_t flags); -asmlinkage long compat_sys_get_mempolicy(int __user *policy, - compat_ulong_t __user *nmask, - compat_ulong_t maxnode, - compat_ulong_t addr, - compat_ulong_t flags); -asmlinkage long compat_sys_set_mempolicy(int mode, compat_ulong_t __user *nmask, - compat_ulong_t maxnode); -asmlinkage long compat_sys_migrate_pages(compat_pid_t pid, - compat_ulong_t maxnode, const compat_ulong_t __user *old_nodes, - const compat_ulong_t __user *new_nodes); -asmlinkage long compat_sys_move_pages(pid_t pid, compat_ulong_t nr_pages, - __u32 __user *pages, - const int __user *nodes, - int __user *status, - int flags); - asmlinkage long compat_sys_rt_tgsigqueueinfo(compat_pid_t tgid, compat_pid_t pid, int sig, struct compat_siginfo __user *uinfo); diff --git a/include/uapi/asm-generic/unistd.h b/include/uapi/asm-generic/unistd.h index 14c8fe863c6d..1c5fb86d455a 100644 --- a/include/uapi/asm-generic/unistd.h +++ b/include/uapi/asm-generic/unistd.h @@ -673,15 +673,15 @@ __SYSCALL(__NR_madvise, sys_madvise) #define __NR_remap_file_pages 234 __SYSCALL(__NR_remap_file_pages, sys_remap_file_pages) #define __NR_mbind 235 -__SC_COMP(__NR_mbind, sys_mbind, compat_sys_mbind) +__SYSCALL(__NR_mbind, sys_mbind) #define __NR_get_mempolicy 236 -__SC_COMP(__NR_get_mempolicy, sys_get_mempolicy, compat_sys_get_mempolicy) +__SYSCALL(__NR_get_mempolicy, sys_get_mempolicy) #define __NR_set_mempolicy 237 -__SC_COMP(__NR_set_mempolicy, sys_set_mempolicy, compat_sys_set_mempolicy) +__SYSCALL(__NR_set_mempolicy, sys_set_mempolicy) #define __NR_migrate_pages 238 -__SC_COMP(__NR_migrate_pages, sys_migrate_pages, compat_sys_migrate_pages) +__SYSCALL(__NR_migrate_pages, sys_migrate_pages) #define __NR_move_pages 239 -__SC_COMP(__NR_move_pages, sys_move_pages, compat_sys_move_pages) +__SYSCALL(__NR_move_pages, sys_move_pages) #endif #define __NR_rt_tgsigqueueinfo 240 diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c index 64578adfe115..f43d89d92860 100644 --- a/kernel/sys_ni.c +++ b/kernel/sys_ni.c @@ -292,15 +292,10 @@ COND_SYSCALL(process_madvise); COND_SYSCALL(process_mrelease); COND_SYSCALL(remap_file_pages); COND_SYSCALL(mbind); -COND_SYSCALL_COMPAT(mbind); COND_SYSCALL(get_mempolicy); -COND_SYSCALL_COMPAT(get_mempolicy); COND_SYSCALL(set_mempolicy); -COND_SYSCALL_COMPAT(set_mempolicy); COND_SYSCALL(migrate_pages); -COND_SYSCALL_COMPAT(migrate_pages); COND_SYSCALL(move_pages); -COND_SYSCALL_COMPAT(move_pages); COND_SYSCALL(perf_event_open); COND_SYSCALL(accept4); diff --git a/mm/mempolicy.c b/mm/mempolicy.c index eb95578f5997..8d14240896a8 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -1649,43 +1649,6 @@ SYSCALL_DEFINE5(get_mempolicy, int __user *, policy, return kernel_get_mempolicy(policy, nmask, maxnode, addr, flags); } -#ifdef CONFIG_COMPAT - -COMPAT_SYSCALL_DEFINE5(get_mempolicy, int __user *, policy, - compat_ulong_t __user *, nmask, - compat_ulong_t, maxnode, - compat_ulong_t, addr, compat_ulong_t, flags) -{ - return kernel_get_mempolicy(policy, (unsigned long __user *)nmask, - maxnode, addr, flags); -} - -COMPAT_SYSCALL_DEFINE3(set_mempolicy, int, mode, compat_ulong_t __user *, nmask, - compat_ulong_t, maxnode) -{ - return kernel_set_mempolicy(mode, (unsigned long __user *)nmask, maxnode); -} - -COMPAT_SYSCALL_DEFINE6(mbind, compat_ulong_t, start, compat_ulong_t, len, - compat_ulong_t, mode, compat_ulong_t __user *, nmask, - compat_ulong_t, maxnode, compat_ulong_t, flags) -{ - return kernel_mbind(start, len, mode, (unsigned long __user *)nmask, - maxnode, flags); -} - -COMPAT_SYSCALL_DEFINE4(migrate_pages, compat_pid_t, pid, - compat_ulong_t, maxnode, - const compat_ulong_t __user *, old_nodes, - const compat_ulong_t __user *, new_nodes) -{ - return kernel_migrate_pages(pid, maxnode, - (const unsigned long __user *)old_nodes, - (const unsigned long __user *)new_nodes); -} - -#endif /* CONFIG_COMPAT */ - bool vma_migratable(struct vm_area_struct *vma) { if (vma->vm_flags & (VM_IO | VM_PFNMAP)) diff --git a/mm/migrate.c b/mm/migrate.c index 2bc494875cea..a6a7743ee98f 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -2047,19 +2047,6 @@ SYSCALL_DEFINE6(move_pages, pid_t, pid, unsigned long, nr_pages, return kernel_move_pages(pid, nr_pages, pages, nodes, status, flags); } -#ifdef CONFIG_COMPAT -COMPAT_SYSCALL_DEFINE6(move_pages, pid_t, pid, compat_ulong_t, nr_pages, - compat_uptr_t __user *, pages, - const int __user *, nodes, - int __user *, status, - int, flags) -{ - return kernel_move_pages(pid, nr_pages, - (const void __user *__user *)pages, - nodes, status, flags); -} -#endif /* CONFIG_COMPAT */ - #ifdef CONFIG_NUMA_BALANCING /* * Returns true if this is a safe migration target node for misplaced NUMA From a7a08b275a8bbade798c4bdaad07ade68fe7003c Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Wed, 8 Sep 2021 15:18:29 -0700 Subject: [PATCH 10/18] arch: remove compat_alloc_user_space All users of compat_alloc_user_space() and copy_in_user() have been removed from the kernel, only a few functions in sparc remain that can be changed to calling arch_copy_in_user() instead. Link: https://lkml.kernel.org/r/20210727144859.4150043-7-arnd@kernel.org Signed-off-by: Arnd Bergmann Reviewed-by: Christoph Hellwig Cc: Al Viro Cc: Benjamin Herrenschmidt Cc: Borislav Petkov Cc: Catalin Marinas Cc: Christian Borntraeger Cc: Christoph Hellwig Cc: "David S. Miller" Cc: Eric Biederman Cc: Feng Tang Cc: Heiko Carstens Cc: Helge Deller Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: "James E.J. Bottomley" Cc: Michael Ellerman Cc: Paul Mackerras Cc: Thomas Bogendoerfer Cc: Thomas Gleixner Cc: Vasily Gorbik Cc: Will Deacon Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/arm64/include/asm/compat.h | 5 -- arch/arm64/include/asm/uaccess.h | 11 ---- arch/arm64/lib/Makefile | 2 +- arch/arm64/lib/copy_in_user.S | 77 ------------------------- arch/mips/cavium-octeon/octeon-memcpy.S | 2 - arch/mips/include/asm/compat.h | 8 --- arch/mips/include/asm/uaccess.h | 26 --------- arch/mips/lib/memcpy.S | 11 ---- arch/parisc/include/asm/compat.h | 6 -- arch/parisc/include/asm/uaccess.h | 2 - arch/parisc/lib/memcpy.c | 9 --- arch/powerpc/include/asm/compat.h | 16 ----- arch/s390/include/asm/compat.h | 10 ---- arch/s390/include/asm/uaccess.h | 3 - arch/s390/lib/uaccess.c | 63 -------------------- arch/sparc/include/asm/compat.h | 19 ------ arch/sparc/kernel/process_64.c | 2 +- arch/sparc/kernel/signal32.c | 12 ++-- arch/sparc/kernel/signal_64.c | 8 +-- arch/x86/include/asm/compat.h | 13 ----- arch/x86/include/asm/uaccess_64.h | 7 --- include/linux/compat.h | 2 - include/linux/uaccess.h | 10 ---- kernel/compat.c | 21 ------- 24 files changed, 12 insertions(+), 333 deletions(-) delete mode 100644 arch/arm64/lib/copy_in_user.S diff --git a/arch/arm64/include/asm/compat.h b/arch/arm64/include/asm/compat.h index 79c1a750e357..eaa6ca062d89 100644 --- a/arch/arm64/include/asm/compat.h +++ b/arch/arm64/include/asm/compat.h @@ -107,11 +107,6 @@ struct compat_statfs { #define compat_user_stack_pointer() (user_stack_pointer(task_pt_regs(current))) #define COMPAT_MINSIGSTKSZ 2048 -static inline void __user *arch_compat_alloc_user_space(long len) -{ - return (void __user *)compat_user_stack_pointer() - len; -} - struct compat_ipc64_perm { compat_key_t key; __compat_uid32_t uid; diff --git a/arch/arm64/include/asm/uaccess.h b/arch/arm64/include/asm/uaccess.h index b5f08621fa29..190b494e22ab 100644 --- a/arch/arm64/include/asm/uaccess.h +++ b/arch/arm64/include/asm/uaccess.h @@ -430,17 +430,6 @@ extern unsigned long __must_check __arch_copy_to_user(void __user *to, const voi __actu_ret; \ }) -extern unsigned long __must_check __arch_copy_in_user(void __user *to, const void __user *from, unsigned long n); -#define raw_copy_in_user(to, from, n) \ -({ \ - unsigned long __aciu_ret; \ - uaccess_ttbr0_enable(); \ - __aciu_ret = __arch_copy_in_user(__uaccess_mask_ptr(to), \ - __uaccess_mask_ptr(from), (n)); \ - uaccess_ttbr0_disable(); \ - __aciu_ret; \ -}) - #define INLINE_COPY_TO_USER #define INLINE_COPY_FROM_USER diff --git a/arch/arm64/lib/Makefile b/arch/arm64/lib/Makefile index 6dd56a49790a..0941180a86d3 100644 --- a/arch/arm64/lib/Makefile +++ b/arch/arm64/lib/Makefile @@ -1,6 +1,6 @@ # SPDX-License-Identifier: GPL-2.0 lib-y := clear_user.o delay.o copy_from_user.o \ - copy_to_user.o copy_in_user.o copy_page.o \ + copy_to_user.o copy_page.o \ clear_page.o csum.o insn.o memchr.o memcpy.o \ memset.o memcmp.o strcmp.o strncmp.o strlen.o \ strnlen.o strchr.o strrchr.o tishift.o diff --git a/arch/arm64/lib/copy_in_user.S b/arch/arm64/lib/copy_in_user.S deleted file mode 100644 index dbea3799c3ef..000000000000 --- a/arch/arm64/lib/copy_in_user.S +++ /dev/null @@ -1,77 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ -/* - * Copy from user space to user space - * - * Copyright (C) 2012 ARM Ltd. - */ - -#include - -#include -#include -#include - -/* - * Copy from user space to user space (alignment handled by the hardware) - * - * Parameters: - * x0 - to - * x1 - from - * x2 - n - * Returns: - * x0 - bytes not copied - */ - .macro ldrb1 reg, ptr, val - user_ldst 9998f, ldtrb, \reg, \ptr, \val - .endm - - .macro strb1 reg, ptr, val - user_ldst 9998f, sttrb, \reg, \ptr, \val - .endm - - .macro ldrh1 reg, ptr, val - user_ldst 9997f, ldtrh, \reg, \ptr, \val - .endm - - .macro strh1 reg, ptr, val - user_ldst 9997f, sttrh, \reg, \ptr, \val - .endm - - .macro ldr1 reg, ptr, val - user_ldst 9997f, ldtr, \reg, \ptr, \val - .endm - - .macro str1 reg, ptr, val - user_ldst 9997f, sttr, \reg, \ptr, \val - .endm - - .macro ldp1 reg1, reg2, ptr, val - user_ldp 9997f, \reg1, \reg2, \ptr, \val - .endm - - .macro stp1 reg1, reg2, ptr, val - user_stp 9997f, \reg1, \reg2, \ptr, \val - .endm - -end .req x5 -srcin .req x15 -SYM_FUNC_START(__arch_copy_in_user) - add end, x0, x2 - mov srcin, x1 -#include "copy_template.S" - mov x0, #0 - ret -SYM_FUNC_END(__arch_copy_in_user) -EXPORT_SYMBOL(__arch_copy_in_user) - - .section .fixup,"ax" - .align 2 -9997: cmp dst, dstin - b.ne 9998f - // Before being absolutely sure we couldn't copy anything, try harder -USER(9998f, ldtrb tmp1w, [srcin]) -USER(9998f, sttrb tmp1w, [dst]) - add dst, dst, #1 -9998: sub x0, end, dst // bytes not copied - ret - .previous diff --git a/arch/mips/cavium-octeon/octeon-memcpy.S b/arch/mips/cavium-octeon/octeon-memcpy.S index 600d018cf354..0a515cde1c18 100644 --- a/arch/mips/cavium-octeon/octeon-memcpy.S +++ b/arch/mips/cavium-octeon/octeon-memcpy.S @@ -154,8 +154,6 @@ FEXPORT(__raw_copy_from_user) EXPORT_SYMBOL(__raw_copy_from_user) FEXPORT(__raw_copy_to_user) EXPORT_SYMBOL(__raw_copy_to_user) -FEXPORT(__raw_copy_in_user) -EXPORT_SYMBOL(__raw_copy_in_user) /* * Note: dst & src may be unaligned, len may be 0 * Temps diff --git a/arch/mips/include/asm/compat.h b/arch/mips/include/asm/compat.h index 53f015a1b0a7..bbb3bc5a42fd 100644 --- a/arch/mips/include/asm/compat.h +++ b/arch/mips/include/asm/compat.h @@ -96,14 +96,6 @@ struct compat_statfs { #define COMPAT_OFF_T_MAX 0x7fffffff -static inline void __user *arch_compat_alloc_user_space(long len) -{ - struct pt_regs *regs = (struct pt_regs *) - ((unsigned long) current_thread_info() + THREAD_SIZE - 32) - 1; - - return (void __user *) (regs->regs[29] - len); -} - struct compat_ipc64_perm { compat_key_t key; __compat_uid32_t uid; diff --git a/arch/mips/include/asm/uaccess.h b/arch/mips/include/asm/uaccess.h index 783fecce65c8..f8f74f9f5883 100644 --- a/arch/mips/include/asm/uaccess.h +++ b/arch/mips/include/asm/uaccess.h @@ -428,7 +428,6 @@ do { \ extern size_t __raw_copy_from_user(void *__to, const void *__from, size_t __n); extern size_t __raw_copy_to_user(void *__to, const void *__from, size_t __n); -extern size_t __raw_copy_in_user(void *__to, const void *__from, size_t __n); static inline unsigned long raw_copy_from_user(void *to, const void __user *from, unsigned long n) @@ -480,31 +479,6 @@ raw_copy_to_user(void __user *to, const void *from, unsigned long n) #define INLINE_COPY_FROM_USER #define INLINE_COPY_TO_USER -static inline unsigned long -raw_copy_in_user(void __user *to, const void __user *from, unsigned long n) -{ - register void __user *__cu_to_r __asm__("$4"); - register const void __user *__cu_from_r __asm__("$5"); - register long __cu_len_r __asm__("$6"); - - __cu_to_r = to; - __cu_from_r = from; - __cu_len_r = n; - - __asm__ __volatile__( - ".set\tnoreorder\n\t" - __MODULE_JAL(__raw_copy_in_user) - ".set\tnoat\n\t" - __UA_ADDU "\t$1, %1, %2\n\t" - ".set\tat\n\t" - ".set\treorder" - : "+r" (__cu_to_r), "+r" (__cu_from_r), "+r" (__cu_len_r) - : - : "$8", "$9", "$10", "$11", "$12", "$14", "$15", "$24", "$31", - DADDI_SCRATCH, "memory"); - return __cu_len_r; -} - extern __kernel_size_t __bzero(void __user *addr, __kernel_size_t size); /* diff --git a/arch/mips/lib/memcpy.S b/arch/mips/lib/memcpy.S index e19fb98b5d38..277c32296636 100644 --- a/arch/mips/lib/memcpy.S +++ b/arch/mips/lib/memcpy.S @@ -666,8 +666,6 @@ FEXPORT(__raw_copy_from_user) EXPORT_SYMBOL(__raw_copy_from_user) FEXPORT(__raw_copy_to_user) EXPORT_SYMBOL(__raw_copy_to_user) -FEXPORT(__raw_copy_in_user) -EXPORT_SYMBOL(__raw_copy_in_user) #endif /* Legacy Mode, user <-> user */ __BUILD_COPY_USER LEGACY_MODE USEROP USEROP @@ -703,13 +701,4 @@ EXPORT_SYMBOL(__raw_copy_to_user) __BUILD_COPY_USER EVA_MODE KERNELOP USEROP END(__raw_copy_to_user) -/* - * __copy_in_user (EVA) - */ - -LEAF(__raw_copy_in_user) -EXPORT_SYMBOL(__raw_copy_in_user) -__BUILD_COPY_USER EVA_MODE USEROP USEROP -END(__raw_copy_in_user) - #endif diff --git a/arch/parisc/include/asm/compat.h b/arch/parisc/include/asm/compat.h index b5d90e82b65d..c04f5a637c39 100644 --- a/arch/parisc/include/asm/compat.h +++ b/arch/parisc/include/asm/compat.h @@ -163,12 +163,6 @@ struct compat_shmid64_ds { #define COMPAT_ELF_NGREG 80 typedef compat_ulong_t compat_elf_gregset_t[COMPAT_ELF_NGREG]; -static __inline__ void __user *arch_compat_alloc_user_space(long len) -{ - struct pt_regs *regs = ¤t->thread.regs; - return (void __user *)regs->gr[30]; -} - static inline int __is_compat_task(struct task_struct *t) { return test_tsk_thread_flag(t, TIF_32BIT); diff --git a/arch/parisc/include/asm/uaccess.h b/arch/parisc/include/asm/uaccess.h index ed2cd4fb479b..7c13314aae4a 100644 --- a/arch/parisc/include/asm/uaccess.h +++ b/arch/parisc/include/asm/uaccess.h @@ -215,8 +215,6 @@ unsigned long __must_check raw_copy_to_user(void __user *dst, const void *src, unsigned long len); unsigned long __must_check raw_copy_from_user(void *dst, const void __user *src, unsigned long len); -unsigned long __must_check raw_copy_in_user(void __user *dst, const void __user *src, - unsigned long len); #define INLINE_COPY_TO_USER #define INLINE_COPY_FROM_USER diff --git a/arch/parisc/lib/memcpy.c b/arch/parisc/lib/memcpy.c index 4b75388190b4..ea70a0e08321 100644 --- a/arch/parisc/lib/memcpy.c +++ b/arch/parisc/lib/memcpy.c @@ -38,14 +38,6 @@ unsigned long raw_copy_from_user(void *dst, const void __user *src, } EXPORT_SYMBOL(raw_copy_from_user); -unsigned long raw_copy_in_user(void __user *dst, const void __user *src, unsigned long len) -{ - mtsp(get_user_space(), 1); - mtsp(get_user_space(), 2); - return pa_memcpy((void __force *)dst, (void __force *)src, len); -} - - void * memcpy(void * dst,const void *src, size_t count) { mtsp(get_kernel_space(), 1); @@ -54,7 +46,6 @@ void * memcpy(void * dst,const void *src, size_t count) return dst; } -EXPORT_SYMBOL(raw_copy_in_user); EXPORT_SYMBOL(memcpy); bool copy_from_kernel_nofault_allowed(const void *unsafe_src, size_t size) diff --git a/arch/powerpc/include/asm/compat.h b/arch/powerpc/include/asm/compat.h index e33dcf134cdd..7afc96fb6524 100644 --- a/arch/powerpc/include/asm/compat.h +++ b/arch/powerpc/include/asm/compat.h @@ -83,22 +83,6 @@ struct compat_statfs { #define COMPAT_OFF_T_MAX 0x7fffffff -static inline void __user *arch_compat_alloc_user_space(long len) -{ - struct pt_regs *regs = current->thread.regs; - unsigned long usp = regs->gpr[1]; - - /* - * We can't access below the stack pointer in the 32bit ABI and - * can access 288 bytes in the 64bit big-endian ABI, - * or 512 bytes with the new ELFv2 little-endian ABI. - */ - if (!is_32bit_task()) - usp -= USER_REDZONE_SIZE; - - return (void __user *) (usp - len); -} - /* * ipc64_perm is actually 32/64bit clean but since the compat layer refers to * it we may as well define it. diff --git a/arch/s390/include/asm/compat.h b/arch/s390/include/asm/compat.h index 8d49505b4a43..cdc7ae72529d 100644 --- a/arch/s390/include/asm/compat.h +++ b/arch/s390/include/asm/compat.h @@ -176,16 +176,6 @@ static inline int is_compat_task(void) return test_thread_flag(TIF_31BIT); } -static inline void __user *arch_compat_alloc_user_space(long len) -{ - unsigned long stack; - - stack = KSTK_ESP(current); - if (is_compat_task()) - stack &= 0x7fffffffUL; - return (void __user *) (stack - len); -} - #endif struct compat_ipc64_perm { diff --git a/arch/s390/include/asm/uaccess.h b/arch/s390/include/asm/uaccess.h index 9ed9aa37e836..ce550d06abc3 100644 --- a/arch/s390/include/asm/uaccess.h +++ b/arch/s390/include/asm/uaccess.h @@ -227,9 +227,6 @@ static inline int __get_user_fn(void *x, const void __user *ptr, unsigned long s __get_user(x, ptr); \ }) -unsigned long __must_check -raw_copy_in_user(void __user *to, const void __user *from, unsigned long n); - /* * Copy a null terminated string from userspace. */ diff --git a/arch/s390/lib/uaccess.c b/arch/s390/lib/uaccess.c index 94ca99bde59d..a596e69d3c47 100644 --- a/arch/s390/lib/uaccess.c +++ b/arch/s390/lib/uaccess.c @@ -204,69 +204,6 @@ unsigned long raw_copy_to_user(void __user *to, const void *from, unsigned long } EXPORT_SYMBOL(raw_copy_to_user); -static inline unsigned long copy_in_user_mvcos(void __user *to, const void __user *from, - unsigned long size) -{ - unsigned long tmp1, tmp2; - - tmp1 = -4096UL; - /* FIXME: copy with reduced length. */ - asm volatile( - " lgr 0,%[spec]\n" - "0: .insn ss,0xc80000000000,0(%0,%1),0(%2),0\n" - " jz 2f\n" - "1: algr %0,%3\n" - " slgr %1,%3\n" - " slgr %2,%3\n" - " j 0b\n" - "2:slgr %0,%0\n" - "3: \n" - EX_TABLE(0b,3b) - : "+a" (size), "+a" (to), "+a" (from), "+a" (tmp1), "=a" (tmp2) - : [spec] "d" (0x810081UL) - : "cc", "memory", "0"); - return size; -} - -static inline unsigned long copy_in_user_mvc(void __user *to, const void __user *from, - unsigned long size) -{ - unsigned long tmp1; - - asm volatile( - " sacf 256\n" - " aghi %0,-1\n" - " jo 5f\n" - " bras %3,3f\n" - "0: aghi %0,257\n" - "1: mvc 0(1,%1),0(%2)\n" - " la %1,1(%1)\n" - " la %2,1(%2)\n" - " aghi %0,-1\n" - " jnz 1b\n" - " j 5f\n" - "2: mvc 0(256,%1),0(%2)\n" - " la %1,256(%1)\n" - " la %2,256(%2)\n" - "3: aghi %0,-256\n" - " jnm 2b\n" - "4: ex %0,1b-0b(%3)\n" - "5: slgr %0,%0\n" - "6: sacf 768\n" - EX_TABLE(1b,6b) EX_TABLE(2b,0b) EX_TABLE(4b,0b) - : "+a" (size), "+a" (to), "+a" (from), "=a" (tmp1) - : : "cc", "memory"); - return size; -} - -unsigned long raw_copy_in_user(void __user *to, const void __user *from, unsigned long n) -{ - if (copy_with_mvcos()) - return copy_in_user_mvcos(to, from, n); - return copy_in_user_mvc(to, from, n); -} -EXPORT_SYMBOL(raw_copy_in_user); - static inline unsigned long clear_user_mvcos(void __user *to, unsigned long size) { unsigned long tmp1, tmp2; diff --git a/arch/sparc/include/asm/compat.h b/arch/sparc/include/asm/compat.h index 8b63410e830f..bd949fcf9d63 100644 --- a/arch/sparc/include/asm/compat.h +++ b/arch/sparc/include/asm/compat.h @@ -116,25 +116,6 @@ struct compat_statfs { #define COMPAT_OFF_T_MAX 0x7fffffff -#ifdef CONFIG_COMPAT -static inline void __user *arch_compat_alloc_user_space(long len) -{ - struct pt_regs *regs = current_thread_info()->kregs; - unsigned long usp = regs->u_regs[UREG_I6]; - - if (test_thread_64bit_stack(usp)) - usp += STACK_BIAS; - - if (test_thread_flag(TIF_32BIT)) - usp &= 0xffffffffUL; - - usp -= len; - usp &= ~0x7UL; - - return (void __user *) usp; -} -#endif - struct compat_ipc64_perm { compat_key_t key; __compat_uid32_t uid; diff --git a/arch/sparc/kernel/process_64.c b/arch/sparc/kernel/process_64.c index 093849bfda50..d1cc410d2f64 100644 --- a/arch/sparc/kernel/process_64.c +++ b/arch/sparc/kernel/process_64.c @@ -455,7 +455,7 @@ static unsigned long clone_stackframe(unsigned long csp, unsigned long psp) distance = fp - psp; rval = (csp - distance); - if (copy_in_user((void __user *) rval, (void __user *) psp, distance)) + if (raw_copy_in_user((void __user *)rval, (void __user *)psp, distance)) rval = 0; else if (!stack_64bit) { if (put_user(((u32)csp), diff --git a/arch/sparc/kernel/signal32.c b/arch/sparc/kernel/signal32.c index 4276b9e003ca..6cc124a3bb98 100644 --- a/arch/sparc/kernel/signal32.c +++ b/arch/sparc/kernel/signal32.c @@ -435,9 +435,9 @@ static int setup_frame32(struct ksignal *ksig, struct pt_regs *regs, (_COMPAT_NSIG_WORDS - 1) * sizeof(unsigned int)); if (!wsaved) { - err |= copy_in_user((u32 __user *)sf, - (u32 __user *)(regs->u_regs[UREG_FP]), - sizeof(struct reg_window32)); + err |= raw_copy_in_user((u32 __user *)sf, + (u32 __user *)(regs->u_regs[UREG_FP]), + sizeof(struct reg_window32)); } else { struct reg_window *rp; @@ -567,9 +567,9 @@ static int setup_rt_frame32(struct ksignal *ksig, struct pt_regs *regs, err |= put_compat_sigset(&sf->mask, oldset, sizeof(compat_sigset_t)); if (!wsaved) { - err |= copy_in_user((u32 __user *)sf, - (u32 __user *)(regs->u_regs[UREG_FP]), - sizeof(struct reg_window32)); + err |= raw_copy_in_user((u32 __user *)sf, + (u32 __user *)(regs->u_regs[UREG_FP]), + sizeof(struct reg_window32)); } else { struct reg_window *rp; diff --git a/arch/sparc/kernel/signal_64.c b/arch/sparc/kernel/signal_64.c index cea23cf95600..2a78d2af1265 100644 --- a/arch/sparc/kernel/signal_64.c +++ b/arch/sparc/kernel/signal_64.c @@ -406,10 +406,10 @@ setup_rt_frame(struct ksignal *ksig, struct pt_regs *regs) err |= copy_to_user(&sf->mask, sigmask_to_save(), sizeof(sigset_t)); if (!wsaved) { - err |= copy_in_user((u64 __user *)sf, - (u64 __user *)(regs->u_regs[UREG_FP] + - STACK_BIAS), - sizeof(struct reg_window)); + err |= raw_copy_in_user((u64 __user *)sf, + (u64 __user *)(regs->u_regs[UREG_FP] + + STACK_BIAS), + sizeof(struct reg_window)); } else { struct reg_window *rp; diff --git a/arch/x86/include/asm/compat.h b/arch/x86/include/asm/compat.h index 4ae01cdb99de..7516e4199b3c 100644 --- a/arch/x86/include/asm/compat.h +++ b/arch/x86/include/asm/compat.h @@ -156,19 +156,6 @@ struct compat_shmid64_ds { (!!(task_pt_regs(current)->orig_ax & __X32_SYSCALL_BIT)) #endif -static inline void __user *arch_compat_alloc_user_space(long len) -{ - compat_uptr_t sp = task_pt_regs(current)->sp; - - /* - * -128 for the x32 ABI redzone. For IA32, it is not strictly - * necessary, but not harmful. - */ - sp -= 128; - - return (void __user *)round_down(sp - len, 16); -} - static inline bool in_x32_syscall(void) { #ifdef CONFIG_X86_X32_ABI diff --git a/arch/x86/include/asm/uaccess_64.h b/arch/x86/include/asm/uaccess_64.h index e7265a552f4f..45697e04d771 100644 --- a/arch/x86/include/asm/uaccess_64.h +++ b/arch/x86/include/asm/uaccess_64.h @@ -58,13 +58,6 @@ raw_copy_to_user(void __user *dst, const void *src, unsigned long size) return copy_user_generic((__force void *)dst, src, size); } -static __always_inline __must_check -unsigned long raw_copy_in_user(void __user *dst, const void __user *src, unsigned long size) -{ - return copy_user_generic((__force void *)dst, - (__force void *)src, size); -} - extern long __copy_user_nocache(void *dst, const void __user *src, unsigned size, int zerorest); diff --git a/include/linux/compat.h b/include/linux/compat.h index 2d42cebd1fb8..1c758b0e0359 100644 --- a/include/linux/compat.h +++ b/include/linux/compat.h @@ -511,8 +511,6 @@ extern long compat_arch_ptrace(struct task_struct *child, compat_long_t request, struct epoll_event; /* fortunately, this one is fixed-layout */ -extern void __user *compat_alloc_user_space(unsigned long len); - int compat_restore_altstack(const compat_stack_t __user *uss); int __compat_save_altstack(compat_stack_t __user *, unsigned long); #define unsafe_compat_save_altstack(uss, sp, label) do { \ diff --git a/include/linux/uaccess.h b/include/linux/uaccess.h index c05e903cef02..ac0394087f7d 100644 --- a/include/linux/uaccess.h +++ b/include/linux/uaccess.h @@ -200,16 +200,6 @@ copy_to_user(void __user *to, const void *from, unsigned long n) n = _copy_to_user(to, from, n); return n; } -#ifdef CONFIG_COMPAT -static __always_inline unsigned long __must_check -copy_in_user(void __user *to, const void __user *from, unsigned long n) -{ - might_fault(); - if (access_ok(to, n) && access_ok(from, n)) - n = raw_copy_in_user(to, from, n); - return n; -} -#endif #ifndef copy_mc_to_kernel /* diff --git a/kernel/compat.c b/kernel/compat.c index 05adfd6fa8bf..55551989d9da 100644 --- a/kernel/compat.c +++ b/kernel/compat.c @@ -269,24 +269,3 @@ get_compat_sigset(sigset_t *set, const compat_sigset_t __user *compat) return 0; } EXPORT_SYMBOL_GPL(get_compat_sigset); - -/* - * Allocate user-space memory for the duration of a single system call, - * in order to marshall parameters inside a compat thunk. - */ -void __user *compat_alloc_user_space(unsigned long len) -{ - void __user *ptr; - - /* If len would occupy more than half of the entire compat space... */ - if (unlikely(len > (((compat_uptr_t)~0) >> 1))) - return NULL; - - ptr = arch_compat_alloc_user_space(len); - - if (unlikely(!access_ok(ptr, len))) - return NULL; - - return ptr; -} -EXPORT_SYMBOL_GPL(compat_alloc_user_space); From 4b42fb213678d2b6a9eeea92a9be200f23e49583 Mon Sep 17 00:00:00 2001 From: Li Zhijian Date: Wed, 8 Sep 2021 18:10:02 -0700 Subject: [PATCH 11/18] mm/hmm: bypass devmap pte when all pfn requested flags are fulfilled Previously, we noticed the one rpma example was failed[1] since commit 36f30e486dce ("IB/core: Improve ODP to use hmm_range_fault()"), where it will use ODP feature to do RDMA WRITE between fsdax files. After digging into the code, we found hmm_vma_handle_pte() will still return EFAULT even though all the its requesting flags has been fulfilled. That's because a DAX page will be marked as (_PAGE_SPECIAL | PAGE_DEVMAP) by pte_mkdevmap(). Link: https://github.com/pmem/rpma/issues/1142 [1] Link: https://lkml.kernel.org/r/20210830094232.203029-1-lizhijian@cn.fujitsu.com Fixes: 405506274922 ("mm/hmm: add missing call to hmm_pte_need_fault in HMM_PFN_SPECIAL handling") Signed-off-by: Li Zhijian Reviewed-by: Christoph Hellwig Reviewed-by: Jason Gunthorpe Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/hmm.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/mm/hmm.c b/mm/hmm.c index fad6be2bf072..842e26599238 100644 --- a/mm/hmm.c +++ b/mm/hmm.c @@ -295,10 +295,13 @@ static int hmm_vma_handle_pte(struct mm_walk *walk, unsigned long addr, goto fault; /* + * Bypass devmap pte such as DAX page when all pfn requested + * flags(pfn_req_flags) are fulfilled. * Since each architecture defines a struct page for the zero page, just * fall through and treat it like a normal page. */ - if (pte_special(pte) && !is_zero_pfn(pte_pfn(pte))) { + if (pte_special(pte) && !pte_devmap(pte) && + !is_zero_pfn(pte_pfn(pte))) { if (hmm_pte_need_fault(hmm_vma_walk, pfn_req_flags, 0)) { pte_unmap(ptep); return -EFAULT; From 13db8c50477d83ad3e3b9b0ae247e5cd833a7ae4 Mon Sep 17 00:00:00 2001 From: Liu Zixian Date: Wed, 8 Sep 2021 18:10:05 -0700 Subject: [PATCH 12/18] mm/hugetlb: initialize hugetlb_usage in mm_init After fork, the child process will get incorrect (2x) hugetlb_usage. If a process uses 5 2MB hugetlb pages in an anonymous mapping, HugetlbPages: 10240 kB and then forks, the child will show, HugetlbPages: 20480 kB The reason for double the amount is because hugetlb_usage will be copied from the parent and then increased when we copy page tables from parent to child. Child will have 2x actual usage. Fix this by adding hugetlb_count_init in mm_init. Link: https://lkml.kernel.org/r/20210826071742.877-1-liuzixian4@huawei.com Fixes: 5d317b2b6536 ("mm: hugetlb: proc: add HugetlbPages field to /proc/PID/status") Signed-off-by: Liu Zixian Reviewed-by: Naoya Horiguchi Reviewed-by: Mike Kravetz Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/hugetlb.h | 9 +++++++++ kernel/fork.c | 1 + 2 files changed, 10 insertions(+) diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index f7ca1a3870ea..1faebe1cd0ed 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -858,6 +858,11 @@ static inline spinlock_t *huge_pte_lockptr(struct hstate *h, void hugetlb_report_usage(struct seq_file *m, struct mm_struct *mm); +static inline void hugetlb_count_init(struct mm_struct *mm) +{ + atomic_long_set(&mm->hugetlb_usage, 0); +} + static inline void hugetlb_count_add(long l, struct mm_struct *mm) { atomic_long_add(l, &mm->hugetlb_usage); @@ -1042,6 +1047,10 @@ static inline spinlock_t *huge_pte_lockptr(struct hstate *h, return &mm->page_table_lock; } +static inline void hugetlb_count_init(struct mm_struct *mm) +{ +} + static inline void hugetlb_report_usage(struct seq_file *f, struct mm_struct *m) { } diff --git a/kernel/fork.c b/kernel/fork.c index ff5be23800af..38681ad44c76 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1063,6 +1063,7 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p, mm->pmd_huge_pte = NULL; #endif mm_init_uprobes_state(mm); + hugetlb_count_init(mm); if (current->mm) { mm->flags = current->mm->flags & MMF_INIT_MASK; From 32d4f4b782bb8f0ceb78c6b5dc46eb577ae25bf7 Mon Sep 17 00:00:00 2001 From: Rik van Riel Date: Wed, 8 Sep 2021 18:10:08 -0700 Subject: [PATCH 13/18] mm,vmscan: fix divide by zero in get_scan_count Commit f56ce412a59d ("mm: memcontrol: fix occasional OOMs due to proportional memory.low reclaim") introduced a divide by zero corner case when oomd is being used in combination with cgroup memory.low protection. When oomd decides to kill a cgroup, it will force the cgroup memory to be reclaimed after killing the tasks, by writing to the memory.max file for that cgroup, forcing the remaining page cache and reclaimable slab to be reclaimed down to zero. Previously, on cgroups with some memory.low protection that would result in the memory being reclaimed down to the memory.low limit, or likely not at all, having the page cache reclaimed asynchronously later. With f56ce412a59d the oomd write to memory.max tries to reclaim all the way down to zero, which may race with another reclaimer, to the point of ending up with the divide by zero below. This patch implements the obvious fix. Link: https://lkml.kernel.org/r/20210826220149.058089c6@imladris.surriel.com Fixes: f56ce412a59d ("mm: memcontrol: fix occasional OOMs due to proportional memory.low reclaim") Signed-off-by: Rik van Riel Acked-by: Roman Gushchin Acked-by: Michal Hocko Acked-by: Johannes Weiner Acked-by: Chris Down Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmscan.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/vmscan.c b/mm/vmscan.c index 740d03e6dae2..74296c2d1fed 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -2715,7 +2715,7 @@ out: cgroup_size = max(cgroup_size, protection); scan = lruvec_size - lruvec_size * protection / - cgroup_size; + (cgroup_size + 1); /* * Minimally target SWAP_CLUSTER_MAX pages to keep From 053cfda102306a3394012f9fe2594811c34925e4 Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Wed, 8 Sep 2021 18:10:11 -0700 Subject: [PATCH 14/18] mm/page_alloc.c: avoid accessing uninitialized pcp page migratetype If it's not prepared to free unref page, the pcp page migratetype is unset. Thus we will get rubbish from get_pcppage_migratetype() and might list_del(&page->lru) again after it's already deleted from the list leading to grumble about data corruption. Link: https://lkml.kernel.org/r/20210902115447.57050-1-linmiaohe@huawei.com Fixes: df1acc856923 ("mm/page_alloc: avoid conflating IRQs disabled with zone->lock") Signed-off-by: Miaohe Lin Acked-by: Mel Gorman Acked-by: Vlastimil Babka Reviewed-by: David Hildenbrand Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index de309a1dfe65..b37435c274cf 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -3428,8 +3428,10 @@ void free_unref_page_list(struct list_head *list) /* Prepare pages for freeing */ list_for_each_entry_safe(page, next, list, lru) { pfn = page_to_pfn(page); - if (!free_unref_page_prepare(page, pfn, 0)) + if (!free_unref_page_prepare(page, pfn, 0)) { list_del(&page->lru); + continue; + } /* * Free isolated pages directly to the allocator, see From 10994316089c9682f2fbe0be0b1e82bcaf5f4e8c Mon Sep 17 00:00:00 2001 From: Liam Howlett Date: Wed, 8 Sep 2021 18:10:14 -0700 Subject: [PATCH 15/18] mmap_lock: change trace and locking order Print to the trace log before releasing the lock to avoid racing with other trace log printers of the same lock type. Link: https://lkml.kernel.org/r/20210903022041.1843024-1-Liam.Howlett@oracle.com Signed-off-by: Liam R. Howlett Suggested-by: Steven Rostedt (VMware) Reviewed-by: Matthew Wilcox (Oracle) Cc: Michel Lespinasse Cc: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mmap_lock.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/include/linux/mmap_lock.h b/include/linux/mmap_lock.h index 0540f0156f58..b179f1e3541a 100644 --- a/include/linux/mmap_lock.h +++ b/include/linux/mmap_lock.h @@ -101,14 +101,14 @@ static inline bool mmap_write_trylock(struct mm_struct *mm) static inline void mmap_write_unlock(struct mm_struct *mm) { - up_write(&mm->mmap_lock); __mmap_lock_trace_released(mm, true); + up_write(&mm->mmap_lock); } static inline void mmap_write_downgrade(struct mm_struct *mm) { - downgrade_write(&mm->mmap_lock); __mmap_lock_trace_acquire_returned(mm, false, true); + downgrade_write(&mm->mmap_lock); } static inline void mmap_read_lock(struct mm_struct *mm) @@ -140,8 +140,8 @@ static inline bool mmap_read_trylock(struct mm_struct *mm) static inline void mmap_read_unlock(struct mm_struct *mm) { - up_read(&mm->mmap_lock); __mmap_lock_trace_released(mm, false); + up_read(&mm->mmap_lock); } static inline bool mmap_read_trylock_non_owner(struct mm_struct *mm) @@ -155,8 +155,8 @@ static inline bool mmap_read_trylock_non_owner(struct mm_struct *mm) static inline void mmap_read_unlock_non_owner(struct mm_struct *mm) { - up_read_non_owner(&mm->mmap_lock); __mmap_lock_trace_released(mm, false); + up_read_non_owner(&mm->mmap_lock); } static inline void mmap_assert_locked(struct mm_struct *mm) From 79d3705040c3b41075f894fdeeebdcbb46550c63 Mon Sep 17 00:00:00 2001 From: Naohiro Aota Date: Wed, 8 Sep 2021 18:10:17 -0700 Subject: [PATCH 16/18] mm/kmemleak: allow __GFP_NOLOCKDEP passed to kmemleak's gfp In a memory pressure situation, I'm seeing the lockdep WARNING below. Actually, this is similar to a known false positive which is already addressed by commit 6dcde60efd94 ("xfs: more lockdep whackamole with kmem_alloc*"). This warning still persists because it's not from kmalloc() itself but from an allocation for kmemleak object. While kmalloc() itself suppress the warning with __GFP_NOLOCKDEP, gfp_kmemleak_mask() is dropping the flag for the kmemleak's allocation. Allow __GFP_NOLOCKDEP to be passed to kmemleak's allocation, so that the warning for it is also suppressed. ====================================================== WARNING: possible circular locking dependency detected 5.14.0-rc7-BTRFS-ZNS+ #37 Not tainted ------------------------------------------------------ kswapd0/288 is trying to acquire lock: ffff88825ab45df0 (&xfs_nondir_ilock_class){++++}-{3:3}, at: xfs_ilock+0x8a/0x250 but task is already holding lock: ffffffff848cc1e0 (fs_reclaim){+.+.}-{0:0}, at: __fs_reclaim_acquire+0x5/0x30 which lock already depends on the new lock. the existing dependency chain (in reverse order) is: -> #1 (fs_reclaim){+.+.}-{0:0}: fs_reclaim_acquire+0x112/0x160 kmem_cache_alloc+0x48/0x400 create_object.isra.0+0x42/0xb10 kmemleak_alloc+0x48/0x80 __kmalloc+0x228/0x440 kmem_alloc+0xd3/0x2b0 kmem_alloc_large+0x5a/0x1c0 xfs_attr_copy_value+0x112/0x190 xfs_attr_shortform_getvalue+0x1fc/0x300 xfs_attr_get_ilocked+0x125/0x170 xfs_attr_get+0x329/0x450 xfs_get_acl+0x18d/0x430 get_acl.part.0+0xb6/0x1e0 posix_acl_xattr_get+0x13a/0x230 vfs_getxattr+0x21d/0x270 getxattr+0x126/0x310 __x64_sys_fgetxattr+0x1a6/0x2a0 do_syscall_64+0x3b/0x90 entry_SYSCALL_64_after_hwframe+0x44/0xae -> #0 (&xfs_nondir_ilock_class){++++}-{3:3}: __lock_acquire+0x2c0f/0x5a00 lock_acquire+0x1a1/0x4b0 down_read_nested+0x50/0x90 xfs_ilock+0x8a/0x250 xfs_can_free_eofblocks+0x34f/0x570 xfs_inactive+0x411/0x520 xfs_fs_destroy_inode+0x2c8/0x710 destroy_inode+0xc5/0x1a0 evict+0x444/0x620 dispose_list+0xfe/0x1c0 prune_icache_sb+0xdc/0x160 super_cache_scan+0x31e/0x510 do_shrink_slab+0x337/0x8e0 shrink_slab+0x362/0x5c0 shrink_node+0x7a7/0x1a40 balance_pgdat+0x64e/0xfe0 kswapd+0x590/0xa80 kthread+0x38c/0x460 ret_from_fork+0x22/0x30 other info that might help us debug this: Possible unsafe locking scenario: CPU0 CPU1 ---- ---- lock(fs_reclaim); lock(&xfs_nondir_ilock_class); lock(fs_reclaim); lock(&xfs_nondir_ilock_class); *** DEADLOCK *** 3 locks held by kswapd0/288: #0: ffffffff848cc1e0 (fs_reclaim){+.+.}-{0:0}, at: __fs_reclaim_acquire+0x5/0x30 #1: ffffffff848a08d8 (shrinker_rwsem){++++}-{3:3}, at: shrink_slab+0x269/0x5c0 #2: ffff8881a7a820e8 (&type->s_umount_key#60){++++}-{3:3}, at: super_cache_scan+0x5a/0x510 Link: https://lkml.kernel.org/r/20210907055659.3182992-1-naohiro.aota@wdc.com Signed-off-by: Naohiro Aota Acked-by: Catalin Marinas Cc: "Darrick J . Wong" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/kmemleak.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/mm/kmemleak.c b/mm/kmemleak.c index b59f1761d817..b57383c17cf6 100644 --- a/mm/kmemleak.c +++ b/mm/kmemleak.c @@ -113,7 +113,8 @@ #define BYTES_PER_POINTER sizeof(void *) /* GFP bitmask for kmemleak internal allocations */ -#define gfp_kmemleak_mask(gfp) (((gfp) & (GFP_KERNEL | GFP_ATOMIC)) | \ +#define gfp_kmemleak_mask(gfp) (((gfp) & (GFP_KERNEL | GFP_ATOMIC | \ + __GFP_NOLOCKDEP)) | \ __GFP_NORETRY | __GFP_NOMEMALLOC | \ __GFP_NOWARN) From 276aeee1c5fc00df700f0782060beae126600472 Mon Sep 17 00:00:00 2001 From: yanghui Date: Wed, 8 Sep 2021 18:10:20 -0700 Subject: [PATCH 17/18] mm/mempolicy: fix a race between offset_il_node and mpol_rebind_task Servers happened below panic: Kernel version:5.4.56 BUG: unable to handle page fault for address: 0000000000002c48 RIP: 0010:__next_zones_zonelist+0x1d/0x40 Call Trace: __alloc_pages_nodemask+0x277/0x310 alloc_page_interleave+0x13/0x70 handle_mm_fault+0xf99/0x1390 __do_page_fault+0x288/0x500 do_page_fault+0x30/0x110 page_fault+0x3e/0x50 The reason for the panic is that MAX_NUMNODES is passed in the third parameter in __alloc_pages_nodemask(preferred_nid). So access to zonelist->zoneref->zone_idx in __next_zones_zonelist will cause a panic. In offset_il_node(), first_node() returns nid from pol->v.nodes, after this other threads may chang pol->v.nodes before next_node(). This race condition will let next_node return MAX_NUMNODES. So put pol->nodes in a local variable. The race condition is between offset_il_node and cpuset_change_task_nodemask: CPU0: CPU1: alloc_pages_vma() interleave_nid(pol,) offset_il_node(pol,) first_node(pol->v.nodes) cpuset_change_task_nodemask //nodes==0xc mpol_rebind_task mpol_rebind_policy mpol_rebind_nodemask(pol,nodes) //nodes==0x3 next_node(nid, pol->v.nodes)//return MAX_NUMNODES Link: https://lkml.kernel.org/r/20210906034658.48721-1-yanghui.def@bytedance.com Signed-off-by: yanghui Reviewed-by: Muchun Song Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mempolicy.c | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 5e90b3fb7794..b44f54768766 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -1979,17 +1979,26 @@ unsigned int mempolicy_slab_node(void) */ static unsigned offset_il_node(struct mempolicy *pol, unsigned long n) { - unsigned nnodes = nodes_weight(pol->nodes); - unsigned target; + nodemask_t nodemask = pol->nodes; + unsigned int target, nnodes; int i; int nid; + /* + * The barrier will stabilize the nodemask in a register or on + * the stack so that it will stop changing under the code. + * + * Between first_node() and next_node(), pol->nodes could be changed + * by other threads. So we put pol->nodes in a local stack. + */ + barrier(); + nnodes = nodes_weight(nodemask); if (!nnodes) return numa_node_id(); target = (unsigned int)n % nnodes; - nid = first_node(pol->nodes); + nid = first_node(nodemask); for (i = 0; i < target; i++) - nid = next_node(nid, pol->nodes); + nid = next_node(nid, nodemask); return nid; } From ddb13122aa7e988e15283701afb086e0950c405f Mon Sep 17 00:00:00 2001 From: Mike Rapoport Date: Wed, 8 Sep 2021 18:10:23 -0700 Subject: [PATCH 18/18] nds32/setup: remove unused memblock_region variable in setup_memory() kernel test robot reports unused variable warning: arch/nds32/kernel/setup.c:247:26: warning: Unused variable: region [unusedVariable] struct memblock_region *region; ^ Remove the unused variable. Link: https://lkml.kernel.org/r/20210712125218.28951-1-rppt@kernel.org Signed-off-by: Mike Rapoport Reported-by: kernel test robot Reviewed-by: Guenter Roeck Tested-by: Guenter Roeck Cc: Greentime Hu Cc: Nick Hu Cc: Vincent Chen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/nds32/kernel/setup.c | 1 - 1 file changed, 1 deletion(-) diff --git a/arch/nds32/kernel/setup.c b/arch/nds32/kernel/setup.c index 41725eaf8bac..b3d34d646652 100644 --- a/arch/nds32/kernel/setup.c +++ b/arch/nds32/kernel/setup.c @@ -244,7 +244,6 @@ static void __init setup_memory(void) unsigned long ram_start_pfn; unsigned long free_ram_start_pfn; phys_addr_t memory_start, memory_end; - struct memblock_region *region; memory_end = memory_start = 0;