Merge branch 'akpm' (patches from Andrew)
Merge misc fixes from Andrew Morton: "11 fixes". Mostly VM fixes, one psi polling fix, and one parisc build fix.

* emailed patches from Andrew Morton <akpm@linux-foundation.org>:
  mm/kasan: fix false positive invalid-free reports with CONFIG_KASAN_SW_TAGS=y
  mm/zsmalloc.c: fix race condition in zs_destroy_pool
  mm/zsmalloc.c: migration can leave pages in ZS_EMPTY indefinitely
  mm, page_owner: handle THP splits correctly
  userfaultfd_release: always remove uffd flags and clear vm_userfaultfd_ctx
  psi: get poll_work to run when calling poll syscall next time
  mm: memcontrol: flush percpu vmevents before releasing memcg
  mm: memcontrol: flush percpu vmstats before releasing memcg
  parisc: fix compilation errrors
  mm, page_alloc: move_freepages should not examine struct page of reserved memory
  mm/z3fold.c: fix race between migration and destruction
commit f47edb59bb
arch/parisc/include/asm/pgtable.h

@@ -2,6 +2,7 @@
 #ifndef _PARISC_PGTABLE_H
 #define _PARISC_PGTABLE_H
 
+#include <asm/page.h>
 #include <asm-generic/4level-fixup.h>
 
 #include <asm/fixmap.h>
@@ -98,8 +99,6 @@ static inline void purge_tlb_entries(struct mm_struct *mm, unsigned long addr)
 
 #endif /* !__ASSEMBLY__ */
 
-#include <asm/page.h>
-
 #define pte_ERROR(e) \
         printk("%s:%d: bad pte %08lx.\n", __FILE__, __LINE__, pte_val(e))
 #define pmd_ERROR(e) \
fs/userfaultfd.c

@@ -880,6 +880,7 @@ static int userfaultfd_release(struct inode *inode, struct file *file)
         /* len == 0 means wake all */
         struct userfaultfd_wake_range range = { .len = 0, };
         unsigned long new_flags;
+        bool still_valid;
 
         WRITE_ONCE(ctx->released, true);
 
@@ -895,8 +896,7 @@ static int userfaultfd_release(struct inode *inode, struct file *file)
          * taking the mmap_sem for writing.
          */
         down_write(&mm->mmap_sem);
-        if (!mmget_still_valid(mm))
-                goto skip_mm;
+        still_valid = mmget_still_valid(mm);
         prev = NULL;
         for (vma = mm->mmap; vma; vma = vma->vm_next) {
                 cond_resched();
@@ -907,19 +907,20 @@ static int userfaultfd_release(struct inode *inode, struct file *file)
                         continue;
                 }
                 new_flags = vma->vm_flags & ~(VM_UFFD_MISSING | VM_UFFD_WP);
-                prev = vma_merge(mm, prev, vma->vm_start, vma->vm_end,
-                                 new_flags, vma->anon_vma,
-                                 vma->vm_file, vma->vm_pgoff,
-                                 vma_policy(vma),
-                                 NULL_VM_UFFD_CTX);
-                if (prev)
-                        vma = prev;
-                else
-                        prev = vma;
+                if (still_valid) {
+                        prev = vma_merge(mm, prev, vma->vm_start, vma->vm_end,
+                                         new_flags, vma->anon_vma,
+                                         vma->vm_file, vma->vm_pgoff,
+                                         vma_policy(vma),
+                                         NULL_VM_UFFD_CTX);
+                        if (prev)
+                                vma = prev;
+                        else
+                                prev = vma;
+                }
                 vma->vm_flags = new_flags;
                 vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX;
         }
-skip_mm:
         up_write(&mm->mmap_sem);
         mmput(mm);
 wakeup:
kernel/sched/psi.c

@@ -1131,7 +1131,15 @@ static void psi_trigger_destroy(struct kref *ref)
          * deadlock while waiting for psi_poll_work to acquire trigger_lock
          */
         if (kworker_to_destroy) {
+                /*
+                 * After the RCU grace period has expired, the worker
+                 * can no longer be found through group->poll_kworker.
+                 * But it might have been already scheduled before
+                 * that - deschedule it cleanly before destroying it.
+                 */
                 kthread_cancel_delayed_work_sync(&group->poll_work);
+                atomic_set(&group->poll_scheduled, 0);
+
                 kthread_destroy_worker(kworker_to_destroy);
         }
         kfree(t);
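The hunk above works because scheduling of psi poll work is gated by a poll_scheduled flag: if the flag were left set after the pending work is cancelled, later poll() calls could never get the worker scheduled again. What follows is a minimal userspace sketch of that guard pattern; the names (work_scheduled, schedule_poll_work, cancel_poll_work) are invented for illustration and the actual queueing/cancelling machinery is elided.

/*
 * Standalone illustration (not kernel code) of why the "scheduled" guard
 * must be reset after cancelling pending work: if it stays set, every
 * later attempt to schedule the work is silently skipped.
 */
#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

static atomic_int work_scheduled;       /* analogue of group->poll_scheduled */

/* Try to queue the (hypothetical) poll work exactly once. */
static bool schedule_poll_work(void)
{
        int expected = 0;

        /* Only the first caller actually schedules; others back off. */
        if (!atomic_compare_exchange_strong(&work_scheduled, &expected, 1))
                return false;
        printf("poll work queued\n");
        return true;
}

/* Cancel any pending work and allow future scheduling again. */
static void cancel_poll_work(void)
{
        /* ... cancel/flush the pending work here ... */
        atomic_store(&work_scheduled, 0);   /* the step the fix adds */
}

int main(void)
{
        schedule_poll_work();               /* queued */
        cancel_poll_work();                 /* cancelled, guard reset */
        if (!schedule_poll_work())          /* without the reset this would fail */
                printf("BUG: poll work can never run again\n");
        return 0;
}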
mm/huge_memory.c

@@ -32,6 +32,7 @@
 #include <linux/shmem_fs.h>
 #include <linux/oom.h>
 #include <linux/numa.h>
+#include <linux/page_owner.h>
 
 #include <asm/tlb.h>
 #include <asm/pgalloc.h>
@@ -2516,6 +2517,9 @@ static void __split_huge_page(struct page *page, struct list_head *list,
         }
 
         ClearPageCompound(head);
+
+        split_page_owner(head, HPAGE_PMD_ORDER);
+
         /* See comment in __split_huge_page_tail() */
         if (PageAnon(head)) {
                 /* Additional pin to swap cache */
mm/kasan/common.c

@@ -407,8 +407,14 @@ static inline bool shadow_invalid(u8 tag, s8 shadow_byte)
         if (IS_ENABLED(CONFIG_KASAN_GENERIC))
                 return shadow_byte < 0 ||
                         shadow_byte >= KASAN_SHADOW_SCALE_SIZE;
-        else
-                return tag != (u8)shadow_byte;
+
+        /* else CONFIG_KASAN_SW_TAGS: */
+        if ((u8)shadow_byte == KASAN_TAG_INVALID)
+                return true;
+        if ((tag != KASAN_TAG_KERNEL) && (tag != (u8)shadow_byte))
+                return true;
+
+        return false;
 }
 
 static bool __kasan_slab_free(struct kmem_cache *cache, void *object,
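As a reference for the new software-tags branch, here is a small userspace model of the check. The constants mirror the values mainline KASAN uses for KASAN_TAG_KERNEL (0xFF, the native tag that must always be accepted) and KASAN_TAG_INVALID (0xFE, the shadow value for freed memory), but treat the exact numbers as assumptions; this is illustration only, not the kernel implementation.

#include <assert.h>
#include <stdbool.h>
#include <stdint.h>

#define TAG_KERNEL  0xFFu   /* native pointer tag, must always be accepted */
#define TAG_INVALID 0xFEu   /* shadow value for freed/invalid memory */

/* Userspace model of the SW_TAGS branch of shadow_invalid() above. */
static bool shadow_invalid(uint8_t tag, uint8_t shadow_byte)
{
        if (shadow_byte == TAG_INVALID)
                return true;                        /* freeing already-freed memory */
        if (tag != TAG_KERNEL && tag != shadow_byte)
                return true;                        /* mismatched pointer tag */
        return false;
}

int main(void)
{
        assert(!shadow_invalid(0xAB, 0xAB));        /* matching tag: ok */
        assert(!shadow_invalid(TAG_KERNEL, 0xAB));  /* untagged kernel pointer: ok */
        assert(shadow_invalid(0xAB, TAG_INVALID));  /* invalid-free detected */
        assert(shadow_invalid(0xAB, 0xCD));         /* wrong tag detected */
        return 0;
}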
mm/memcontrol.c

@@ -3260,6 +3260,60 @@ static u64 mem_cgroup_read_u64(struct cgroup_subsys_state *css,
         }
 }
 
+static void memcg_flush_percpu_vmstats(struct mem_cgroup *memcg)
+{
+        unsigned long stat[MEMCG_NR_STAT];
+        struct mem_cgroup *mi;
+        int node, cpu, i;
+
+        for (i = 0; i < MEMCG_NR_STAT; i++)
+                stat[i] = 0;
+
+        for_each_online_cpu(cpu)
+                for (i = 0; i < MEMCG_NR_STAT; i++)
+                        stat[i] += raw_cpu_read(memcg->vmstats_percpu->stat[i]);
+
+        for (mi = memcg; mi; mi = parent_mem_cgroup(mi))
+                for (i = 0; i < MEMCG_NR_STAT; i++)
+                        atomic_long_add(stat[i], &mi->vmstats[i]);
+
+        for_each_node(node) {
+                struct mem_cgroup_per_node *pn = memcg->nodeinfo[node];
+                struct mem_cgroup_per_node *pi;
+
+                for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++)
+                        stat[i] = 0;
+
+                for_each_online_cpu(cpu)
+                        for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++)
+                                stat[i] += raw_cpu_read(
+                                                pn->lruvec_stat_cpu->count[i]);
+
+                for (pi = pn; pi; pi = parent_nodeinfo(pi, node))
+                        for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++)
+                                atomic_long_add(stat[i], &pi->lruvec_stat[i]);
+        }
+}
+
+static void memcg_flush_percpu_vmevents(struct mem_cgroup *memcg)
+{
+        unsigned long events[NR_VM_EVENT_ITEMS];
+        struct mem_cgroup *mi;
+        int cpu, i;
+
+        for (i = 0; i < NR_VM_EVENT_ITEMS; i++)
+                events[i] = 0;
+
+        for_each_online_cpu(cpu)
+                for (i = 0; i < NR_VM_EVENT_ITEMS; i++)
+                        events[i] += raw_cpu_read(
+                                        memcg->vmstats_percpu->events[i]);
+
+        for (mi = memcg; mi; mi = parent_mem_cgroup(mi))
+                for (i = 0; i < NR_VM_EVENT_ITEMS; i++)
+                        atomic_long_add(events[i], &mi->vmevents[i]);
+}
+
 #ifdef CONFIG_MEMCG_KMEM
 static int memcg_online_kmem(struct mem_cgroup *memcg)
 {
@@ -4682,6 +4736,12 @@ static void __mem_cgroup_free(struct mem_cgroup *memcg)
 {
         int node;
 
+        /*
+         * Flush percpu vmstats and vmevents to guarantee the value correctness
+         * on parent's and all ancestor levels.
+         */
+        memcg_flush_percpu_vmstats(memcg);
+        memcg_flush_percpu_vmevents(memcg);
         for_each_node(node)
                 free_mem_cgroup_per_node_info(memcg, node);
         free_percpu(memcg->vmstats_percpu);
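The two helpers above follow the same shape: zero an accumulator, fold in every CPU's counters, then charge the total at each level of the parent chain. A self-contained userspace sketch of that pattern, with invented structures and names, is below.

#include <stdio.h>

#define NR_CPUS  4
#define NR_STATS 2

struct group {
        struct group *parent;
        long stat[NR_STATS];                    /* aggregated counters */
        long percpu_stat[NR_CPUS][NR_STATS];    /* per-CPU deltas not yet folded in */
};

static void flush_percpu_stats(struct group *g)
{
        long sum[NR_STATS] = { 0 };
        struct group *mi;
        int cpu, i;

        /* Fold the per-CPU deltas into one total per counter. */
        for (cpu = 0; cpu < NR_CPUS; cpu++)
                for (i = 0; i < NR_STATS; i++)
                        sum[i] += g->percpu_stat[cpu][i];

        /* Charge the total to this group and every ancestor. */
        for (mi = g; mi; mi = mi->parent)
                for (i = 0; i < NR_STATS; i++)
                        mi->stat[i] += sum[i];
}

int main(void)
{
        struct group root = { 0 }, child = { .parent = &root };

        child.percpu_stat[0][0] = 3;
        child.percpu_stat[2][0] = 4;
        flush_percpu_stats(&child);
        printf("child=%ld root=%ld\n", child.stat[0], root.stat[0]);  /* 7 7 */
        return 0;
}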
mm/page_alloc.c

@@ -2238,27 +2238,12 @@ static int move_freepages(struct zone *zone,
         unsigned int order;
         int pages_moved = 0;
 
-#ifndef CONFIG_HOLES_IN_ZONE
-        /*
-         * page_zone is not safe to call in this context when
-         * CONFIG_HOLES_IN_ZONE is set. This bug check is probably redundant
-         * anyway as we check zone boundaries in move_freepages_block().
-         * Remove at a later date when no bug reports exist related to
-         * grouping pages by mobility
-         */
-        VM_BUG_ON(pfn_valid(page_to_pfn(start_page)) &&
-                  pfn_valid(page_to_pfn(end_page)) &&
-                  page_zone(start_page) != page_zone(end_page));
-#endif
         for (page = start_page; page <= end_page;) {
                 if (!pfn_valid_within(page_to_pfn(page))) {
                         page++;
                         continue;
                 }
 
-                /* Make sure we are not inadvertently changing nodes */
-                VM_BUG_ON_PAGE(page_to_nid(page) != zone_to_nid(zone), page);
-
                 if (!PageBuddy(page)) {
                         /*
                          * We assume that pages that could be isolated for
@@ -2273,6 +2258,10 @@ static int move_freepages(struct zone *zone,
                         continue;
                 }
 
+                /* Make sure we are not inadvertently changing nodes */
+                VM_BUG_ON_PAGE(page_to_nid(page) != zone_to_nid(zone), page);
+                VM_BUG_ON_PAGE(page_zone(page) != zone, page);
+
                 order = page_order(page);
                 move_to_free_area(page, &zone->free_area[order], migratetype);
                 page += 1 << order;
mm/z3fold.c

@@ -41,6 +41,7 @@
 #include <linux/workqueue.h>
 #include <linux/slab.h>
 #include <linux/spinlock.h>
+#include <linux/wait.h>
 #include <linux/zpool.h>
 #include <linux/magic.h>
 
@@ -145,6 +146,8 @@ struct z3fold_header {
  * @release_wq: workqueue for safe page release
  * @work: work_struct for safe page release
  * @inode: inode for z3fold pseudo filesystem
+ * @destroying: bool to stop migration once we start destruction
+ * @isolated: int to count the number of pages currently in isolation
  *
  * This structure is allocated at pool creation time and maintains metadata
  * pertaining to a particular z3fold pool.
@@ -163,8 +166,11 @@ struct z3fold_pool {
         const struct zpool_ops *zpool_ops;
         struct workqueue_struct *compact_wq;
         struct workqueue_struct *release_wq;
+        struct wait_queue_head isolate_wait;
         struct work_struct work;
         struct inode *inode;
+        bool destroying;
+        int isolated;
 };
 
 /*
@@ -769,6 +775,7 @@ static struct z3fold_pool *z3fold_create_pool(const char *name, gfp_t gfp,
                 goto out_c;
         spin_lock_init(&pool->lock);
         spin_lock_init(&pool->stale_lock);
+        init_waitqueue_head(&pool->isolate_wait);
         pool->unbuddied = __alloc_percpu(sizeof(struct list_head)*NCHUNKS, 2);
         if (!pool->unbuddied)
                 goto out_pool;
@@ -808,6 +815,15 @@ out:
         return NULL;
 }
 
+static bool pool_isolated_are_drained(struct z3fold_pool *pool)
+{
+        bool ret;
+
+        spin_lock(&pool->lock);
+        ret = pool->isolated == 0;
+        spin_unlock(&pool->lock);
+        return ret;
+}
 /**
  * z3fold_destroy_pool() - destroys an existing z3fold pool
  * @pool: the z3fold pool to be destroyed
@@ -817,6 +833,22 @@ out:
 static void z3fold_destroy_pool(struct z3fold_pool *pool)
 {
         kmem_cache_destroy(pool->c_handle);
+        /*
+         * We set pool->destroying under lock to ensure that
+         * z3fold_page_isolate() sees any changes to destroying. This way we
+         * avoid the need for any memory barriers.
+         */
+
+        spin_lock(&pool->lock);
+        pool->destroying = true;
+        spin_unlock(&pool->lock);
+
+        /*
+         * We need to ensure that no pages are being migrated while we destroy
+         * these workqueues, as migration can queue work on either of the
+         * workqueues.
+         */
+        wait_event(pool->isolate_wait, !pool_isolated_are_drained(pool));
 
         /*
          * We need to destroy pool->compact_wq before pool->release_wq,
@@ -1307,6 +1339,28 @@ static u64 z3fold_get_pool_size(struct z3fold_pool *pool)
         return atomic64_read(&pool->pages_nr);
 }
 
+/*
+ * z3fold_dec_isolated() expects to be called while pool->lock is held.
+ */
+static void z3fold_dec_isolated(struct z3fold_pool *pool)
+{
+        assert_spin_locked(&pool->lock);
+        VM_BUG_ON(pool->isolated <= 0);
+        pool->isolated--;
+
+        /*
+         * If we have no more isolated pages, we have to see if
+         * z3fold_destroy_pool() is waiting for a signal.
+         */
+        if (pool->isolated == 0 && waitqueue_active(&pool->isolate_wait))
+                wake_up_all(&pool->isolate_wait);
+}
+
+static void z3fold_inc_isolated(struct z3fold_pool *pool)
+{
+        pool->isolated++;
+}
+
 static bool z3fold_page_isolate(struct page *page, isolate_mode_t mode)
 {
         struct z3fold_header *zhdr;
@@ -1333,6 +1387,33 @@ static bool z3fold_page_isolate(struct page *page, isolate_mode_t mode)
                 spin_lock(&pool->lock);
                 if (!list_empty(&page->lru))
                         list_del(&page->lru);
+                /*
+                 * We need to check for destruction while holding pool->lock, as
+                 * otherwise destruction could see 0 isolated pages, and
+                 * proceed.
+                 */
+                if (unlikely(pool->destroying)) {
+                        spin_unlock(&pool->lock);
+                        /*
+                         * If this page isn't stale, somebody else holds a
+                         * reference to it. Let's drop our refcount so that they
+                         * can call the release logic.
+                         */
+                        if (unlikely(kref_put(&zhdr->refcount,
+                                              release_z3fold_page_locked))) {
+                                /*
+                                 * If we get here we have kref problems, so we
+                                 * should freak out.
+                                 */
+                                WARN(1, "Z3fold is experiencing kref problems\n");
+                                return false;
+                        }
+                        z3fold_page_unlock(zhdr);
+                        return false;
+                }
+
+
+                z3fold_inc_isolated(pool);
                 spin_unlock(&pool->lock);
                 z3fold_page_unlock(zhdr);
                 return true;
@@ -1401,6 +1482,10 @@ static int z3fold_page_migrate(struct address_space *mapping, struct page *newpage,
 
         queue_work_on(new_zhdr->cpu, pool->compact_wq, &new_zhdr->work);
 
+        spin_lock(&pool->lock);
+        z3fold_dec_isolated(pool);
+        spin_unlock(&pool->lock);
+
         page_mapcount_reset(page);
         put_page(page);
         return 0;
@@ -1420,10 +1505,14 @@ static void z3fold_page_putback(struct page *page)
         INIT_LIST_HEAD(&page->lru);
         if (kref_put(&zhdr->refcount, release_z3fold_page_locked)) {
                 atomic64_dec(&pool->pages_nr);
+                spin_lock(&pool->lock);
+                z3fold_dec_isolated(pool);
+                spin_unlock(&pool->lock);
                 return;
         }
         spin_lock(&pool->lock);
         list_add(&page->lru, &pool->lru);
+        z3fold_dec_isolated(pool);
         spin_unlock(&pool->lock);
         z3fold_page_unlock(zhdr);
 }
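The z3fold hunks combine three pieces: an isolated counter updated under pool->lock, a destroying flag that makes new isolations fail, and a wait queue that lets z3fold_destroy_pool() sleep until the counter drains. Below is a userspace pthread analogue of that shutdown handshake; the names are invented and it is a sketch of the pattern, not the kernel code.

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

struct pool {
        pthread_mutex_t lock;
        pthread_cond_t  drained;
        int  isolated;
        bool destroying;
};

static bool pool_isolate(struct pool *p)
{
        bool ok;

        pthread_mutex_lock(&p->lock);
        ok = !p->destroying;            /* refuse new isolations during teardown */
        if (ok)
                p->isolated++;
        pthread_mutex_unlock(&p->lock);
        return ok;
}

static void pool_put_isolated(struct pool *p)
{
        pthread_mutex_lock(&p->lock);
        if (--p->isolated == 0)
                pthread_cond_broadcast(&p->drained);  /* wake a waiting destroyer */
        pthread_mutex_unlock(&p->lock);
}

static void pool_destroy(struct pool *p)
{
        pthread_mutex_lock(&p->lock);
        p->destroying = true;           /* set under the same lock isolation takes */
        while (p->isolated > 0)
                pthread_cond_wait(&p->drained, &p->lock);
        pthread_mutex_unlock(&p->lock);
        /* now safe to tear down work queues / free structures */
}

int main(void)
{
        struct pool p = { PTHREAD_MUTEX_INITIALIZER, PTHREAD_COND_INITIALIZER };

        if (pool_isolate(&p))
                pool_put_isolated(&p);
        pool_destroy(&p);
        printf("pool destroyed with no isolations in flight\n");
        return 0;
}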
mm/zsmalloc.c

@@ -54,6 +54,7 @@
 #include <linux/mount.h>
 #include <linux/pseudo_fs.h>
 #include <linux/migrate.h>
+#include <linux/wait.h>
 #include <linux/pagemap.h>
 #include <linux/fs.h>
 
@@ -268,6 +269,10 @@ struct zs_pool {
 #ifdef CONFIG_COMPACTION
         struct inode *inode;
         struct work_struct free_work;
+        /* A wait queue for when migration races with async_free_zspage() */
+        struct wait_queue_head migration_wait;
+        atomic_long_t isolated_pages;
+        bool destroying;
 #endif
 };
 
@@ -1862,6 +1867,31 @@ static void dec_zspage_isolation(struct zspage *zspage)
         zspage->isolated--;
 }
 
+static void putback_zspage_deferred(struct zs_pool *pool,
+                                    struct size_class *class,
+                                    struct zspage *zspage)
+{
+        enum fullness_group fg;
+
+        fg = putback_zspage(class, zspage);
+        if (fg == ZS_EMPTY)
+                schedule_work(&pool->free_work);
+
+}
+
+static inline void zs_pool_dec_isolated(struct zs_pool *pool)
+{
+        VM_BUG_ON(atomic_long_read(&pool->isolated_pages) <= 0);
+        atomic_long_dec(&pool->isolated_pages);
+        /*
+         * There's no possibility of racing, since wait_for_isolated_drain()
+         * checks the isolated count under &class->lock after enqueuing
+         * on migration_wait.
+         */
+        if (atomic_long_read(&pool->isolated_pages) == 0 && pool->destroying)
+                wake_up_all(&pool->migration_wait);
+}
+
 static void replace_sub_page(struct size_class *class, struct zspage *zspage,
                                 struct page *newpage, struct page *oldpage)
 {
@@ -1931,6 +1961,7 @@ static bool zs_page_isolate(struct page *page, isolate_mode_t mode)
          */
         if (!list_empty(&zspage->list) && !is_zspage_isolated(zspage)) {
                 get_zspage_mapping(zspage, &class_idx, &fullness);
+                atomic_long_inc(&pool->isolated_pages);
                 remove_zspage(class, zspage, fullness);
         }
 
@@ -2030,8 +2061,16 @@ static int zs_page_migrate(struct address_space *mapping, struct page *newpage,
          * Page migration is done so let's putback isolated zspage to
          * the list if @page is final isolated subpage in the zspage.
          */
-        if (!is_zspage_isolated(zspage))
-                putback_zspage(class, zspage);
+        if (!is_zspage_isolated(zspage)) {
+                /*
+                 * We cannot race with zs_destroy_pool() here because we wait
+                 * for isolation to hit zero before we start destroying.
+                 * Also, we ensure that everyone can see pool->destroying before
+                 * we start waiting.
+                 */
+                putback_zspage_deferred(pool, class, zspage);
+                zs_pool_dec_isolated(pool);
+        }
 
         reset_page(page);
         put_page(page);
@@ -2077,13 +2116,12 @@ static void zs_page_putback(struct page *page)
         spin_lock(&class->lock);
         dec_zspage_isolation(zspage);
         if (!is_zspage_isolated(zspage)) {
-                fg = putback_zspage(class, zspage);
                 /*
                  * Due to page_lock, we cannot free zspage immediately
                  * so let's defer.
                  */
-                if (fg == ZS_EMPTY)
-                        schedule_work(&pool->free_work);
+                putback_zspage_deferred(pool, class, zspage);
+                zs_pool_dec_isolated(pool);
         }
         spin_unlock(&class->lock);
 }
@@ -2107,8 +2145,36 @@ static int zs_register_migration(struct zs_pool *pool)
         return 0;
 }
 
+static bool pool_isolated_are_drained(struct zs_pool *pool)
+{
+        return atomic_long_read(&pool->isolated_pages) == 0;
+}
+
+/* Function for resolving migration */
+static void wait_for_isolated_drain(struct zs_pool *pool)
+{
+
+        /*
+         * We're in the process of destroying the pool, so there are no
+         * active allocations. zs_page_isolate() fails for completely free
+         * zspages, so we need only wait for the zs_pool's isolated
+         * count to hit zero.
+         */
+        wait_event(pool->migration_wait,
+                   pool_isolated_are_drained(pool));
+}
+
 static void zs_unregister_migration(struct zs_pool *pool)
 {
+        pool->destroying = true;
+        /*
+         * We need a memory barrier here to ensure global visibility of
+         * pool->destroying. Thus pool->isolated pages will either be 0 in which
+         * case we don't care, or it will be > 0 and pool->destroying will
+         * ensure that we wake up once isolation hits 0.
+         */
+        smp_mb();
+        wait_for_isolated_drain(pool); /* This can block */
         flush_work(&pool->free_work);
         iput(pool->inode);
 }
@@ -2346,6 +2412,8 @@ struct zs_pool *zs_create_pool(const char *name)
         if (!pool->name)
                 goto err;
 
+        init_waitqueue_head(&pool->migration_wait);
+
         if (create_cache(pool))
                 goto err;
 
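zsmalloc solves the same shutdown race as z3fold, but with an atomic isolated_pages counter, a destroying flag and smp_mb() instead of a lock. The C11 sketch below only traces the two decision points of that handshake; it prints where the real code would sleep or wake, the names are invented, and seq_cst atomics stand in for the kernel's explicit barrier.

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

static atomic_long isolated_pages;      /* analogue of pool->isolated_pages */
static atomic_bool destroying;          /* analogue of pool->destroying */

/* Migration path: an isolated page is released. */
static void dec_isolated(void)
{
        atomic_fetch_sub(&isolated_pages, 1);
        /* seq_cst ordering: this load cannot move before the decrement */
        if (atomic_load(&isolated_pages) == 0 && atomic_load(&destroying))
                printf("wake up the destroyer\n");
}

/* Teardown path. */
static void unregister_migration(void)
{
        atomic_store(&destroying, true);
        /* the seq_cst store/load pair plays the role of smp_mb() in the patch */
        if (atomic_load(&isolated_pages) == 0)
                printf("nothing isolated, proceed with teardown\n");
        else
                printf("would sleep until the last dec_isolated() wakes us\n");
}

int main(void)
{
        atomic_fetch_add(&isolated_pages, 1);   /* one page currently isolated */
        unregister_migration();                 /* sees the count, decides to wait */
        dec_isolated();                         /* sees destroying, issues the wakeup */
        return 0;
}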