From c97102ba96324da330078ad8619ba4dfe840dbe3 Mon Sep 17 00:00:00 2001 From: Vivek Goyal Date: Wed, 18 Dec 2013 17:08:31 -0800 Subject: [PATCH 01/24] kexec: migrate to reboot cpu Commit 1b3a5d02ee07 ("reboot: move arch/x86 reboot= handling to generic kernel") moved reboot= handling to generic code. In the process it also removed the code in native_machine_shutdown() which are moving reboot process to reboot_cpu/cpu0. I guess that thought must have been that all reboot paths are calling migrate_to_reboot_cpu(), so we don't need this special handling. But kexec reboot path (kernel_kexec()) is not calling migrate_to_reboot_cpu() so above change broke kexec. Now reboot can happen on non-boot cpu and when INIT is sent in second kerneo to bring up BP, it brings down the machine. So start calling migrate_to_reboot_cpu() in kexec reboot path to avoid this problem. Bisected by WANG Chao. Reported-by: Matthew Whitehead Reported-by: Dave Young Signed-off-by: Vivek Goyal Tested-by: Baoquan He Tested-by: WANG Chao Acked-by: H. Peter Anvin Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/reboot.h | 1 + kernel/kexec.c | 1 + kernel/reboot.c | 2 +- 3 files changed, 3 insertions(+), 1 deletion(-) diff --git a/include/linux/reboot.h b/include/linux/reboot.h index 8e00f9f6f963..9e7db9e73cc1 100644 --- a/include/linux/reboot.h +++ b/include/linux/reboot.h @@ -43,6 +43,7 @@ extern int unregister_reboot_notifier(struct notifier_block *); * Architecture-specific implementations of sys_reboot commands. */ +extern void migrate_to_reboot_cpu(void); extern void machine_restart(char *cmd); extern void machine_halt(void); extern void machine_power_off(void); diff --git a/kernel/kexec.c b/kernel/kexec.c index d0d8fca54065..9c970167e402 100644 --- a/kernel/kexec.c +++ b/kernel/kexec.c @@ -1680,6 +1680,7 @@ int kernel_kexec(void) { kexec_in_progress = true; kernel_restart_prepare(NULL); + migrate_to_reboot_cpu(); printk(KERN_EMERG "Starting new kernel\n"); machine_shutdown(); } diff --git a/kernel/reboot.c b/kernel/reboot.c index f813b3474646..662c83fc16b7 100644 --- a/kernel/reboot.c +++ b/kernel/reboot.c @@ -104,7 +104,7 @@ int unregister_reboot_notifier(struct notifier_block *nb) } EXPORT_SYMBOL(unregister_reboot_notifier); -static void migrate_to_reboot_cpu(void) +void migrate_to_reboot_cpu(void) { /* The boot cpu is always logical cpu 0 */ int cpu = reboot_cpu; From 2b4847e73004c10ae6666c2e27b5c5430aed8698 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Wed, 18 Dec 2013 17:08:32 -0800 Subject: [PATCH 02/24] mm: numa: serialise parallel get_user_page against THP migration Base pages are unmapped and flushed from cache and TLB during normal page migration and replaced with a migration entry that causes any parallel NUMA hinting fault or gup to block until migration completes. THP does not unmap pages due to a lack of support for migration entries at a PMD level. This allows races with get_user_pages and get_user_pages_fast which commit 3f926ab945b6 ("mm: Close races between THP migration and PMD numa clearing") made worse by introducing a pmd_clear_flush(). This patch forces get_user_page (fast and normal) on a pmd_numa page to go through the slow get_user_page path where it will serialise against THP migration and properly account for the NUMA hinting fault. On the migration side the page table lock is taken for each PTE update. Signed-off-by: Mel Gorman Reviewed-by: Rik van Riel Cc: Alex Thorlton Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/mm/gup.c | 13 +++++++++++++ mm/huge_memory.c | 24 ++++++++++++++++-------- mm/migrate.c | 38 +++++++++++++++++++++++++++++++------- 3 files changed, 60 insertions(+), 15 deletions(-) diff --git a/arch/x86/mm/gup.c b/arch/x86/mm/gup.c index dd74e46828c0..0596e8e0cc19 100644 --- a/arch/x86/mm/gup.c +++ b/arch/x86/mm/gup.c @@ -83,6 +83,12 @@ static noinline int gup_pte_range(pmd_t pmd, unsigned long addr, pte_t pte = gup_get_pte(ptep); struct page *page; + /* Similar to the PMD case, NUMA hinting must take slow path */ + if (pte_numa(pte)) { + pte_unmap(ptep); + return 0; + } + if ((pte_flags(pte) & (mask | _PAGE_SPECIAL)) != mask) { pte_unmap(ptep); return 0; @@ -167,6 +173,13 @@ static int gup_pmd_range(pud_t pud, unsigned long addr, unsigned long end, if (pmd_none(pmd) || pmd_trans_splitting(pmd)) return 0; if (unlikely(pmd_large(pmd))) { + /* + * NUMA hinting faults need to be handled in the GUP + * slowpath for accounting purposes and so that they + * can be serialised against THP migration. + */ + if (pmd_numa(pmd)) + return 0; if (!gup_huge_pmd(pmd, addr, next, write, pages, nr)) return 0; } else { diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 33a5dc492810..51f069303ab9 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1243,6 +1243,10 @@ struct page *follow_trans_huge_pmd(struct vm_area_struct *vma, if ((flags & FOLL_DUMP) && is_huge_zero_pmd(*pmd)) return ERR_PTR(-EFAULT); + /* Full NUMA hinting faults to serialise migration in fault paths */ + if ((flags & FOLL_NUMA) && pmd_numa(*pmd)) + goto out; + page = pmd_page(*pmd); VM_BUG_ON(!PageHead(page)); if (flags & FOLL_TOUCH) { @@ -1323,23 +1327,27 @@ int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, /* If the page was locked, there are no parallel migrations */ if (page_locked) goto clear_pmdnuma; + } - /* - * Otherwise wait for potential migrations and retry. We do - * relock and check_same as the page may no longer be mapped. - * As the fault is being retried, do not account for it. - */ + /* + * If there are potential migrations, wait for completion and retry. We + * do not relock and check_same as the page may no longer be mapped. + * Furtermore, even if the page is currently misplaced, there is no + * guarantee it is still misplaced after the migration completes. + */ + if (!page_locked) { spin_unlock(ptl); wait_on_page_locked(page); page_nid = -1; goto out; } - /* Page is misplaced, serialise migrations and parallel THP splits */ + /* + * Page is misplaced. Page lock serialises migrations. Acquire anon_vma + * to serialises splits + */ get_page(page); spin_unlock(ptl); - if (!page_locked) - lock_page(page); anon_vma = page_lock_anon_vma_read(page); /* Confirm the PMD did not change while page_table_lock was released */ diff --git a/mm/migrate.c b/mm/migrate.c index bb940045fe85..2cabbd5fa5bf 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -1722,6 +1722,7 @@ int migrate_misplaced_transhuge_page(struct mm_struct *mm, struct page *new_page = NULL; struct mem_cgroup *memcg = NULL; int page_lru = page_is_file_cache(page); + pmd_t orig_entry; /* * Rate-limit the amount of data that is being migrated to a node. @@ -1756,7 +1757,8 @@ int migrate_misplaced_transhuge_page(struct mm_struct *mm, /* Recheck the target PMD */ ptl = pmd_lock(mm, pmd); - if (unlikely(!pmd_same(*pmd, entry))) { + if (unlikely(!pmd_same(*pmd, entry) || page_count(page) != 2)) { +fail_putback: spin_unlock(ptl); /* Reverse changes made by migrate_page_copy() */ @@ -1786,16 +1788,34 @@ int migrate_misplaced_transhuge_page(struct mm_struct *mm, */ mem_cgroup_prepare_migration(page, new_page, &memcg); + orig_entry = *pmd; entry = mk_pmd(new_page, vma->vm_page_prot); - entry = pmd_mknonnuma(entry); - entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma); entry = pmd_mkhuge(entry); + entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma); + /* + * Clear the old entry under pagetable lock and establish the new PTE. + * Any parallel GUP will either observe the old page blocking on the + * page lock, block on the page table lock or observe the new page. + * The SetPageUptodate on the new page and page_add_new_anon_rmap + * guarantee the copy is visible before the pagetable update. + */ + flush_cache_range(vma, haddr, haddr + HPAGE_PMD_SIZE); + page_add_new_anon_rmap(new_page, vma, haddr); pmdp_clear_flush(vma, haddr, pmd); set_pmd_at(mm, haddr, pmd, entry); - page_add_new_anon_rmap(new_page, vma, haddr); update_mmu_cache_pmd(vma, address, &entry); + + if (page_count(page) != 2) { + set_pmd_at(mm, haddr, pmd, orig_entry); + flush_tlb_range(vma, haddr, haddr + HPAGE_PMD_SIZE); + update_mmu_cache_pmd(vma, address, &entry); + page_remove_rmap(new_page); + goto fail_putback; + } + page_remove_rmap(page); + /* * Finish the charge transaction under the page table lock to * prevent split_huge_page() from dividing up the charge @@ -1820,9 +1840,13 @@ int migrate_misplaced_transhuge_page(struct mm_struct *mm, out_fail: count_vm_events(PGMIGRATE_FAIL, HPAGE_PMD_NR); out_dropref: - entry = pmd_mknonnuma(entry); - set_pmd_at(mm, haddr, pmd, entry); - update_mmu_cache_pmd(vma, address, &entry); + ptl = pmd_lock(mm, pmd); + if (pmd_same(*pmd, entry)) { + entry = pmd_mknonnuma(entry); + set_pmd_at(mm, haddr, pmd, entry); + update_mmu_cache_pmd(vma, address, &entry); + } + spin_unlock(ptl); unlock_page(page); put_page(page); From f714f4f20e59ea6eea264a86b9a51fd51b88fc54 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Wed, 18 Dec 2013 17:08:33 -0800 Subject: [PATCH 03/24] mm: numa: call MMU notifiers on THP migration MMU notifiers must be called on THP page migration or secondary MMUs will get very confused. Signed-off-by: Mel Gorman Reviewed-by: Rik van Riel Cc: Alex Thorlton Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/migrate.c | 22 ++++++++++++++-------- 1 file changed, 14 insertions(+), 8 deletions(-) diff --git a/mm/migrate.c b/mm/migrate.c index 2cabbd5fa5bf..be787d506fbb 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -36,6 +36,7 @@ #include #include #include +#include #include @@ -1716,12 +1717,13 @@ int migrate_misplaced_transhuge_page(struct mm_struct *mm, struct page *page, int node) { spinlock_t *ptl; - unsigned long haddr = address & HPAGE_PMD_MASK; pg_data_t *pgdat = NODE_DATA(node); int isolated = 0; struct page *new_page = NULL; struct mem_cgroup *memcg = NULL; int page_lru = page_is_file_cache(page); + unsigned long mmun_start = address & HPAGE_PMD_MASK; + unsigned long mmun_end = mmun_start + HPAGE_PMD_SIZE; pmd_t orig_entry; /* @@ -1756,10 +1758,12 @@ int migrate_misplaced_transhuge_page(struct mm_struct *mm, WARN_ON(PageLRU(new_page)); /* Recheck the target PMD */ + mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end); ptl = pmd_lock(mm, pmd); if (unlikely(!pmd_same(*pmd, entry) || page_count(page) != 2)) { fail_putback: spin_unlock(ptl); + mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); /* Reverse changes made by migrate_page_copy() */ if (TestClearPageActive(new_page)) @@ -1800,15 +1804,16 @@ fail_putback: * The SetPageUptodate on the new page and page_add_new_anon_rmap * guarantee the copy is visible before the pagetable update. */ - flush_cache_range(vma, haddr, haddr + HPAGE_PMD_SIZE); - page_add_new_anon_rmap(new_page, vma, haddr); - pmdp_clear_flush(vma, haddr, pmd); - set_pmd_at(mm, haddr, pmd, entry); + flush_cache_range(vma, mmun_start, mmun_end); + page_add_new_anon_rmap(new_page, vma, mmun_start); + pmdp_clear_flush(vma, mmun_start, pmd); + set_pmd_at(mm, mmun_start, pmd, entry); + flush_tlb_range(vma, mmun_start, mmun_end); update_mmu_cache_pmd(vma, address, &entry); if (page_count(page) != 2) { - set_pmd_at(mm, haddr, pmd, orig_entry); - flush_tlb_range(vma, haddr, haddr + HPAGE_PMD_SIZE); + set_pmd_at(mm, mmun_start, pmd, orig_entry); + flush_tlb_range(vma, mmun_start, mmun_end); update_mmu_cache_pmd(vma, address, &entry); page_remove_rmap(new_page); goto fail_putback; @@ -1823,6 +1828,7 @@ fail_putback: */ mem_cgroup_end_migration(memcg, page, new_page, true); spin_unlock(ptl); + mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); unlock_page(new_page); unlock_page(page); @@ -1843,7 +1849,7 @@ out_dropref: ptl = pmd_lock(mm, pmd); if (pmd_same(*pmd, entry)) { entry = pmd_mknonnuma(entry); - set_pmd_at(mm, haddr, pmd, entry); + set_pmd_at(mm, mmun_start, pmd, entry); update_mmu_cache_pmd(vma, address, &entry); } spin_unlock(ptl); From 67f87463d3a3362424efcbe8b40e4772fd34fc61 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Wed, 18 Dec 2013 17:08:34 -0800 Subject: [PATCH 04/24] mm: clear pmd_numa before invalidating On x86, PMD entries are similar to _PAGE_PROTNONE protection and are handled as NUMA hinting faults. The following two page table protection bits are what defines them _PAGE_NUMA:set _PAGE_PRESENT:clear A PMD is considered present if any of the _PAGE_PRESENT, _PAGE_PROTNONE, _PAGE_PSE or _PAGE_NUMA bits are set. If pmdp_invalidate encounters a pmd_numa, it clears the present bit leaving _PAGE_NUMA which will be considered not present by the CPU but present by pmd_present. The existing caller of pmdp_invalidate should handle it but it's an inconsistent state for a PMD. This patch keeps the state consistent when calling pmdp_invalidate. Signed-off-by: Mel Gorman Reviewed-by: Rik van Riel Cc: Alex Thorlton Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/pgtable-generic.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/mm/pgtable-generic.c b/mm/pgtable-generic.c index cbb38545d9d6..e84cad27a801 100644 --- a/mm/pgtable-generic.c +++ b/mm/pgtable-generic.c @@ -191,6 +191,9 @@ pgtable_t pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp) void pmdp_invalidate(struct vm_area_struct *vma, unsigned long address, pmd_t *pmdp) { + pmd_t entry = *pmdp; + if (pmd_numa(entry)) + entry = pmd_mknonnuma(entry); set_pmd_at(vma->vm_mm, address, pmdp, pmd_mknotpresent(*pmdp)); flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE); } From 5a6dac3ec5f583cc8ee7bc53b5500a207c4ca433 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Wed, 18 Dec 2013 17:08:36 -0800 Subject: [PATCH 05/24] mm: numa: do not clear PMD during PTE update scan If the PMD is flushed then a parallel fault in handle_mm_fault() will enter the pmd_none and do_huge_pmd_anonymous_page() path where it'll attempt to insert a huge zero page. This is wasteful so the patch avoids clearing the PMD when setting pmd_numa. Signed-off-by: Mel Gorman Reviewed-by: Rik van Riel Cc: Alex Thorlton Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/huge_memory.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 51f069303ab9..420826efda48 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1539,7 +1539,7 @@ int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, */ if (!is_huge_zero_page(page) && !pmd_numa(*pmd)) { - entry = pmdp_get_and_clear(mm, addr, pmd); + entry = *pmd; entry = pmd_mknuma(entry); ret = HPAGE_PMD_NR; } From 0c5f83c23ca703d32f930393825487257a5cde6d Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Wed, 18 Dec 2013 17:08:37 -0800 Subject: [PATCH 06/24] mm: numa: do not clear PTE for pte_numa update The TLB must be flushed if the PTE is updated but change_pte_range is clearing the PTE while marking PTEs pte_numa without necessarily flushing the TLB if it reinserts the same entry. Without the flush, it's conceivable that two processors have different TLBs for the same virtual address and at the very least it would generate spurious faults. This patch only unmaps the pages in change_pte_range for a full protection change. [riel@redhat.com: write pte_numa pte back to the page tables] Signed-off-by: Mel Gorman Signed-off-by: Rik van Riel Reviewed-by: Rik van Riel Cc: Alex Thorlton Cc: Chegu Vinod Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mprotect.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/mm/mprotect.c b/mm/mprotect.c index 26667971c824..1291a053b167 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -52,17 +52,19 @@ static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd, pte_t ptent; bool updated = false; - ptent = ptep_modify_prot_start(mm, addr, pte); if (!prot_numa) { + ptent = ptep_modify_prot_start(mm, addr, pte); ptent = pte_modify(ptent, newprot); updated = true; } else { struct page *page; + ptent = *pte; page = vm_normal_page(vma, addr, oldpte); if (page) { if (!pte_numa(oldpte)) { ptent = pte_mknuma(ptent); + set_pte_at(mm, addr, pte, ptent); updated = true; } } @@ -79,7 +81,10 @@ static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd, if (updated) pages++; - ptep_modify_prot_commit(mm, addr, pte, ptent); + + /* Only !prot_numa always clears the pte */ + if (!prot_numa) + ptep_modify_prot_commit(mm, addr, pte, ptent); } else if (IS_ENABLED(CONFIG_MIGRATION) && !pte_file(oldpte)) { swp_entry_t entry = pte_to_swp_entry(oldpte); From c3a489cac38d43ea6dc4ac240473b44b46deecf7 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Wed, 18 Dec 2013 17:08:38 -0800 Subject: [PATCH 07/24] mm: numa: ensure anon_vma is locked to prevent parallel THP splits The anon_vma lock prevents parallel THP splits and any associated complexity that arises when handling splits during THP migration. This patch checks if the lock was successfully acquired and bails from THP migration if it failed for any reason. Signed-off-by: Mel Gorman Reviewed-by: Rik van Riel Cc: Alex Thorlton Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/huge_memory.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 420826efda48..dbafffa5e2ee 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1359,6 +1359,13 @@ int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, goto out_unlock; } + /* Bail if we fail to protect against THP splits for any reason */ + if (unlikely(!anon_vma)) { + put_page(page); + page_nid = -1; + goto clear_pmdnuma; + } + /* * Migrate the THP to the requested node, returns with page unlocked * and pmd_numa cleared. From eb4489f69f224356193364dc2762aa009738ca7f Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Wed, 18 Dec 2013 17:08:39 -0800 Subject: [PATCH 08/24] mm: numa: avoid unnecessary work on the failure path If a PMD changes during a THP migration then migration aborts but the failure path is doing more work than is necessary. Signed-off-by: Mel Gorman Reviewed-by: Rik van Riel Cc: Alex Thorlton Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/migrate.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/mm/migrate.c b/mm/migrate.c index be787d506fbb..a987525810ae 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -1780,7 +1780,8 @@ fail_putback: putback_lru_page(page); mod_zone_page_state(page_zone(page), NR_ISOLATED_ANON + page_lru, -HPAGE_PMD_NR); - goto out_fail; + + goto out_unlock; } /* @@ -1854,6 +1855,7 @@ out_dropref: } spin_unlock(ptl); +out_unlock: unlock_page(page); put_page(page); return 0; From 3c67f474558748b604e247d92b55dfe89654c81d Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Wed, 18 Dec 2013 17:08:40 -0800 Subject: [PATCH 09/24] sched: numa: skip inaccessible VMAs Inaccessible VMA should not be trapping NUMA hint faults. Skip them. Signed-off-by: Mel Gorman Reviewed-by: Rik van Riel Cc: Alex Thorlton Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sched/fair.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 9030da7bcb15..c7395d97e4cb 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -1738,6 +1738,13 @@ void task_numa_work(struct callback_head *work) (vma->vm_file && (vma->vm_flags & (VM_READ|VM_WRITE)) == (VM_READ))) continue; + /* + * Skip inaccessible VMAs to avoid any confusion between + * PROT_NONE and NUMA hinting ptes + */ + if (!(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE))) + continue; + do { start = max(start, vma->vm_start); end = ALIGN(start + (pages << PAGE_SHIFT), HPAGE_SIZE); From 1667918b6483b12a6496bf54151b827b8235d7b1 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Wed, 18 Dec 2013 17:08:41 -0800 Subject: [PATCH 10/24] mm: numa: clear numa hinting information on mprotect On a protection change it is no longer clear if the page should be still accessible. This patch clears the NUMA hinting fault bits on a protection change. Signed-off-by: Mel Gorman Reviewed-by: Rik van Riel Cc: Alex Thorlton Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/huge_memory.c | 2 ++ mm/mprotect.c | 2 ++ 2 files changed, 4 insertions(+) diff --git a/mm/huge_memory.c b/mm/huge_memory.c index dbafffa5e2ee..70e7429fd8ea 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1532,6 +1532,8 @@ int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, ret = 1; if (!prot_numa) { entry = pmdp_get_and_clear(mm, addr, pmd); + if (pmd_numa(entry)) + entry = pmd_mknonnuma(entry); entry = pmd_modify(entry, newprot); ret = HPAGE_PMD_NR; BUG_ON(pmd_write(entry)); diff --git a/mm/mprotect.c b/mm/mprotect.c index 1291a053b167..f8421722acb9 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -54,6 +54,8 @@ static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd, if (!prot_numa) { ptent = ptep_modify_prot_start(mm, addr, pte); + if (pte_numa(ptent)) + ptent = pte_mknonnuma(ptent); ptent = pte_modify(ptent, newprot); updated = true; } else { From de466bd628e8d663fdf3f791bc8db318ee85c714 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Wed, 18 Dec 2013 17:08:42 -0800 Subject: [PATCH 11/24] mm: numa: avoid unnecessary disruption of NUMA hinting during migration do_huge_pmd_numa_page() handles the case where there is parallel THP migration. However, by the time it is checked the NUMA hinting information has already been disrupted. This patch adds an earlier check with some helpers. Signed-off-by: Mel Gorman Reviewed-by: Rik van Riel Cc: Alex Thorlton Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/migrate.h | 9 +++++++++ mm/huge_memory.c | 22 ++++++++++++++++------ mm/migrate.c | 12 ++++++++++++ 3 files changed, 37 insertions(+), 6 deletions(-) diff --git a/include/linux/migrate.h b/include/linux/migrate.h index f5096b58b20d..b7717d74da7f 100644 --- a/include/linux/migrate.h +++ b/include/linux/migrate.h @@ -90,10 +90,19 @@ static inline int migrate_huge_page_move_mapping(struct address_space *mapping, #endif /* CONFIG_MIGRATION */ #ifdef CONFIG_NUMA_BALANCING +extern bool pmd_trans_migrating(pmd_t pmd); +extern void wait_migrate_huge_page(struct anon_vma *anon_vma, pmd_t *pmd); extern int migrate_misplaced_page(struct page *page, struct vm_area_struct *vma, int node); extern bool migrate_ratelimited(int node); #else +static inline bool pmd_trans_migrating(pmd_t pmd) +{ + return false; +} +static inline void wait_migrate_huge_page(struct anon_vma *anon_vma, pmd_t *pmd) +{ +} static inline int migrate_misplaced_page(struct page *page, struct vm_area_struct *vma, int node) { diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 70e7429fd8ea..7de1bf85f683 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -882,6 +882,10 @@ int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm, ret = 0; goto out_unlock; } + + /* mmap_sem prevents this happening but warn if that changes */ + WARN_ON(pmd_trans_migrating(pmd)); + if (unlikely(pmd_trans_splitting(pmd))) { /* split huge page running from under us */ spin_unlock(src_ptl); @@ -1299,6 +1303,17 @@ int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, if (unlikely(!pmd_same(pmd, *pmdp))) goto out_unlock; + /* + * If there are potential migrations, wait for completion and retry + * without disrupting NUMA hinting information. Do not relock and + * check_same as the page may no longer be mapped. + */ + if (unlikely(pmd_trans_migrating(*pmdp))) { + spin_unlock(ptl); + wait_migrate_huge_page(vma->anon_vma, pmdp); + goto out; + } + page = pmd_page(pmd); BUG_ON(is_huge_zero_page(page)); page_nid = page_to_nid(page); @@ -1329,12 +1344,7 @@ int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, goto clear_pmdnuma; } - /* - * If there are potential migrations, wait for completion and retry. We - * do not relock and check_same as the page may no longer be mapped. - * Furtermore, even if the page is currently misplaced, there is no - * guarantee it is still misplaced after the migration completes. - */ + /* Migration could have started since the pmd_trans_migrating check */ if (!page_locked) { spin_unlock(ptl); wait_on_page_locked(page); diff --git a/mm/migrate.c b/mm/migrate.c index a987525810ae..cfb419085261 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -1655,6 +1655,18 @@ int numamigrate_isolate_page(pg_data_t *pgdat, struct page *page) return 1; } +bool pmd_trans_migrating(pmd_t pmd) +{ + struct page *page = pmd_page(pmd); + return PageLocked(page); +} + +void wait_migrate_huge_page(struct anon_vma *anon_vma, pmd_t *pmd) +{ + struct page *page = pmd_page(*pmd); + wait_on_page_locked(page); +} + /* * Attempt to migrate a misplaced page to the specified destination * node. Caller is expected to have an elevated reference count on From 20841405940e7be0617612d521e206e4b6b325db Mon Sep 17 00:00:00 2001 From: Rik van Riel Date: Wed, 18 Dec 2013 17:08:44 -0800 Subject: [PATCH 12/24] mm: fix TLB flush race between migration, and change_protection_range There are a few subtle races, between change_protection_range (used by mprotect and change_prot_numa) on one side, and NUMA page migration and compaction on the other side. The basic race is that there is a time window between when the PTE gets made non-present (PROT_NONE or NUMA), and the TLB is flushed. During that time, a CPU may continue writing to the page. This is fine most of the time, however compaction or the NUMA migration code may come in, and migrate the page away. When that happens, the CPU may continue writing, through the cached translation, to what is no longer the current memory location of the process. This only affects x86, which has a somewhat optimistic pte_accessible. All other architectures appear to be safe, and will either always flush, or flush whenever there is a valid mapping, even with no permissions (SPARC). The basic race looks like this: CPU A CPU B CPU C load TLB entry make entry PTE/PMD_NUMA fault on entry read/write old page start migrating page change PTE/PMD to new page read/write old page [*] flush TLB reload TLB from new entry read/write new page lose data [*] the old page may belong to a new user at this point! The obvious fix is to flush remote TLB entries, by making sure that pte_accessible aware of the fact that PROT_NONE and PROT_NUMA memory may still be accessible if there is a TLB flush pending for the mm. This should fix both NUMA migration and compaction. [mgorman@suse.de: fix build] Signed-off-by: Rik van Riel Signed-off-by: Mel Gorman Cc: Alex Thorlton Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/sparc/include/asm/pgtable_64.h | 4 +-- arch/x86/include/asm/pgtable.h | 11 ++++++-- include/asm-generic/pgtable.h | 2 +- include/linux/mm_types.h | 44 +++++++++++++++++++++++++++++ kernel/fork.c | 1 + mm/huge_memory.c | 7 +++++ mm/mprotect.c | 2 ++ mm/pgtable-generic.c | 5 ++-- 8 files changed, 69 insertions(+), 7 deletions(-) diff --git a/arch/sparc/include/asm/pgtable_64.h b/arch/sparc/include/asm/pgtable_64.h index 8358dc144959..0f9e94537eee 100644 --- a/arch/sparc/include/asm/pgtable_64.h +++ b/arch/sparc/include/asm/pgtable_64.h @@ -619,7 +619,7 @@ static inline unsigned long pte_present(pte_t pte) } #define pte_accessible pte_accessible -static inline unsigned long pte_accessible(pte_t a) +static inline unsigned long pte_accessible(struct mm_struct *mm, pte_t a) { return pte_val(a) & _PAGE_VALID; } @@ -847,7 +847,7 @@ static inline void __set_pte_at(struct mm_struct *mm, unsigned long addr, * SUN4V NOTE: _PAGE_VALID is the same value in both the SUN4U * and SUN4V pte layout, so this inline test is fine. */ - if (likely(mm != &init_mm) && pte_accessible(orig)) + if (likely(mm != &init_mm) && pte_accessible(mm, orig)) tlb_batch_add(mm, addr, ptep, orig, fullmm); } diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h index 3d1999458709..bbc8b12fa443 100644 --- a/arch/x86/include/asm/pgtable.h +++ b/arch/x86/include/asm/pgtable.h @@ -452,9 +452,16 @@ static inline int pte_present(pte_t a) } #define pte_accessible pte_accessible -static inline int pte_accessible(pte_t a) +static inline bool pte_accessible(struct mm_struct *mm, pte_t a) { - return pte_flags(a) & _PAGE_PRESENT; + if (pte_flags(a) & _PAGE_PRESENT) + return true; + + if ((pte_flags(a) & (_PAGE_PROTNONE | _PAGE_NUMA)) && + mm_tlb_flush_pending(mm)) + return true; + + return false; } static inline int pte_hidden(pte_t pte) diff --git a/include/asm-generic/pgtable.h b/include/asm-generic/pgtable.h index f330d28e4d0e..b12079afbd5f 100644 --- a/include/asm-generic/pgtable.h +++ b/include/asm-generic/pgtable.h @@ -217,7 +217,7 @@ static inline int pmd_same(pmd_t pmd_a, pmd_t pmd_b) #endif #ifndef pte_accessible -# define pte_accessible(pte) ((void)(pte),1) +# define pte_accessible(mm, pte) ((void)(pte), 1) #endif #ifndef flush_tlb_fix_spurious_fault diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index bd299418a934..e5c49c30460f 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -442,6 +442,14 @@ struct mm_struct { /* numa_scan_seq prevents two threads setting pte_numa */ int numa_scan_seq; +#endif +#if defined(CONFIG_NUMA_BALANCING) || defined(CONFIG_COMPACTION) + /* + * An operation with batched TLB flushing is going on. Anything that + * can move process memory needs to flush the TLB when moving a + * PROT_NONE or PROT_NUMA mapped page. + */ + bool tlb_flush_pending; #endif struct uprobes_state uprobes_state; }; @@ -459,4 +467,40 @@ static inline cpumask_t *mm_cpumask(struct mm_struct *mm) return mm->cpu_vm_mask_var; } +#if defined(CONFIG_NUMA_BALANCING) || defined(CONFIG_COMPACTION) +/* + * Memory barriers to keep this state in sync are graciously provided by + * the page table locks, outside of which no page table modifications happen. + * The barriers below prevent the compiler from re-ordering the instructions + * around the memory barriers that are already present in the code. + */ +static inline bool mm_tlb_flush_pending(struct mm_struct *mm) +{ + barrier(); + return mm->tlb_flush_pending; +} +static inline void set_tlb_flush_pending(struct mm_struct *mm) +{ + mm->tlb_flush_pending = true; + barrier(); +} +/* Clearing is done after a TLB flush, which also provides a barrier. */ +static inline void clear_tlb_flush_pending(struct mm_struct *mm) +{ + barrier(); + mm->tlb_flush_pending = false; +} +#else +static inline bool mm_tlb_flush_pending(struct mm_struct *mm) +{ + return false; +} +static inline void set_tlb_flush_pending(struct mm_struct *mm) +{ +} +static inline void clear_tlb_flush_pending(struct mm_struct *mm) +{ +} +#endif + #endif /* _LINUX_MM_TYPES_H */ diff --git a/kernel/fork.c b/kernel/fork.c index 728d5be9548c..5721f0e3f2da 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -537,6 +537,7 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p) spin_lock_init(&mm->page_table_lock); mm_init_aio(mm); mm_init_owner(mm, p); + clear_tlb_flush_pending(mm); if (likely(!mm_alloc_pgd(mm))) { mm->def_flags = 0; diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 7de1bf85f683..3d2783e10596 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1376,6 +1376,13 @@ int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, goto clear_pmdnuma; } + /* + * The page_table_lock above provides a memory barrier + * with change_protection_range. + */ + if (mm_tlb_flush_pending(mm)) + flush_tlb_range(vma, haddr, haddr + HPAGE_PMD_SIZE); + /* * Migrate the THP to the requested node, returns with page unlocked * and pmd_numa cleared. diff --git a/mm/mprotect.c b/mm/mprotect.c index f8421722acb9..bb53a6591aea 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -188,6 +188,7 @@ static unsigned long change_protection_range(struct vm_area_struct *vma, BUG_ON(addr >= end); pgd = pgd_offset(mm, addr); flush_cache_range(vma, addr, end); + set_tlb_flush_pending(mm); do { next = pgd_addr_end(addr, end); if (pgd_none_or_clear_bad(pgd)) @@ -199,6 +200,7 @@ static unsigned long change_protection_range(struct vm_area_struct *vma, /* Only flush the TLB if we actually modified any entries: */ if (pages) flush_tlb_range(vma, start, end); + clear_tlb_flush_pending(mm); return pages; } diff --git a/mm/pgtable-generic.c b/mm/pgtable-generic.c index e84cad27a801..a8b919925934 100644 --- a/mm/pgtable-generic.c +++ b/mm/pgtable-generic.c @@ -110,9 +110,10 @@ int pmdp_clear_flush_young(struct vm_area_struct *vma, pte_t ptep_clear_flush(struct vm_area_struct *vma, unsigned long address, pte_t *ptep) { + struct mm_struct *mm = (vma)->vm_mm; pte_t pte; - pte = ptep_get_and_clear((vma)->vm_mm, address, ptep); - if (pte_accessible(pte)) + pte = ptep_get_and_clear(mm, address, ptep); + if (pte_accessible(mm, pte)) flush_tlb_page(vma, address); return pte; } From af2c1401e6f9177483be4fad876d0073669df9df Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Wed, 18 Dec 2013 17:08:45 -0800 Subject: [PATCH 13/24] mm: numa: guarantee that tlb_flush_pending updates are visible before page table updates According to documentation on barriers, stores issued before a LOCK can complete after the lock implying that it's possible tlb_flush_pending can be visible after a page table update. As per revised documentation, this patch adds a smp_mb__before_spinlock to guarantee the correct ordering. Signed-off-by: Mel Gorman Acked-by: Paul E. McKenney Reviewed-by: Rik van Riel Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm_types.h | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index e5c49c30460f..ad0616f2fe2c 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -482,7 +482,12 @@ static inline bool mm_tlb_flush_pending(struct mm_struct *mm) static inline void set_tlb_flush_pending(struct mm_struct *mm) { mm->tlb_flush_pending = true; - barrier(); + + /* + * Guarantee that the tlb_flush_pending store does not leak into the + * critical section updating the page tables + */ + smp_mb__before_spinlock(); } /* Clearing is done after a TLB flush, which also provides a barrier. */ static inline void clear_tlb_flush_pending(struct mm_struct *mm) From b0943d61b8fa420180f92f64ef67662b4f6cc493 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Wed, 18 Dec 2013 17:08:46 -0800 Subject: [PATCH 14/24] mm: numa: defer TLB flush for THP migration as long as possible THP migration can fail for a variety of reasons. Avoid flushing the TLB to deal with THP migration races until the copy is ready to start. Signed-off-by: Mel Gorman Reviewed-by: Rik van Riel Cc: Alex Thorlton Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/huge_memory.c | 7 ------- mm/migrate.c | 3 +++ 2 files changed, 3 insertions(+), 7 deletions(-) diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 3d2783e10596..7de1bf85f683 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1376,13 +1376,6 @@ int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, goto clear_pmdnuma; } - /* - * The page_table_lock above provides a memory barrier - * with change_protection_range. - */ - if (mm_tlb_flush_pending(mm)) - flush_tlb_range(vma, haddr, haddr + HPAGE_PMD_SIZE); - /* * Migrate the THP to the requested node, returns with page unlocked * and pmd_numa cleared. diff --git a/mm/migrate.c b/mm/migrate.c index cfb419085261..e9b710201335 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -1759,6 +1759,9 @@ int migrate_misplaced_transhuge_page(struct mm_struct *mm, goto out_fail; } + if (mm_tlb_flush_pending(mm)) + flush_tlb_range(vma, mmun_start, mmun_end); + /* Prepare a page as a migration target */ __set_page_locked(new_page); SetPageSwapBacked(new_page); From 73f038b863dfe98acabc7c36c17342b84ad52e94 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Wed, 18 Dec 2013 17:08:47 -0800 Subject: [PATCH 15/24] mm: page_alloc: exclude unreclaimable allocations from zone fairness policy Dave Hansen noted a regression in a microbenchmark that loops around open() and close() on an 8-node NUMA machine and bisected it down to commit 81c0a2bb515f ("mm: page_alloc: fair zone allocator policy"). That change forces the slab allocations of the file descriptor to spread out to all 8 nodes, causing remote references in the page allocator and slab. The round-robin policy is only there to provide fairness among memory allocations that are reclaimed involuntarily based on pressure in each zone. It does not make sense to apply it to unreclaimable kernel allocations that are freed manually, in this case instantly after the allocation, and incur the remote reference costs twice for no reason. Only round-robin allocations that are usually freed through page reclaim or slab shrinking. Bisected by Dave Hansen. Signed-off-by: Johannes Weiner Cc: Dave Hansen Cc: Mel Gorman Reviewed-by: Rik van Riel Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 580a5f075ed0..f861d0257e90 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1920,7 +1920,8 @@ zonelist_scan: * back to remote zones that do not partake in the * fairness round-robin cycle of this zonelist. */ - if (alloc_flags & ALLOC_WMARK_LOW) { + if ((alloc_flags & ALLOC_WMARK_LOW) && + (gfp_mask & GFP_MOVABLE_MASK)) { if (zone_page_state(zone, NR_ALLOC_BATCH) <= 0) continue; if (zone_reclaim_mode && From 84ed8a99058e61567f495cc43118344261641c5f Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Wed, 18 Dec 2013 17:08:48 -0800 Subject: [PATCH 16/24] sh: always link in helper functions extracted from libgcc E.g. landisk_defconfig, which has CONFIG_NTFS_FS=m: ERROR: "__ashrdi3" [fs/ntfs/ntfs.ko] undefined! For "lib-y", if no symbols in a compilation unit are referenced by other units, the compilation unit will not be included in vmlinux. This breaks modules that do reference those symbols. Use "obj-y" instead to fix this. http://kisskb.ellerman.id.au/kisskb/buildresult/8838077/ This doesn't fix all cases. There are others, e.g. udivsi3. This is also not limited to sh, many architectures handle this in the same way. A simple solution is to unconditionally include all helper functions. A more complex solution is to make the choice of "lib-y" or "obj-y" depend on CONFIG_MODULES: obj-$(CONFIG_MODULES) += ... lib-y($CONFIG_MODULES) += ... Signed-off-by: Geert Uytterhoeven Cc: Paul Mundt Tested-by: Nobuhiro Iwamatsu Reviewed-by: Nobuhiro Iwamatsu Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/sh/lib/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/sh/lib/Makefile b/arch/sh/lib/Makefile index 7b95f29e3174..3baff31e58cf 100644 --- a/arch/sh/lib/Makefile +++ b/arch/sh/lib/Makefile @@ -6,7 +6,7 @@ lib-y = delay.o memmove.o memchr.o \ checksum.o strlen.o div64.o div64-generic.o # Extracted from libgcc -lib-y += movmem.o ashldi3.o ashrdi3.o lshrdi3.o \ +obj-y += movmem.o ashldi3.o ashrdi3.o lshrdi3.o \ ashlsi3.o ashrsi3.o ashiftrt.o lshrsi3.o \ udiv_qrnnd.o From a844f38671d45ee6d981cfe41c5c4c2334578d73 Mon Sep 17 00:00:00 2001 From: Sima Baymani Date: Wed, 18 Dec 2013 17:08:49 -0800 Subject: [PATCH 17/24] mm: add missing dependency in Kconfig Eliminate the following (rand)config warning by adding missing PROC_FS dependency: warning: (HWPOISON_INJECT && MEM_SOFT_DIRTY) selects PROC_PAGE_MONITOR which has unmet direct dependencies (PROC_FS && MMU) Signed-off-by: Sima Baymani Suggested-by: David Rientjes Acked-by: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/Kconfig b/mm/Kconfig index eb69f352401d..723bbe04a0b0 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -543,7 +543,7 @@ config ZSWAP config MEM_SOFT_DIRTY bool "Track memory changes" - depends on CHECKPOINT_RESTORE && HAVE_ARCH_SOFT_DIRTY + depends on CHECKPOINT_RESTORE && HAVE_ARCH_SOFT_DIRTY && PROC_FS select PROC_PAGE_MONITOR help This option enables memory changes tracking by introducing a From b0e5fd7359f1ce8db4ccb862b3aa80d2f2cbf4d0 Mon Sep 17 00:00:00 2001 From: Joonsoo Kim Date: Wed, 18 Dec 2013 17:08:51 -0800 Subject: [PATCH 18/24] mm/mempolicy: correct putback method for isolate pages if failed queue_pages_range() isolates hugetlbfs pages and putback_lru_pages() can't handle these. We should change it to putback_movable_pages(). Naoya said that it is worth going into stable, because it can break in-use hugepage list. Signed-off-by: Joonsoo Kim Acked-by: Rafael Aquini Reviewed-by: Naoya Horiguchi Reviewed-by: Wanpeng Li Cc: Christoph Lameter Cc: Vlastimil Babka Cc: Wanpeng Li Cc: Mel Gorman Cc: Rik van Riel Cc: Vlastimil Babka Cc: Zhang Yanfei Cc: [3.12.x] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mempolicy.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/mempolicy.c b/mm/mempolicy.c index eca4a3129129..6d04d372be29 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -1318,7 +1318,7 @@ static long do_mbind(unsigned long start, unsigned long len, if (nr_failed && (flags & MPOL_MF_STRICT)) err = -EIO; } else - putback_lru_pages(&pagelist); + putback_movable_pages(&pagelist); up_write(&mm->mmap_sem); mpol_out: From 6815bf3f233e0b10c99a758497d5d236063b010b Mon Sep 17 00:00:00 2001 From: Joonsoo Kim Date: Wed, 18 Dec 2013 17:08:52 -0800 Subject: [PATCH 19/24] mm/compaction: respect ignore_skip_hint in update_pageblock_skip update_pageblock_skip() only fits to compaction which tries to isolate by pageblock unit. If isolate_migratepages_range() is called by CMA, it try to isolate regardless of pageblock unit and it don't reference get_pageblock_skip() by ignore_skip_hint. We should also respect it on update_pageblock_skip() to prevent from setting the wrong information. Signed-off-by: Joonsoo Kim Acked-by: Vlastimil Babka Reviewed-by: Naoya Horiguchi Reviewed-by: Wanpeng Li Cc: Christoph Lameter Cc: Rafael Aquini Cc: Vlastimil Babka Cc: Wanpeng Li Cc: Mel Gorman Cc: Rik van Riel Cc: Zhang Yanfei Cc: [3.7+] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/compaction.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/mm/compaction.c b/mm/compaction.c index 805165bcd3dd..f58bcd016f43 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -134,6 +134,10 @@ static void update_pageblock_skip(struct compact_control *cc, bool migrate_scanner) { struct zone *zone = cc->zone; + + if (cc->ignore_skip_hint) + return; + if (!page) return; From a49ecbcd7b0d5a1cda7d60e03df402dd0ef76ac8 Mon Sep 17 00:00:00 2001 From: Jianguo Wu Date: Wed, 18 Dec 2013 17:08:54 -0800 Subject: [PATCH 20/24] mm/memory-failure.c: recheck PageHuge() after hugetlb page migrate successfully After a successful hugetlb page migration by soft offline, the source page will either be freed into hugepage_freelists or buddy(over-commit page). If page is in buddy, page_hstate(page) will be NULL. It will hit a NULL pointer dereference in dequeue_hwpoisoned_huge_page(). BUG: unable to handle kernel NULL pointer dereference at 0000000000000058 IP: [] dequeue_hwpoisoned_huge_page+0x131/0x1d0 PGD c23762067 PUD c24be2067 PMD 0 Oops: 0000 [#1] SMP So check PageHuge(page) after call migrate_pages() successfully. Signed-off-by: Jianguo Wu Tested-by: Naoya Horiguchi Reviewed-by: Naoya Horiguchi Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory-failure.c | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/mm/memory-failure.c b/mm/memory-failure.c index b7c171602ba1..db08af92c6fc 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -1505,10 +1505,16 @@ static int soft_offline_huge_page(struct page *page, int flags) if (ret > 0) ret = -EIO; } else { - set_page_hwpoison_huge_page(hpage); - dequeue_hwpoisoned_huge_page(hpage); - atomic_long_add(1 << compound_order(hpage), - &num_poisoned_pages); + /* overcommit hugetlb page will be freed to buddy */ + if (PageHuge(page)) { + set_page_hwpoison_huge_page(hpage); + dequeue_hwpoisoned_huge_page(hpage); + atomic_long_add(1 << compound_order(hpage), + &num_poisoned_pages); + } else { + SetPageHWPoison(page); + atomic_long_inc(&num_poisoned_pages); + } } return ret; } From 584ec9794293ff69f597f5bec23296e62e94e857 Mon Sep 17 00:00:00 2001 From: Davidlohr Bueso Date: Wed, 18 Dec 2013 17:08:55 -0800 Subject: [PATCH 21/24] MAINTAINERS: add Davidlohr as GPT maintainer Add a new entry for the GPT standard. Any future changes will now be routed through linux-efi. Signed-off-by: Davidlohr Bueso Acked-by: Matt Fleming Cc: Jens Axboe Acked-by: Matt Domsch Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- MAINTAINERS | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/MAINTAINERS b/MAINTAINERS index 75e3db96dec5..3c99c2556405 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -3833,6 +3833,12 @@ T: git git://linuxtv.org/media_tree.git S: Maintained F: drivers/media/usb/gspca/ +GUID PARTITION TABLE (GPT) +M: Davidlohr Bueso +L: linux-efi@vger.kernel.org +S: Maintained +F: block/partitions/efi.* + STK1160 USB VIDEO CAPTURE DRIVER M: Ezequiel Garcia L: linux-media@vger.kernel.org From 11c731e81bb0d8d2e835447a2dd645b34bb74706 Mon Sep 17 00:00:00 2001 From: Wanpeng Li Date: Wed, 18 Dec 2013 17:08:56 -0800 Subject: [PATCH 22/24] mm/mempolicy: fix !vma in new_vma_page() BUG_ON(!vma) assumption is introduced by commit 0bf598d863e3 ("mbind: add BUG_ON(!vma) in new_vma_page()"), however, even if address = __vma_address(page, vma); and vma->start < address < vma->end page_address_in_vma() may still return -EFAULT because of many other conditions in it. As a result the while loop in new_vma_page() may end with vma=NULL. This patch revert the commit and also fix the potential dereference NULL pointer reported by Dan. http://marc.info/?l=linux-mm&m=137689530323257&w=2 kernel BUG at mm/mempolicy.c:1204! invalid opcode: 0000 [#1] PREEMPT SMP DEBUG_PAGEALLOC CPU: 3 PID: 7056 Comm: trinity-child3 Not tainted 3.13.0-rc3+ #2 task: ffff8801ca5295d0 ti: ffff88005ab20000 task.ti: ffff88005ab20000 RIP: new_vma_page+0x70/0x90 RSP: 0000:ffff88005ab21db0 EFLAGS: 00010246 RAX: fffffffffffffff2 RBX: 0000000000000000 RCX: 0000000000000000 RDX: 0000000008040075 RSI: ffff8801c3d74600 RDI: ffffea00079a8b80 RBP: ffff88005ab21dc8 R08: 0000000000000004 R09: 0000000000000000 R10: 0000000000000000 R11: 0000000000000000 R12: fffffffffffffff2 R13: ffffea00079a8b80 R14: 0000000000400000 R15: 0000000000400000 FS: 00007ff49c6f4740(0000) GS:ffff880244e00000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00007ff49c68f994 CR3: 000000005a205000 CR4: 00000000001407e0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Stack: ffffea00079a8b80 ffffea00079a8bc0 ffffea00079a8ba0 ffff88005ab21e50 ffffffff811adc7a 0000000000000000 ffff8801ca5295d0 0000000464e224f8 0000000000000000 0000000000000002 0000000000000000 ffff88020ce75c00 Call Trace: migrate_pages+0x12a/0x850 SYSC_mbind+0x513/0x6a0 SyS_mbind+0xe/0x10 ia32_do_call+0x13/0x13 Code: 85 c0 75 2f 4c 89 e1 48 89 da 31 f6 bf da 00 02 00 65 44 8b 04 25 08 f7 1c 00 e8 ec fd ff ff 5b 41 5c 41 5d 5d c3 0f 1f 44 00 00 <0f> 0b 66 0f 1f 44 00 00 4c 89 e6 48 89 df ba 01 00 00 00 e8 48 RIP [] new_vma_page+0x70/0x90 RSP Signed-off-by: Wanpeng Li Reported-by: Dave Jones Reported-by: Sasha Levin Reviewed-by: Naoya Horiguchi Reviewed-by: Bob Liu Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mempolicy.c | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 6d04d372be29..0cd2c4d4e270 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -1197,14 +1197,16 @@ static struct page *new_vma_page(struct page *page, unsigned long private, int * break; vma = vma->vm_next; } - /* - * queue_pages_range() confirms that @page belongs to some vma, - * so vma shouldn't be NULL. - */ - BUG_ON(!vma); - if (PageHuge(page)) - return alloc_huge_page_noerr(vma, address, 1); + if (PageHuge(page)) { + if (vma) + return alloc_huge_page_noerr(vma, address, 1); + else + return NULL; + } + /* + * if !vma, alloc_page_vma() will use task or system default policy + */ return alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address); } #else From 7ac181568342ec618d4da1ab03c2babcaa7ee84f Mon Sep 17 00:00:00 2001 From: Jan Beulich Date: Wed, 18 Dec 2013 17:08:57 -0800 Subject: [PATCH 23/24] fix build with make 3.80 According to Documentation/Changes, make 3.80 is still being supported for building the kernel, hence make files must not make (unconditional) use of features introduced only in newer versions. Commit 1bf49dd4be0b ("./Makefile: export initial ramdisk compression config option") however introduced "else ifeq" constructs which make 3.80 doesn't understand. Replace the logic there with more conventional (in the kernel build infrastructure) list constructs (except that the list here is intentionally limited to exactly one element). Signed-off-by: Jan Beulich Cc: P J P Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Makefile | 20 +++++++------------- 1 file changed, 7 insertions(+), 13 deletions(-) diff --git a/Makefile b/Makefile index 858a147fd836..89cee8625651 100644 --- a/Makefile +++ b/Makefile @@ -732,19 +732,13 @@ export mod_strip_cmd # Select initial ramdisk compression format, default is gzip(1). # This shall be used by the dracut(8) tool while creating an initramfs image. # -INITRD_COMPRESS=gzip -ifeq ($(CONFIG_RD_BZIP2), y) - INITRD_COMPRESS=bzip2 -else ifeq ($(CONFIG_RD_LZMA), y) - INITRD_COMPRESS=lzma -else ifeq ($(CONFIG_RD_XZ), y) - INITRD_COMPRESS=xz -else ifeq ($(CONFIG_RD_LZO), y) - INITRD_COMPRESS=lzo -else ifeq ($(CONFIG_RD_LZ4), y) - INITRD_COMPRESS=lz4 -endif -export INITRD_COMPRESS +INITRD_COMPRESS-y := gzip +INITRD_COMPRESS-$(CONFIG_RD_BZIP2) := bzip2 +INITRD_COMPRESS-$(CONFIG_RD_LZMA) := lzma +INITRD_COMPRESS-$(CONFIG_RD_XZ) := xz +INITRD_COMPRESS-$(CONFIG_RD_LZO) := lzo +INITRD_COMPRESS-$(CONFIG_RD_LZ4) := lz4 +export INITRD_COMPRESS := $(INITRD_COMPRESS-y) ifdef CONFIG_MODULE_SIG_ALL MODSECKEY = ./signing_key.priv From 98398c32f6687ee1e1f3ae084effb4b75adb0747 Mon Sep 17 00:00:00 2001 From: Jianguo Wu Date: Wed, 18 Dec 2013 17:08:59 -0800 Subject: [PATCH 24/24] mm/hugetlb: check for pte NULL pointer in __page_check_address() In __page_check_address(), if address's pud is not present, huge_pte_offset() will return NULL, we should check the return value. Signed-off-by: Jianguo Wu Cc: Naoya Horiguchi Cc: Mel Gorman Cc: qiuxishi Cc: Hanjun Guo Acked-by: Kirill A. Shutemov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/rmap.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/mm/rmap.c b/mm/rmap.c index 55c8b8dc9ffb..068522d8502a 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -600,7 +600,11 @@ pte_t *__page_check_address(struct page *page, struct mm_struct *mm, spinlock_t *ptl; if (unlikely(PageHuge(page))) { + /* when pud is not present, pte will be NULL */ pte = huge_pte_offset(mm, address); + if (!pte) + return NULL; + ptl = huge_pte_lockptr(page_hstate(page), mm, pte); goto check; }