mm/khugepaged: collapse_pte_mapped_thp() with mmap_read_lock()
Bring collapse_and_free_pmd() back into collapse_pte_mapped_thp().  It
does need mmap_read_lock(), but it does not need mmap_write_lock(), nor
vma_start_write() nor i_mmap lock nor anon_vma lock.  All racing paths
are relying on pte_offset_map_lock() and pmd_lock(), so use those.

Follow the pattern in retract_page_tables(); and using pte_free_defer()
removes most of the need for tlb_remove_table_sync_one() here; but call
pmdp_get_lockless_sync() to use it in the PAE case.

First check the VMA, in case page tables are being torn down: from Jann
Horn.  Confirm the preliminary find_pmd_or_thp_or_none() once page lock
has been acquired and the page looks suitable: from then on its state
is stable.

However, collapse_pte_mapped_thp() was doing something others don't:
freeing a page table still containing "valid" entries.  i_mmap lock did
stop a racing truncate from double-freeing those pages, but we prefer
collapse_pte_mapped_thp() to clear the entries as usual.  Their TLB
flush can wait until the pmdp_collapse_flush() which follows, but the
mmu_notifier_invalidate_range_start() has to be done earlier.

Do the "step 1" checking loop without mmu_notifier: it wouldn't be good
for khugepaged to keep on repeatedly invalidating a range which is then
found unsuitable e.g. contains COWs.  "step 2", which does the
clearing, must then be more careful (after dropping ptl to do
mmu_notifier), with abort prepared to correct the accounting like
"step 3".  But with those entries now cleared, "step 4" (after dropping
ptl to do pmd_lock) is kept safe by the huge page lock, which stops new
PTEs from being faulted in.

[hughd@google.com: don't set mmap_locked = true in madvise_collapse()]
  Link: https://lkml.kernel.org/r/d3d9ff14-ef8-8f84-e160-bfa1f5794275@google.com
[hughd@google.com: use ptep_clear() instead of pte_clear()]
  Link: https://lkml.kernel.org/r/e0197433-8a47-6a65-534d-eda26eeb78b0@google.com
Link: https://lkml.kernel.org/r/b53be6a4-7715-51f9-aad-f1347dcb7c4@google.com
Signed-off-by: Hugh Dickins <hughd@google.com>
Reviewed-by: Qi Zheng <zhengqi.arch@bytedance.com>
Cc: Alexander Gordeev <agordeev@linux.ibm.com>
Cc: Alistair Popple <apopple@nvidia.com>
Cc: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
Cc: Anshuman Khandual <anshuman.khandual@arm.com>
Cc: Axel Rasmussen <axelrasmussen@google.com>
Cc: Christian Borntraeger <borntraeger@linux.ibm.com>
Cc: Christophe Leroy <christophe.leroy@csgroup.eu>
Cc: Christoph Hellwig <hch@infradead.org>
Cc: Claudio Imbrenda <imbrenda@linux.ibm.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: "David S. Miller" <davem@davemloft.net>
Cc: Gerald Schaefer <gerald.schaefer@linux.ibm.com>
Cc: Heiko Carstens <hca@linux.ibm.com>
Cc: Huang, Ying <ying.huang@intel.com>
Cc: Ira Weiny <ira.weiny@intel.com>
Cc: Jann Horn <jannh@google.com>
Cc: Jason Gunthorpe <jgg@ziepe.ca>
Cc: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Cc: Lorenzo Stoakes <lstoakes@gmail.com>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Mel Gorman <mgorman@techsingularity.net>
Cc: Miaohe Lin <linmiaohe@huawei.com>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Cc: Mike Kravetz <mike.kravetz@oracle.com>
Cc: Mike Rapoport (IBM) <rppt@kernel.org>
Cc: Minchan Kim <minchan@kernel.org>
Cc: Naoya Horiguchi <naoya.horiguchi@nec.com>
Cc: Pavel Tatashin <pasha.tatashin@soleen.com>
Cc: Peter Xu <peterx@redhat.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Ralph Campbell <rcampbell@nvidia.com>
Cc: Russell King <linux@armlinux.org.uk>
Cc: SeongJae Park <sj@kernel.org>
Cc: Song Liu <song@kernel.org>
Cc: Steven Price <steven.price@arm.com>
Cc: Suren Baghdasaryan <surenb@google.com>
Cc: Thomas Hellström <thomas.hellstrom@linux.intel.com>
Cc: Vasily Gorbik <gor@linux.ibm.com>
Cc: Vishal Moola (Oracle) <vishal.moola@gmail.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Will Deacon <will@kernel.org>
Cc: Yang Shi <shy828301@gmail.com>
Cc: Yu Zhao <yuzhao@google.com>
Cc: Zack Rusin <zackr@vmware.com>
Cc: Zi Yan <ziy@nvidia.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
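For orientation before reading the diff, the sketch below condenses the
locking sequence that collapse_pte_mapped_thp() follows after this patch,
distilled from the hunks that follow.  It is not the literal kernel
function: the preliminary VMA/page checks, the bodies of the step 1 and
step 2 loops, and the abort path are elided, the helper name is made up
for illustration, and it assumes the usual mm/khugepaged.c context
(headers and the SCAN_* result codes).

/*
 * Condensed sketch of the post-patch locking order, distilled from the
 * diff below; checks, loop bodies and the abort path are omitted.
 * Illustrative only -- see the hunks for the real collapse_pte_mapped_thp().
 */
static int collapse_pte_mapped_thp_sketch(struct mm_struct *mm,
		struct vm_area_struct *vma, unsigned long haddr,
		struct page *hpage, pmd_t *pmd)
{
	struct mmu_notifier_range range;
	spinlock_t *pml, *ptl;
	pte_t *start_pte;
	pmd_t pgt_pmd;
	int nr_ptes = 0;

	/* step 1: under pte lock only, check every mapped PTE points into hpage */
	start_pte = pte_offset_map_lock(mm, pmd, haddr, &ptl);
	/* ... scan HPAGE_PMD_NR entries; bail out if anything is unsuitable ... */
	pte_unmap_unlock(start_pte, ptl);

	/* Only now, with the range known suitable, start the mmu_notifier */
	mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm,
				haddr, haddr + HPAGE_PMD_SIZE);
	mmu_notifier_invalidate_range_start(&range);

	/*
	 * step 2: retake pte lock; clear each PTE and adjust rmap, e.g.
	 *	ptep_clear(mm, addr, pte);
	 *	page_remove_rmap(page, vma, false);
	 *	nr_ptes++;
	 * TLB flush is left for pmdp_collapse_flush() below.
	 */
	start_pte = pte_offset_map_lock(mm, pmd, haddr, &ptl);
	/* ... clearing loop elided ... */
	pte_unmap_unlock(start_pte, ptl);

	/* step 3: fix refcount and mm counters for the cleared entries */
	page_ref_sub(hpage, nr_ptes);
	add_mm_counter(mm, mm_counter_file(hpage), -nr_ptes);

	/* step 4: pmd lock (pte lock nested); huge page lock keeps the table empty */
	pml = pmd_lock(mm, pmd);
	if (ptl != pml)
		spin_lock_nested(ptl, SINGLE_DEPTH_NESTING);
	pgt_pmd = pmdp_collapse_flush(vma, haddr, pmd);
	pmdp_get_lockless_sync();	/* covers the PAE lockless pmd read */
	if (ptl != pml)
		spin_unlock(ptl);
	spin_unlock(pml);

	mmu_notifier_invalidate_range_end(&range);

	mm_dec_nr_ptes(mm);
	page_table_check_pte_clear_range(mm, haddr, pgt_pmd);
	pte_free_defer(mm, pmd_pgtable(pgt_pmd));	/* RCU-deferred free */
	return SCAN_SUCCEED;
}

The point to notice is the ordering: mmu_notifier_invalidate_range_start()
is issued only after step 1 has judged the range suitable, and the page
table is freed via pte_free_defer(), which removes most of the need for
tlb_remove_table_sync_one(), with pmdp_get_lockless_sync() covering the
PAE lockless pmd read.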
parent 1d65b771bc
commit 1043173eb5

 mm/khugepaged.c | 172
 1 file changed, 77 insertions(+), 95 deletions(-)

diff --git a/mm/khugepaged.c b/mm/khugepaged.c
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -1485,7 +1485,7 @@ static bool khugepaged_add_pte_mapped_thp(struct mm_struct *mm,
 	return ret;
 }
 
-/* hpage must be locked, and mmap_lock must be held in write */
+/* hpage must be locked, and mmap_lock must be held */
 static int set_huge_pmd(struct vm_area_struct *vma, unsigned long addr,
 			pmd_t *pmdp, struct page *hpage)
 {
@@ -1497,7 +1497,7 @@ static int set_huge_pmd(struct vm_area_struct *vma, unsigned long addr,
 	};
 
 	VM_BUG_ON(!PageTransHuge(hpage));
-	mmap_assert_write_locked(vma->vm_mm);
+	mmap_assert_locked(vma->vm_mm);
 
 	if (do_set_pmd(&vmf, hpage))
 		return SCAN_FAIL;
@@ -1506,48 +1506,6 @@ static int set_huge_pmd(struct vm_area_struct *vma, unsigned long addr,
 	return SCAN_SUCCEED;
 }
 
-/*
- * A note about locking:
- * Trying to take the page table spinlocks would be useless here because those
- * are only used to synchronize:
- *
- *  - modifying terminal entries (ones that point to a data page, not to another
- *    page table)
- *  - installing *new* non-terminal entries
- *
- * Instead, we need roughly the same kind of protection as free_pgtables() or
- * mm_take_all_locks() (but only for a single VMA):
- * The mmap lock together with this VMA's rmap locks covers all paths towards
- * the page table entries we're messing with here, except for hardware page
- * table walks and lockless_pages_from_mm().
- */
-static void collapse_and_free_pmd(struct mm_struct *mm, struct vm_area_struct *vma,
-				  unsigned long addr, pmd_t *pmdp)
-{
-	pmd_t pmd;
-	struct mmu_notifier_range range;
-
-	mmap_assert_write_locked(mm);
-	if (vma->vm_file)
-		lockdep_assert_held_write(&vma->vm_file->f_mapping->i_mmap_rwsem);
-	/*
-	 * All anon_vmas attached to the VMA have the same root and are
-	 * therefore locked by the same lock.
-	 */
-	if (vma->anon_vma)
-		lockdep_assert_held_write(&vma->anon_vma->root->rwsem);
-
-	mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm, addr,
-				addr + HPAGE_PMD_SIZE);
-	mmu_notifier_invalidate_range_start(&range);
-	pmd = pmdp_collapse_flush(vma, addr, pmdp);
-	tlb_remove_table_sync_one();
-	mmu_notifier_invalidate_range_end(&range);
-	mm_dec_nr_ptes(mm);
-	page_table_check_pte_clear_range(mm, addr, pmd);
-	pte_free(mm, pmd_pgtable(pmd));
-}
-
 /**
  * collapse_pte_mapped_thp - Try to collapse a pte-mapped THP for mm at
  *			     address haddr.
@@ -1563,26 +1521,29 @@ static void collapse_and_free_pmd(struct mm_struct *mm, struct vm_area_struct *v
 int collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr,
 			    bool install_pmd)
 {
+	struct mmu_notifier_range range;
+	bool notified = false;
 	unsigned long haddr = addr & HPAGE_PMD_MASK;
 	struct vm_area_struct *vma = vma_lookup(mm, haddr);
 	struct page *hpage;
 	pte_t *start_pte, *pte;
-	pmd_t *pmd;
-	spinlock_t *ptl;
-	int count = 0, result = SCAN_FAIL;
+	pmd_t *pmd, pgt_pmd;
+	spinlock_t *pml, *ptl;
+	int nr_ptes = 0, result = SCAN_FAIL;
 	int i;
 
-	mmap_assert_write_locked(mm);
+	mmap_assert_locked(mm);
+
+	/* First check VMA found, in case page tables are being torn down */
+	if (!vma || !vma->vm_file ||
+	    !range_in_vma(vma, haddr, haddr + HPAGE_PMD_SIZE))
+		return SCAN_VMA_CHECK;
 
 	/* Fast check before locking page if already PMD-mapped */
 	result = find_pmd_or_thp_or_none(mm, haddr, &pmd);
 	if (result == SCAN_PMD_MAPPED)
 		return result;
 
-	if (!vma || !vma->vm_file ||
-	    !range_in_vma(vma, haddr, haddr + HPAGE_PMD_SIZE))
-		return SCAN_VMA_CHECK;
-
 	/*
 	 * If we are here, we've succeeded in replacing all the native pages
 	 * in the page cache with a single hugepage. If a mm were to fault-in
@@ -1612,6 +1573,7 @@ int collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr,
 		goto drop_hpage;
 	}
 
+	result = find_pmd_or_thp_or_none(mm, haddr, &pmd);
 	switch (result) {
 	case SCAN_SUCCEED:
 		break;
@@ -1625,27 +1587,10 @@ int collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr,
 		goto drop_hpage;
 	}
 
-	/* Lock the vma before taking i_mmap and page table locks */
-	vma_start_write(vma);
-
-	/*
-	 * We need to lock the mapping so that from here on, only GUP-fast and
-	 * hardware page walks can access the parts of the page tables that
-	 * we're operating on.
-	 * See collapse_and_free_pmd().
-	 */
-	i_mmap_lock_write(vma->vm_file->f_mapping);
-
-	/*
-	 * This spinlock should be unnecessary: Nobody else should be accessing
-	 * the page tables under spinlock protection here, only
-	 * lockless_pages_from_mm() and the hardware page walker can access page
-	 * tables while all the high-level locks are held in write mode.
-	 */
 	result = SCAN_FAIL;
 	start_pte = pte_offset_map_lock(mm, pmd, haddr, &ptl);
-	if (!start_pte)
-		goto drop_immap;
+	if (!start_pte)		/* mmap_lock + page lock should prevent this */
+		goto drop_hpage;
 
 	/* step 1: check all mapped PTEs are to the right huge page */
 	for (i = 0, addr = haddr, pte = start_pte;
@@ -1672,10 +1617,18 @@ int collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr,
 		 */
 		if (hpage + i != page)
 			goto abort;
-		count++;
 	}
 
-	/* step 2: adjust rmap */
+	pte_unmap_unlock(start_pte, ptl);
+	mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm,
+				haddr, haddr + HPAGE_PMD_SIZE);
+	mmu_notifier_invalidate_range_start(&range);
+	notified = true;
+	start_pte = pte_offset_map_lock(mm, pmd, haddr, &ptl);
+	if (!start_pte)		/* mmap_lock + page lock should prevent this */
+		goto abort;
+
+	/* step 2: clear page table and adjust rmap */
 	for (i = 0, addr = haddr, pte = start_pte;
 	     i < HPAGE_PMD_NR; i++, addr += PAGE_SIZE, pte++) {
 		struct page *page;
@@ -1683,47 +1636,76 @@ int collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr,
 
 		if (pte_none(ptent))
 			continue;
-		page = vm_normal_page(vma, addr, ptent);
-		if (WARN_ON_ONCE(page && is_zone_device_page(page)))
+		/*
+		 * We dropped ptl after the first scan, to do the mmu_notifier:
+		 * page lock stops more PTEs of the hpage being faulted in, but
+		 * does not stop write faults COWing anon copies from existing
+		 * PTEs; and does not stop those being swapped out or migrated.
+		 */
+		if (!pte_present(ptent)) {
+			result = SCAN_PTE_NON_PRESENT;
+			goto abort;
+		}
+		page = vm_normal_page(vma, addr, ptent);
+		if (hpage + i != page)
 			goto abort;
+
+		/*
+		 * Must clear entry, or a racing truncate may re-remove it.
+		 * TLB flush can be left until pmdp_collapse_flush() does it.
+		 * PTE dirty? Shmem page is already dirty; file is read-only.
+		 */
+		ptep_clear(mm, addr, pte);
 		page_remove_rmap(page, vma, false);
+		nr_ptes++;
 	}
 
 	pte_unmap_unlock(start_pte, ptl);
 
 	/* step 3: set proper refcount and mm_counters. */
-	if (count) {
-		page_ref_sub(hpage, count);
-		add_mm_counter(vma->vm_mm, mm_counter_file(hpage), -count);
+	if (nr_ptes) {
+		page_ref_sub(hpage, nr_ptes);
+		add_mm_counter(mm, mm_counter_file(hpage), -nr_ptes);
 	}
 
-	/* step 4: remove pte entries */
-	/* we make no change to anon, but protect concurrent anon page lookup */
-	if (vma->anon_vma)
-		anon_vma_lock_write(vma->anon_vma);
+	/* step 4: remove page table */
 
-	collapse_and_free_pmd(mm, vma, haddr, pmd);
+	/* Huge page lock is still held, so page table must remain empty */
+	pml = pmd_lock(mm, pmd);
+	if (ptl != pml)
+		spin_lock_nested(ptl, SINGLE_DEPTH_NESTING);
+	pgt_pmd = pmdp_collapse_flush(vma, haddr, pmd);
+	pmdp_get_lockless_sync();
+	if (ptl != pml)
+		spin_unlock(ptl);
+	spin_unlock(pml);
 
-	if (vma->anon_vma)
-		anon_vma_unlock_write(vma->anon_vma);
-	i_mmap_unlock_write(vma->vm_file->f_mapping);
+	mmu_notifier_invalidate_range_end(&range);
+
+	mm_dec_nr_ptes(mm);
+	page_table_check_pte_clear_range(mm, haddr, pgt_pmd);
+	pte_free_defer(mm, pmd_pgtable(pgt_pmd));
 
 maybe_install_pmd:
 	/* step 5: install pmd entry */
 	result = install_pmd
 			? set_huge_pmd(vma, haddr, pmd, hpage)
 			: SCAN_SUCCEED;
-
+	goto drop_hpage;
+abort:
+	if (nr_ptes) {
+		flush_tlb_mm(mm);
+		page_ref_sub(hpage, nr_ptes);
+		add_mm_counter(mm, mm_counter_file(hpage), -nr_ptes);
+	}
+	if (start_pte)
+		pte_unmap_unlock(start_pte, ptl);
+	if (notified)
+		mmu_notifier_invalidate_range_end(&range);
 drop_hpage:
 	unlock_page(hpage);
 	put_page(hpage);
 	return result;
-
-abort:
-	pte_unmap_unlock(start_pte, ptl);
-drop_immap:
-	i_mmap_unlock_write(vma->vm_file->f_mapping);
-	goto drop_hpage;
 }
 
 static void khugepaged_collapse_pte_mapped_thps(struct khugepaged_mm_slot *mm_slot)
@@ -2856,9 +2838,9 @@ handle_result:
 	case SCAN_PTE_MAPPED_HUGEPAGE:
 		BUG_ON(mmap_locked);
 		BUG_ON(*prev);
-		mmap_write_lock(mm);
+		mmap_read_lock(mm);
 		result = collapse_pte_mapped_thp(mm, addr, true);
-		mmap_write_unlock(mm);
+		mmap_read_unlock(mm);
 		goto handle_result;
 	/* Whitelisted set of results where continuing OK */
 	case SCAN_PMD_NULL: