mm,thp: add read-only THP support for (non-shmem) FS
This patch is (hopefully) the first step to enable THP for non-shmem filesystems. This patch enables an application to put part of its text sections to THP via madvise, for example: madvise((void *)0x600000, 0x200000, MADV_HUGEPAGE); We tried to reuse the logic for THP on tmpfs. Currently, write is not supported for non-shmem THP. khugepaged will only process vma with VM_DENYWRITE. sys_mmap() ignores VM_DENYWRITE requests (see ksys_mmap_pgoff). The only way to create vma with VM_DENYWRITE is execve(). This requirement limits non-shmem THP to text sections. The next patch will handle writes, which would only happen when all the vmas with VM_DENYWRITE are unmapped. An EXPERIMENTAL config, READ_ONLY_THP_FOR_FS, is added to gate this feature. [songliubraving@fb.com: fix build without CONFIG_SHMEM] Link: http://lkml.kernel.org/r/F53407FB-96CC-42E8-9862-105C92CC2B98@fb.com [songliubraving@fb.com: fix double unlock in collapse_file()] Link: http://lkml.kernel.org/r/B960CBFA-8EFC-4DA4-ABC5-1977FFF2CA57@fb.com Link: http://lkml.kernel.org/r/20190801184244.3169074-7-songliubraving@fb.com Signed-off-by: Song Liu <songliubraving@fb.com> Acked-by: Rik van Riel <riel@surriel.com> Acked-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com> Acked-by: Johannes Weiner <hannes@cmpxchg.org> Cc: Stephen Rothwell <sfr@canb.auug.org.au> Cc: Dan Carpenter <dan.carpenter@oracle.com> Cc: Hillf Danton <hdanton@sina.com> Cc: Hugh Dickins <hughd@google.com> Cc: William Kucharski <william.kucharski@oracle.com> Cc: Oleg Nesterov <oleg@redhat.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
This commit is contained in:
parent
579c571e2e
commit
99cb0dbd47
11
mm/Kconfig
11
mm/Kconfig
|
@ -712,6 +712,17 @@ config GUP_BENCHMARK
|
||||||
config GUP_GET_PTE_LOW_HIGH
|
config GUP_GET_PTE_LOW_HIGH
|
||||||
bool
|
bool
|
||||||
|
|
||||||
|
config READ_ONLY_THP_FOR_FS
|
||||||
|
bool "Read-only THP for filesystems (EXPERIMENTAL)"
|
||||||
|
depends on TRANSPARENT_HUGE_PAGECACHE && SHMEM
|
||||||
|
|
||||||
|
help
|
||||||
|
Allow khugepaged to put read-only file-backed pages in THP.
|
||||||
|
|
||||||
|
This is marked experimental because it is a new feature. Write
|
||||||
|
support of file THPs will be developed in the next few release
|
||||||
|
cycles.
|
||||||
|
|
||||||
config ARCH_HAS_PTE_SPECIAL
|
config ARCH_HAS_PTE_SPECIAL
|
||||||
bool
|
bool
|
||||||
|
|
||||||
|
|
|
@ -203,8 +203,8 @@ static void unaccount_page_cache_page(struct address_space *mapping,
|
||||||
__mod_node_page_state(page_pgdat(page), NR_SHMEM, -nr);
|
__mod_node_page_state(page_pgdat(page), NR_SHMEM, -nr);
|
||||||
if (PageTransHuge(page))
|
if (PageTransHuge(page))
|
||||||
__dec_node_page_state(page, NR_SHMEM_THPS);
|
__dec_node_page_state(page, NR_SHMEM_THPS);
|
||||||
} else {
|
} else if (PageTransHuge(page)) {
|
||||||
VM_BUG_ON_PAGE(PageTransHuge(page), page);
|
__dec_node_page_state(page, NR_FILE_THPS);
|
||||||
}
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
|
|
|
@ -48,6 +48,7 @@ enum scan_result {
|
||||||
SCAN_CGROUP_CHARGE_FAIL,
|
SCAN_CGROUP_CHARGE_FAIL,
|
||||||
SCAN_EXCEED_SWAP_PTE,
|
SCAN_EXCEED_SWAP_PTE,
|
||||||
SCAN_TRUNCATED,
|
SCAN_TRUNCATED,
|
||||||
|
SCAN_PAGE_HAS_PRIVATE,
|
||||||
};
|
};
|
||||||
|
|
||||||
#define CREATE_TRACE_POINTS
|
#define CREATE_TRACE_POINTS
|
||||||
|
@ -404,7 +405,11 @@ static bool hugepage_vma_check(struct vm_area_struct *vma,
|
||||||
(vm_flags & VM_NOHUGEPAGE) ||
|
(vm_flags & VM_NOHUGEPAGE) ||
|
||||||
test_bit(MMF_DISABLE_THP, &vma->vm_mm->flags))
|
test_bit(MMF_DISABLE_THP, &vma->vm_mm->flags))
|
||||||
return false;
|
return false;
|
||||||
if (shmem_file(vma->vm_file)) {
|
|
||||||
|
if (shmem_file(vma->vm_file) ||
|
||||||
|
(IS_ENABLED(CONFIG_READ_ONLY_THP_FOR_FS) &&
|
||||||
|
vma->vm_file &&
|
||||||
|
(vm_flags & VM_DENYWRITE))) {
|
||||||
if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGE_PAGECACHE))
|
if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGE_PAGECACHE))
|
||||||
return false;
|
return false;
|
||||||
return IS_ALIGNED((vma->vm_start >> PAGE_SHIFT) - vma->vm_pgoff,
|
return IS_ALIGNED((vma->vm_start >> PAGE_SHIFT) - vma->vm_pgoff,
|
||||||
|
@ -456,8 +461,9 @@ int khugepaged_enter_vma_merge(struct vm_area_struct *vma,
|
||||||
unsigned long hstart, hend;
|
unsigned long hstart, hend;
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* khugepaged does not yet work on non-shmem files or special
|
* khugepaged only supports read-only files for non-shmem files.
|
||||||
* mappings. And file-private shmem THP is not supported.
|
* khugepaged does not yet work on special mappings. And
|
||||||
|
* file-private shmem THP is not supported.
|
||||||
*/
|
*/
|
||||||
if (!hugepage_vma_check(vma, vm_flags))
|
if (!hugepage_vma_check(vma, vm_flags))
|
||||||
return 0;
|
return 0;
|
||||||
|
@ -1287,12 +1293,12 @@ static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff)
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* collapse_file - collapse small tmpfs/shmem pages into huge one.
|
* collapse_file - collapse filemap/tmpfs/shmem pages into huge one.
|
||||||
*
|
*
|
||||||
* Basic scheme is simple, details are more complex:
|
* Basic scheme is simple, details are more complex:
|
||||||
* - allocate and lock a new huge page;
|
* - allocate and lock a new huge page;
|
||||||
* - scan page cache replacing old pages with the new one
|
* - scan page cache replacing old pages with the new one
|
||||||
* + swap in pages if necessary;
|
* + swap/gup in pages if necessary;
|
||||||
* + fill in gaps;
|
* + fill in gaps;
|
||||||
* + keep old pages around in case rollback is required;
|
* + keep old pages around in case rollback is required;
|
||||||
* - if replacing succeeds:
|
* - if replacing succeeds:
|
||||||
|
@ -1316,7 +1322,9 @@ static void collapse_file(struct mm_struct *mm,
|
||||||
LIST_HEAD(pagelist);
|
LIST_HEAD(pagelist);
|
||||||
XA_STATE_ORDER(xas, &mapping->i_pages, start, HPAGE_PMD_ORDER);
|
XA_STATE_ORDER(xas, &mapping->i_pages, start, HPAGE_PMD_ORDER);
|
||||||
int nr_none = 0, result = SCAN_SUCCEED;
|
int nr_none = 0, result = SCAN_SUCCEED;
|
||||||
|
bool is_shmem = shmem_file(file);
|
||||||
|
|
||||||
|
VM_BUG_ON(!IS_ENABLED(CONFIG_READ_ONLY_THP_FOR_FS) && !is_shmem);
|
||||||
VM_BUG_ON(start & (HPAGE_PMD_NR - 1));
|
VM_BUG_ON(start & (HPAGE_PMD_NR - 1));
|
||||||
|
|
||||||
/* Only allocate from the target node */
|
/* Only allocate from the target node */
|
||||||
|
@ -1348,6 +1356,7 @@ static void collapse_file(struct mm_struct *mm,
|
||||||
} while (1);
|
} while (1);
|
||||||
|
|
||||||
__SetPageLocked(new_page);
|
__SetPageLocked(new_page);
|
||||||
|
if (is_shmem)
|
||||||
__SetPageSwapBacked(new_page);
|
__SetPageSwapBacked(new_page);
|
||||||
new_page->index = start;
|
new_page->index = start;
|
||||||
new_page->mapping = mapping;
|
new_page->mapping = mapping;
|
||||||
|
@ -1363,10 +1372,12 @@ static void collapse_file(struct mm_struct *mm,
|
||||||
struct page *page = xas_next(&xas);
|
struct page *page = xas_next(&xas);
|
||||||
|
|
||||||
VM_BUG_ON(index != xas.xa_index);
|
VM_BUG_ON(index != xas.xa_index);
|
||||||
|
if (is_shmem) {
|
||||||
if (!page) {
|
if (!page) {
|
||||||
/*
|
/*
|
||||||
* Stop if extent has been truncated or hole-punched,
|
* Stop if extent has been truncated or
|
||||||
* and is now completely empty.
|
* hole-punched, and is now completely
|
||||||
|
* empty.
|
||||||
*/
|
*/
|
||||||
if (index == start) {
|
if (index == start) {
|
||||||
if (!xas_next_entry(&xas, end - 1)) {
|
if (!xas_next_entry(&xas, end - 1)) {
|
||||||
|
@ -1399,6 +1410,38 @@ static void collapse_file(struct mm_struct *mm,
|
||||||
result = SCAN_PAGE_LOCK;
|
result = SCAN_PAGE_LOCK;
|
||||||
goto xa_locked;
|
goto xa_locked;
|
||||||
}
|
}
|
||||||
|
} else { /* !is_shmem */
|
||||||
|
if (!page || xa_is_value(page)) {
|
||||||
|
xas_unlock_irq(&xas);
|
||||||
|
page_cache_sync_readahead(mapping, &file->f_ra,
|
||||||
|
file, index,
|
||||||
|
PAGE_SIZE);
|
||||||
|
/* drain pagevecs to help isolate_lru_page() */
|
||||||
|
lru_add_drain();
|
||||||
|
page = find_lock_page(mapping, index);
|
||||||
|
if (unlikely(page == NULL)) {
|
||||||
|
result = SCAN_FAIL;
|
||||||
|
goto xa_unlocked;
|
||||||
|
}
|
||||||
|
} else if (!PageUptodate(page)) {
|
||||||
|
xas_unlock_irq(&xas);
|
||||||
|
wait_on_page_locked(page);
|
||||||
|
if (!trylock_page(page)) {
|
||||||
|
result = SCAN_PAGE_LOCK;
|
||||||
|
goto xa_unlocked;
|
||||||
|
}
|
||||||
|
get_page(page);
|
||||||
|
} else if (PageDirty(page)) {
|
||||||
|
result = SCAN_FAIL;
|
||||||
|
goto xa_locked;
|
||||||
|
} else if (trylock_page(page)) {
|
||||||
|
get_page(page);
|
||||||
|
xas_unlock_irq(&xas);
|
||||||
|
} else {
|
||||||
|
result = SCAN_PAGE_LOCK;
|
||||||
|
goto xa_locked;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* The page must be locked, so we can drop the i_pages lock
|
* The page must be locked, so we can drop the i_pages lock
|
||||||
|
@ -1426,6 +1469,12 @@ static void collapse_file(struct mm_struct *mm,
|
||||||
goto out_unlock;
|
goto out_unlock;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (page_has_private(page) &&
|
||||||
|
!try_to_release_page(page, GFP_KERNEL)) {
|
||||||
|
result = SCAN_PAGE_HAS_PRIVATE;
|
||||||
|
goto out_unlock;
|
||||||
|
}
|
||||||
|
|
||||||
if (page_mapped(page))
|
if (page_mapped(page))
|
||||||
unmap_mapping_pages(mapping, index, 1, false);
|
unmap_mapping_pages(mapping, index, 1, false);
|
||||||
|
|
||||||
|
@ -1463,12 +1512,18 @@ out_unlock:
|
||||||
goto xa_unlocked;
|
goto xa_unlocked;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (is_shmem)
|
||||||
__inc_node_page_state(new_page, NR_SHMEM_THPS);
|
__inc_node_page_state(new_page, NR_SHMEM_THPS);
|
||||||
|
else
|
||||||
|
__inc_node_page_state(new_page, NR_FILE_THPS);
|
||||||
|
|
||||||
if (nr_none) {
|
if (nr_none) {
|
||||||
struct zone *zone = page_zone(new_page);
|
struct zone *zone = page_zone(new_page);
|
||||||
|
|
||||||
__mod_node_page_state(zone->zone_pgdat, NR_FILE_PAGES, nr_none);
|
__mod_node_page_state(zone->zone_pgdat, NR_FILE_PAGES, nr_none);
|
||||||
__mod_node_page_state(zone->zone_pgdat, NR_SHMEM, nr_none);
|
if (is_shmem)
|
||||||
|
__mod_node_page_state(zone->zone_pgdat,
|
||||||
|
NR_SHMEM, nr_none);
|
||||||
}
|
}
|
||||||
|
|
||||||
xa_locked:
|
xa_locked:
|
||||||
|
@ -1506,10 +1561,15 @@ xa_unlocked:
|
||||||
|
|
||||||
SetPageUptodate(new_page);
|
SetPageUptodate(new_page);
|
||||||
page_ref_add(new_page, HPAGE_PMD_NR - 1);
|
page_ref_add(new_page, HPAGE_PMD_NR - 1);
|
||||||
set_page_dirty(new_page);
|
|
||||||
mem_cgroup_commit_charge(new_page, memcg, false, true);
|
mem_cgroup_commit_charge(new_page, memcg, false, true);
|
||||||
count_memcg_events(memcg, THP_COLLAPSE_ALLOC, 1);
|
|
||||||
|
if (is_shmem) {
|
||||||
|
set_page_dirty(new_page);
|
||||||
lru_cache_add_anon(new_page);
|
lru_cache_add_anon(new_page);
|
||||||
|
} else {
|
||||||
|
lru_cache_add_file(new_page);
|
||||||
|
}
|
||||||
|
count_memcg_events(memcg, THP_COLLAPSE_ALLOC, 1);
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Remove pte page tables, so we can re-fault the page as huge.
|
* Remove pte page tables, so we can re-fault the page as huge.
|
||||||
|
@ -1524,6 +1584,8 @@ xa_unlocked:
|
||||||
/* Something went wrong: roll back page cache changes */
|
/* Something went wrong: roll back page cache changes */
|
||||||
xas_lock_irq(&xas);
|
xas_lock_irq(&xas);
|
||||||
mapping->nrpages -= nr_none;
|
mapping->nrpages -= nr_none;
|
||||||
|
|
||||||
|
if (is_shmem)
|
||||||
shmem_uncharge(mapping->host, nr_none);
|
shmem_uncharge(mapping->host, nr_none);
|
||||||
|
|
||||||
xas_set(&xas, start);
|
xas_set(&xas, start);
|
||||||
|
@ -1607,7 +1669,8 @@ static void khugepaged_scan_file(struct mm_struct *mm,
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (page_count(page) != 1 + page_mapcount(page)) {
|
if (page_count(page) !=
|
||||||
|
1 + page_mapcount(page) + page_has_private(page)) {
|
||||||
result = SCAN_PAGE_COUNT;
|
result = SCAN_PAGE_COUNT;
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
|
@ -1713,11 +1776,13 @@ skip:
|
||||||
VM_BUG_ON(khugepaged_scan.address < hstart ||
|
VM_BUG_ON(khugepaged_scan.address < hstart ||
|
||||||
khugepaged_scan.address + HPAGE_PMD_SIZE >
|
khugepaged_scan.address + HPAGE_PMD_SIZE >
|
||||||
hend);
|
hend);
|
||||||
if (shmem_file(vma->vm_file)) {
|
if (IS_ENABLED(CONFIG_SHMEM) && vma->vm_file) {
|
||||||
struct file *file;
|
struct file *file;
|
||||||
pgoff_t pgoff = linear_page_index(vma,
|
pgoff_t pgoff = linear_page_index(vma,
|
||||||
khugepaged_scan.address);
|
khugepaged_scan.address);
|
||||||
if (!shmem_huge_enabled(vma))
|
|
||||||
|
if (shmem_file(vma->vm_file)
|
||||||
|
&& !shmem_huge_enabled(vma))
|
||||||
goto skip;
|
goto skip;
|
||||||
file = get_file(vma->vm_file);
|
file = get_file(vma->vm_file);
|
||||||
up_read(&mm->mmap_sem);
|
up_read(&mm->mmap_sem);
|
||||||
|
|
|
@ -1189,8 +1189,10 @@ void page_add_file_rmap(struct page *page, bool compound)
|
||||||
}
|
}
|
||||||
if (!atomic_inc_and_test(compound_mapcount_ptr(page)))
|
if (!atomic_inc_and_test(compound_mapcount_ptr(page)))
|
||||||
goto out;
|
goto out;
|
||||||
VM_BUG_ON_PAGE(!PageSwapBacked(page), page);
|
if (PageSwapBacked(page))
|
||||||
__inc_node_page_state(page, NR_SHMEM_PMDMAPPED);
|
__inc_node_page_state(page, NR_SHMEM_PMDMAPPED);
|
||||||
|
else
|
||||||
|
__inc_node_page_state(page, NR_FILE_PMDMAPPED);
|
||||||
} else {
|
} else {
|
||||||
if (PageTransCompound(page) && page_mapping(page)) {
|
if (PageTransCompound(page) && page_mapping(page)) {
|
||||||
VM_WARN_ON_ONCE(!PageLocked(page));
|
VM_WARN_ON_ONCE(!PageLocked(page));
|
||||||
|
@ -1229,8 +1231,10 @@ static void page_remove_file_rmap(struct page *page, bool compound)
|
||||||
}
|
}
|
||||||
if (!atomic_add_negative(-1, compound_mapcount_ptr(page)))
|
if (!atomic_add_negative(-1, compound_mapcount_ptr(page)))
|
||||||
goto out;
|
goto out;
|
||||||
VM_BUG_ON_PAGE(!PageSwapBacked(page), page);
|
if (PageSwapBacked(page))
|
||||||
__dec_node_page_state(page, NR_SHMEM_PMDMAPPED);
|
__dec_node_page_state(page, NR_SHMEM_PMDMAPPED);
|
||||||
|
else
|
||||||
|
__dec_node_page_state(page, NR_FILE_PMDMAPPED);
|
||||||
} else {
|
} else {
|
||||||
if (!atomic_add_negative(-1, &page->_mapcount))
|
if (!atomic_add_negative(-1, &page->_mapcount))
|
||||||
goto out;
|
goto out;
|
||||||
|
|
Loading…
Reference in New Issue