Merge tag 'mm-stable-2022-05-27' of git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm

Pull more MM updates from Andrew Morton:

 - Two follow-on fixes for the post-5.19 series "Use pageblock_order for
   cma and alloc_contig_range alignment", from Zi Yan

 - A series of z3fold cleanups and fixes from Miaohe Lin

 - Some memcg selftests work from Michal Koutný <mkoutny@suse.com>

 - Some swap fixes and cleanups from Miaohe Lin

 - Several individual minor fixups

* tag 'mm-stable-2022-05-27' of git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm: (25 commits)
  mm/shmem.c: suppress shift warning
  mm: Kconfig: reorganize misplaced mm options
  mm: kasan: fix input of vmalloc_to_page()
  mm: fix is_pinnable_page against a cma page
  mm: filter out swapin error entry in shmem mapping
  mm/shmem: fix infinite loop when swap in shmem error at swapoff time
  mm/madvise: free hwpoison and swapin error entry in madvise_free_pte_range
  mm/swapfile: fix lost swap bits in unuse_pte()
  mm/swapfile: unuse_pte can map random data if swap read fails
  selftests: memcg: factor out common parts of memory.{low,min} tests
  selftests: memcg: remove protection from top level memcg
  selftests: memcg: adjust expected reclaim values of protected cgroups
  selftests: memcg: expect no low events in unprotected sibling
  selftests: memcg: fix compilation
  mm/z3fold: fix z3fold_page_migrate races with z3fold_map
  mm/z3fold: fix z3fold_reclaim_page races with z3fold_free
  mm/z3fold: always clear PAGE_CLAIMED under z3fold page lock
  mm/z3fold: put z3fold page back into unbuddied list when reclaim or migration fails
  revert "mm/z3fold.c: allow __GFP_HIGHMEM in z3fold_alloc"
  mm/z3fold: throw warning on failure of trylock_page in z3fold_alloc
  ...
commit 8291eaafed
MAINTAINERS

@@ -5062,6 +5062,7 @@ L: linux-mm@kvack.org
S: Maintained
F: mm/memcontrol.c
F: mm/swap_cgroup.c
F: tools/testing/selftests/cgroup/memcg_protection.m
F: tools/testing/selftests/cgroup/test_kmem.c
F: tools/testing/selftests/cgroup/test_memcontrol.c
include/linux/mm.h

@@ -1594,8 +1594,13 @@ static inline bool page_needs_cow_for_dma(struct vm_area_struct *vma,
#ifdef CONFIG_MIGRATION
static inline bool is_pinnable_page(struct page *page)
{
	return !(is_zone_movable_page(page) || is_migrate_cma_page(page)) ||
		is_zero_pfn(page_to_pfn(page));
#ifdef CONFIG_CMA
	int mt = get_pageblock_migratetype(page);

	if (mt == MIGRATE_CMA || mt == MIGRATE_ISOLATE)
		return false;
#endif
	return !(is_zone_movable_page(page) || is_zero_pfn(page_to_pfn(page)));
}
#else
static inline bool is_pinnable_page(struct page *page)
include/linux/swap.h

@@ -55,6 +55,10 @@ static inline int current_is_kswapd(void)
 * actions on faults.
 */

#define SWP_SWAPIN_ERROR_NUM 1
#define SWP_SWAPIN_ERROR	(MAX_SWAPFILES + SWP_HWPOISON_NUM + \
				 SWP_MIGRATION_NUM + SWP_DEVICE_NUM + \
				 SWP_PTE_MARKER_NUM)
/*
 * PTE markers are used to persist information onto PTEs that are mapped with
 * file-backed memories. As its name "PTE" hints, it should only be applied to

@@ -120,7 +124,8 @@ static inline int current_is_kswapd(void)

#define MAX_SWAPFILES \
	((1 << MAX_SWAPFILES_SHIFT) - SWP_DEVICE_NUM - \
	SWP_MIGRATION_NUM - SWP_HWPOISON_NUM - SWP_PTE_MARKER_NUM)
	SWP_MIGRATION_NUM - SWP_HWPOISON_NUM - \
	SWP_PTE_MARKER_NUM - SWP_SWAPIN_ERROR_NUM)

/*
 * Magic header for a swap area. The first part of the union is
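The arithmetic above is easy to sanity-check by hand. A minimal sketch follows; the numeric values (MAX_SWAPFILES_SHIFT and the per-feature counts) are assumptions for a 5.19-era build with device-private, migration, memory-failure and PTE-marker support all enabled, not something stated in this diff — with those features disabled the corresponding counts drop to zero and MAX_SWAPFILES grows back.

/*
 * Worked example for the new MAX_SWAPFILES definition.  All numeric values
 * below are assumed for one particular configuration (see note above).
 */
#include <stdio.h>

#define MAX_SWAPFILES_SHIFT	5	/* assumed: 32 encodable swap types */
#define SWP_DEVICE_NUM		4	/* assumed: CONFIG_DEVICE_PRIVATE   */
#define SWP_MIGRATION_NUM	3	/* assumed: CONFIG_MIGRATION        */
#define SWP_HWPOISON_NUM	1	/* assumed: CONFIG_MEMORY_FAILURE   */
#define SWP_PTE_MARKER_NUM	1	/* assumed: CONFIG_PTE_MARKER       */
#define SWP_SWAPIN_ERROR_NUM	1	/* the type reserved by this series */

#define MAX_SWAPFILES \
	((1 << MAX_SWAPFILES_SHIFT) - SWP_DEVICE_NUM - \
	 SWP_MIGRATION_NUM - SWP_HWPOISON_NUM - \
	 SWP_PTE_MARKER_NUM - SWP_SWAPIN_ERROR_NUM)

int main(void)
{
	/* 32 - 4 - 3 - 1 - 1 - 1 = 22 swap devices remain addressable */
	printf("MAX_SWAPFILES = %d\n", MAX_SWAPFILES);
	return 0;
}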
include/linux/swapops.h

@@ -108,6 +108,16 @@ static inline void *swp_to_radix_entry(swp_entry_t entry)
	return xa_mk_value(entry.val);
}

static inline swp_entry_t make_swapin_error_entry(struct page *page)
{
	return swp_entry(SWP_SWAPIN_ERROR, page_to_pfn(page));
}

static inline int is_swapin_error_entry(swp_entry_t entry)
{
	return swp_type(entry) == SWP_SWAPIN_ERROR;
}

#if IS_ENABLED(CONFIG_DEVICE_PRIVATE)
static inline swp_entry_t make_readable_device_private_entry(pgoff_t offset)
{
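The two helpers above are the whole user-visible surface of the new entry type: an error entry is an ordinary swp_entry_t whose type is SWP_SWAPIN_ERROR and whose offset carries the page's pfn. The sketch below is a stand-alone toy model of that round trip; the demo_* names, the 6/58 bit split and the assumption of a 64-bit unsigned long are illustrative only, not the kernel's arch-specific PTE encoding.

#include <assert.h>

typedef struct { unsigned long val; } demo_swp_entry_t;	/* stand-in for swp_entry_t */

#define DEMO_OFFSET_BITS	58	/* assumed split: 6 type bits, 58 offset bits */
#define DEMO_SWP_SWAPIN_ERROR	29UL	/* some type value reserved for errors */

static demo_swp_entry_t demo_swp_entry(unsigned long type, unsigned long offset)
{
	return (demo_swp_entry_t){ (type << DEMO_OFFSET_BITS) |
				   (offset & ((1UL << DEMO_OFFSET_BITS) - 1)) };
}

static unsigned long demo_swp_type(demo_swp_entry_t entry)
{
	return entry.val >> DEMO_OFFSET_BITS;
}

static demo_swp_entry_t demo_make_swapin_error_entry(unsigned long pfn)
{
	return demo_swp_entry(DEMO_SWP_SWAPIN_ERROR, pfn);
}

static int demo_is_swapin_error_entry(demo_swp_entry_t entry)
{
	return demo_swp_type(entry) == DEMO_SWP_SWAPIN_ERROR;
}

int main(void)
{
	demo_swp_entry_t entry = demo_make_swapin_error_entry(0x1234);

	/* the reserved type survives the round trip and identifies the error */
	assert(demo_is_swapin_error_entry(entry));
	return 0;
}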
init/Kconfig

@@ -1842,60 +1842,6 @@ config DEBUG_PERF_USE_VMALLOC

endmenu

config VM_EVENT_COUNTERS
	default y
	bool "Enable VM event counters for /proc/vmstat" if EXPERT
	help
	  VM event counters are needed for event counts to be shown.
	  This option allows the disabling of the VM event counters
	  on EXPERT systems. /proc/vmstat will only show page counts
	  if VM event counters are disabled.

config SLUB_DEBUG
	default y
	bool "Enable SLUB debugging support" if EXPERT
	depends on SLUB && SYSFS
	select STACKDEPOT if STACKTRACE_SUPPORT
	help
	  SLUB has extensive debug support features. Disabling these can
	  result in significant savings in code size. This also disables
	  SLUB sysfs support. /sys/slab will not exist and there will be
	  no support for cache validation etc.

config COMPAT_BRK
	bool "Disable heap randomization"
	default y
	help
	  Randomizing heap placement makes heap exploits harder, but it
	  also breaks ancient binaries (including anything libc5 based).
	  This option changes the bootup default to heap randomization
	  disabled, and can be overridden at runtime by setting
	  /proc/sys/kernel/randomize_va_space to 2.

	  On non-ancient distros (post-2000 ones) N is usually a safe choice.

config MMAP_ALLOW_UNINITIALIZED
	bool "Allow mmapped anonymous memory to be uninitialized"
	depends on EXPERT && !MMU
	default n
	help
	  Normally, and according to the Linux spec, anonymous memory obtained
	  from mmap() has its contents cleared before it is passed to
	  userspace. Enabling this config option allows you to request that
	  mmap() skip that if it is given an MAP_UNINITIALIZED flag, thus
	  providing a huge performance boost. If this option is not enabled,
	  then the flag will be ignored.

	  This is taken advantage of by uClibc's malloc(), and also by
	  ELF-FDPIC binfmt's brk and stack allocator.

	  Because of the obvious security issues, this option should only be
	  enabled on embedded devices where you control what is run in
	  userspace. Since that isn't generally a problem on no-MMU systems,
	  it is normally safe to say Y here.

	  See Documentation/admin-guide/mm/nommu-mmap.rst for more information.

config SYSTEM_DATA_VERIFICATION
	def_bool n
	select SYSTEM_TRUSTED_KEYRING
lib/Kconfig.debug

@@ -699,41 +699,6 @@ config DEBUG_OBJECTS_ENABLE_DEFAULT
	help
	  Debug objects boot parameter default value

config DEBUG_SLAB
	bool "Debug slab memory allocations"
	depends on DEBUG_KERNEL && SLAB
	help
	  Say Y here to have the kernel do limited verification on memory
	  allocation as well as poisoning memory on free to catch use of freed
	  memory. This can make kmalloc/kfree-intensive workloads much slower.

config SLUB_DEBUG_ON
	bool "SLUB debugging on by default"
	depends on SLUB && SLUB_DEBUG
	select STACKDEPOT_ALWAYS_INIT if STACKTRACE_SUPPORT
	default n
	help
	  Boot with debugging on by default. SLUB boots by default with
	  the runtime debug capabilities switched off. Enabling this is
	  equivalent to specifying the "slub_debug" parameter on boot.
	  There is no support for more fine grained debug control like
	  possible with slub_debug=xxx. SLUB debugging may be switched
	  off in a kernel built with CONFIG_SLUB_DEBUG_ON by specifying
	  "slub_debug=-".

config SLUB_STATS
	default n
	bool "Enable SLUB performance statistics"
	depends on SLUB && SYSFS
	help
	  SLUB statistics are useful to debug SLUBs allocation behavior in
	  order find ways to optimize the allocator. This should never be
	  enabled for production use since keeping statistics slows down
	  the allocator by a few percentage points. The slabinfo command
	  supports the determination of the most active slabs to figure
	  out which slabs are relevant to a particular load.
	  Try running: slabinfo -DA

config HAVE_DEBUG_KMEMLEAK
	bool
mm/Kconfig

@@ -270,6 +270,19 @@ config SLAB_FREELIST_HARDENED
	  sanity-checking than others. This option is most effective with
	  CONFIG_SLUB.

config SLUB_STATS
	default n
	bool "Enable SLUB performance statistics"
	depends on SLUB && SYSFS
	help
	  SLUB statistics are useful to debug SLUBs allocation behavior in
	  order find ways to optimize the allocator. This should never be
	  enabled for production use since keeping statistics slows down
	  the allocator by a few percentage points. The slabinfo command
	  supports the determination of the most active slabs to figure
	  out which slabs are relevant to a particular load.
	  Try running: slabinfo -DA

config SLUB_CPU_PARTIAL
	default y
	depends on SLUB && SMP

@@ -307,6 +320,40 @@ config SHUFFLE_PAGE_ALLOCATOR

	  Say Y if unsure.

config COMPAT_BRK
	bool "Disable heap randomization"
	default y
	help
	  Randomizing heap placement makes heap exploits harder, but it
	  also breaks ancient binaries (including anything libc5 based).
	  This option changes the bootup default to heap randomization
	  disabled, and can be overridden at runtime by setting
	  /proc/sys/kernel/randomize_va_space to 2.

	  On non-ancient distros (post-2000 ones) N is usually a safe choice.

config MMAP_ALLOW_UNINITIALIZED
	bool "Allow mmapped anonymous memory to be uninitialized"
	depends on EXPERT && !MMU
	default n
	help
	  Normally, and according to the Linux spec, anonymous memory obtained
	  from mmap() has its contents cleared before it is passed to
	  userspace. Enabling this config option allows you to request that
	  mmap() skip that if it is given an MAP_UNINITIALIZED flag, thus
	  providing a huge performance boost. If this option is not enabled,
	  then the flag will be ignored.

	  This is taken advantage of by uClibc's malloc(), and also by
	  ELF-FDPIC binfmt's brk and stack allocator.

	  Because of the obvious security issues, this option should only be
	  enabled on embedded devices where you control what is run in
	  userspace. Since that isn't generally a problem on no-MMU systems,
	  it is normally safe to say Y here.

	  See Documentation/admin-guide/mm/nommu-mmap.rst for more information.

config SELECT_MEMORY_MODEL
	def_bool y
	depends on ARCH_SELECT_MEMORY_MODEL

@@ -964,6 +1011,15 @@ config ARCH_USES_HIGH_VMA_FLAGS
config ARCH_HAS_PKEYS
	bool

config VM_EVENT_COUNTERS
	default y
	bool "Enable VM event counters for /proc/vmstat" if EXPERT
	help
	  VM event counters are needed for event counts to be shown.
	  This option allows the disabling of the VM event counters
	  on EXPERT systems. /proc/vmstat will only show page counts
	  if VM event counters are disabled.

config PERCPU_STATS
	bool "Collect percpu memory statistics"
	help
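The MMAP_ALLOW_UNINITIALIZED help text above describes the userspace side: pass MAP_UNINITIALIZED to mmap() and, on a no-MMU kernel built with this option, the anonymous mapping may come back with stale contents. A minimal hedged sketch of such a caller follows; the fallback value for MAP_UNINITIALIZED is an assumption taken from the generic uapi headers, and on kernels without the option the flag is simply ignored and the memory arrives zeroed.

#include <stdio.h>
#include <stdlib.h>
#include <sys/mman.h>

#ifndef MAP_UNINITIALIZED
#define MAP_UNINITIALIZED 0x4000000	/* assumed value, see <linux/mman.h> */
#endif

int main(void)
{
	size_t len = 1 << 20;
	void *buf = mmap(NULL, len, PROT_READ | PROT_WRITE,
			 MAP_PRIVATE | MAP_ANONYMOUS | MAP_UNINITIALIZED, -1, 0);

	if (buf == MAP_FAILED) {
		perror("mmap");
		return EXIT_FAILURE;
	}
	/* contents are only potentially uninitialized on no-MMU kernels
	 * built with CONFIG_MMAP_ALLOW_UNINITIALIZED */
	munmap(buf, len);
	return 0;
}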
mm/Kconfig.debug

@@ -45,6 +45,39 @@ config DEBUG_PAGEALLOC_ENABLE_DEFAULT
	  Enable debug page memory allocations by default? This value
	  can be overridden by debug_pagealloc=off|on.

config DEBUG_SLAB
	bool "Debug slab memory allocations"
	depends on DEBUG_KERNEL && SLAB
	help
	  Say Y here to have the kernel do limited verification on memory
	  allocation as well as poisoning memory on free to catch use of freed
	  memory. This can make kmalloc/kfree-intensive workloads much slower.

config SLUB_DEBUG
	default y
	bool "Enable SLUB debugging support" if EXPERT
	depends on SLUB && SYSFS
	select STACKDEPOT if STACKTRACE_SUPPORT
	help
	  SLUB has extensive debug support features. Disabling these can
	  result in significant savings in code size. This also disables
	  SLUB sysfs support. /sys/slab will not exist and there will be
	  no support for cache validation etc.

config SLUB_DEBUG_ON
	bool "SLUB debugging on by default"
	depends on SLUB && SLUB_DEBUG
	select STACKDEPOT_ALWAYS_INIT if STACKTRACE_SUPPORT
	default n
	help
	  Boot with debugging on by default. SLUB boots by default with
	  the runtime debug capabilities switched off. Enabling this is
	  equivalent to specifying the "slub_debug" parameter on boot.
	  There is no support for more fine grained debug control like
	  possible with slub_debug=xxx. SLUB debugging may be switched
	  off in a kernel built with CONFIG_SLUB_DEBUG_ON by specifying
	  "slub_debug=-".

config PAGE_OWNER
	bool "Track page owner"
	depends on DEBUG_KERNEL && STACKTRACE_SUPPORT
mm/internal.h

@@ -374,8 +374,8 @@ extern void *memmap_alloc(phys_addr_t size, phys_addr_t align,
				phys_addr_t min_addr,
				int nid, bool exact_nid);

void split_free_page(struct page *free_page,
				int order, unsigned long split_pfn_offset);
int split_free_page(struct page *free_page,
			unsigned int order, unsigned long split_pfn_offset);

#if defined CONFIG_COMPACTION || defined CONFIG_CMA
mm/kasan/report.c

@@ -347,7 +347,7 @@ static void print_address_description(void *addr, u8 tag)
			 va->addr, va->addr + va->size, va->caller);
		pr_err("\n");

		page = vmalloc_to_page(page);
		page = vmalloc_to_page(addr);
	}
}
mm/madvise.c

@@ -248,10 +248,13 @@ static void force_shm_swapin_readahead(struct vm_area_struct *vma,

		if (!xa_is_value(page))
			continue;
		swap = radix_to_swp_entry(page);
		/* There might be swapin error entries in shmem mapping. */
		if (non_swap_entry(swap))
			continue;
		xas_pause(&xas);
		rcu_read_unlock();

		swap = radix_to_swp_entry(page);
		page = read_swap_cache_async(swap, GFP_HIGHUSER_MOVABLE,
					     NULL, 0, false, &splug);
		if (page)

@@ -624,11 +627,14 @@ static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr,
			swp_entry_t entry;

			entry = pte_to_swp_entry(ptent);
			if (non_swap_entry(entry))
				continue;
			nr_swap--;
			free_swap_and_cache(entry);
			pte_clear_not_present_full(mm, addr, pte, tlb->fullmm);
			if (!non_swap_entry(entry)) {
				nr_swap--;
				free_swap_and_cache(entry);
				pte_clear_not_present_full(mm, addr, pte, tlb->fullmm);
			} else if (is_hwpoison_entry(entry) ||
				   is_swapin_error_entry(entry)) {
				pte_clear_not_present_full(mm, addr, pte, tlb->fullmm);
			}
			continue;
		}
mm/memory.c

@@ -1487,7 +1487,8 @@ again:
				/* Only drop the uffd-wp marker if explicitly requested */
				if (!zap_drop_file_uffd_wp(details))
					continue;
			} else if (is_hwpoison_entry(entry)) {
			} else if (is_hwpoison_entry(entry) ||
				   is_swapin_error_entry(entry)) {
				if (!should_zap_cows(details))
					continue;
			} else {

@@ -3727,6 +3728,8 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
			ret = vmf->page->pgmap->ops->migrate_to_ram(vmf);
		} else if (is_hwpoison_entry(entry)) {
			ret = VM_FAULT_HWPOISON;
		} else if (is_swapin_error_entry(entry)) {
			ret = VM_FAULT_SIGBUS;
		} else if (is_pte_marker_entry(entry)) {
			ret = handle_pte_marker(vmf);
		} else {
mm/page_alloc.c

@@ -482,8 +482,12 @@ unsigned long __get_pfnblock_flags_mask(const struct page *page,
	bitidx = pfn_to_bitidx(page, pfn);
	word_bitidx = bitidx / BITS_PER_LONG;
	bitidx &= (BITS_PER_LONG-1);

	word = bitmap[word_bitidx];
	/*
	 * This races, without locks, with set_pfnblock_flags_mask(). Ensure
	 * a consistent read of the memory array, so that results, even though
	 * racy, are not corrupted.
	 */
	word = READ_ONCE(bitmap[word_bitidx]);
	return (word >> bitidx) & mask;
}
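The comment added above spells out why the lockless reader now goes through READ_ONCE(): the bitmap word may be rewritten concurrently by set_pfnblock_flags_mask(), and the reader only needs a single, non-torn load rather than a locked one. The sketch below shows that pattern outside the kernel; the macro definitions are simplified userspace stand-ins for the kernel's <linux/compiler.h> versions, and the pthread scaffolding is purely illustrative.

#include <pthread.h>
#include <stdio.h>

#define WRITE_ONCE(x, val)	(*(volatile __typeof__(x) *)&(x) = (val))
#define READ_ONCE(x)		(*(volatile const __typeof__(x) *)&(x))

static unsigned long pageblock_flags;	/* stands in for bitmap[word_bitidx] */

static void *writer(void *arg)
{
	(void)arg;
	for (unsigned long i = 0; i < 1000000; i++)
		WRITE_ONCE(pageblock_flags, i);	/* the set_pfnblock_flags_mask() side */
	return NULL;
}

int main(void)
{
	pthread_t t;

	pthread_create(&t, NULL, writer, NULL);
	/* the __get_pfnblock_flags_mask() side: racy, but a single untorn load */
	for (int i = 0; i < 1000000; i++) {
		unsigned long word = READ_ONCE(pageblock_flags);
		(void)word;
	}
	pthread_join(t, NULL);
	puts("done");
	return 0;
}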
@@ -1100,30 +1104,44 @@ done_merging:
 * @order:		the order of the page
 * @split_pfn_offset:	split offset within the page
 *
 * Return -ENOENT if the free page is changed, otherwise 0
 *
 * It is used when the free page crosses two pageblocks with different migratetypes
 * at split_pfn_offset within the page. The split free page will be put into
 * separate migratetype lists afterwards. Otherwise, the function achieves
 * nothing.
 */
void split_free_page(struct page *free_page,
				int order, unsigned long split_pfn_offset)
int split_free_page(struct page *free_page,
			unsigned int order, unsigned long split_pfn_offset)
{
	struct zone *zone = page_zone(free_page);
	unsigned long free_page_pfn = page_to_pfn(free_page);
	unsigned long pfn;
	unsigned long flags;
	int free_page_order;
	int mt;
	int ret = 0;

	if (split_pfn_offset == 0)
		return;
		return ret;

	spin_lock_irqsave(&zone->lock, flags);

	if (!PageBuddy(free_page) || buddy_order(free_page) != order) {
		ret = -ENOENT;
		goto out;
	}

	mt = get_pageblock_migratetype(free_page);
	if (likely(!is_migrate_isolate(mt)))
		__mod_zone_freepage_state(zone, -(1UL << order), mt);

	del_page_from_free_list(free_page, zone, order);
	for (pfn = free_page_pfn;
	     pfn < free_page_pfn + (1UL << order);) {
		int mt = get_pfnblock_migratetype(pfn_to_page(pfn), pfn);

		free_page_order = min_t(int,
		free_page_order = min_t(unsigned int,
					pfn ? __ffs(pfn) : order,
					__fls(split_pfn_offset));
		__free_one_page(pfn_to_page(pfn), pfn, zone, free_page_order,

@@ -1134,7 +1152,9 @@ void split_free_page(struct page *free_page,
		if (split_pfn_offset == 0)
			split_pfn_offset = (1UL << order) - (pfn - free_page_pfn);
	}
out:
	spin_unlock_irqrestore(&zone->lock, flags);
	return ret;
}
/*
 * A bad page could be due to a number of fields. Instead of multiple branches,
mm/page_isolation.c

@@ -300,7 +300,7 @@ __first_valid_page(unsigned long pfn, unsigned long nr_pages)
 * the in-use page then splitting the free page.
 */
static int isolate_single_pageblock(unsigned long boundary_pfn, int flags,
			gfp_t gfp_flags, bool isolate_before)
			gfp_t gfp_flags, bool isolate_before, bool skip_isolation)
{
	unsigned char saved_mt;
	unsigned long start_pfn;

@@ -327,11 +327,16 @@ static int isolate_single_pageblock(unsigned long boundary_pfn, int flags,
				      zone->zone_start_pfn);

	saved_mt = get_pageblock_migratetype(pfn_to_page(isolate_pageblock));
	ret = set_migratetype_isolate(pfn_to_page(isolate_pageblock), saved_mt, flags,
			isolate_pageblock, isolate_pageblock + pageblock_nr_pages);

	if (ret)
		return ret;
	if (skip_isolation)
		VM_BUG_ON(!is_migrate_isolate(saved_mt));
	else {
		ret = set_migratetype_isolate(pfn_to_page(isolate_pageblock), saved_mt, flags,
				isolate_pageblock, isolate_pageblock + pageblock_nr_pages);

		if (ret)
			return ret;
	}

	/*
	 * Bail out early when the to-be-isolated pageblock does not form

@@ -366,9 +371,13 @@ static int isolate_single_pageblock(unsigned long boundary_pfn, int flags,
		if (PageBuddy(page)) {
			int order = buddy_order(page);

			if (pfn + (1UL << order) > boundary_pfn)
				split_free_page(page, order, boundary_pfn - pfn);
			pfn += (1UL << order);
			if (pfn + (1UL << order) > boundary_pfn) {
				/* free page changed before split, check it again */
				if (split_free_page(page, order, boundary_pfn - pfn))
					continue;
			}

			pfn += 1UL << order;
			continue;
		}
		/*

@@ -463,7 +472,8 @@ static int isolate_single_pageblock(unsigned long boundary_pfn, int flags,
	return 0;
failed:
	/* restore the original migratetype */
	unset_migratetype_isolate(pfn_to_page(isolate_pageblock), saved_mt);
	if (!skip_isolation)
		unset_migratetype_isolate(pfn_to_page(isolate_pageblock), saved_mt);
	return -EBUSY;
}

@@ -522,14 +532,18 @@ int start_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn,
	unsigned long isolate_start = ALIGN_DOWN(start_pfn, pageblock_nr_pages);
	unsigned long isolate_end = ALIGN(end_pfn, pageblock_nr_pages);
	int ret;
	bool skip_isolation = false;

	/* isolate [isolate_start, isolate_start + pageblock_nr_pages) pageblock */
	ret = isolate_single_pageblock(isolate_start, flags, gfp_flags, false);
	ret = isolate_single_pageblock(isolate_start, flags, gfp_flags, false, skip_isolation);
	if (ret)
		return ret;

	if (isolate_start == isolate_end - pageblock_nr_pages)
		skip_isolation = true;

	/* isolate [isolate_end - pageblock_nr_pages, isolate_end) pageblock */
	ret = isolate_single_pageblock(isolate_end, flags, gfp_flags, true);
	ret = isolate_single_pageblock(isolate_end, flags, gfp_flags, true, skip_isolation);
	if (ret) {
		unset_migratetype_isolate(pfn_to_page(isolate_start), migratetype);
		return ret;
mm/shmem.c

@@ -1174,6 +1174,10 @@ static int shmem_find_swap_entries(struct address_space *mapping,
			continue;

		entry = radix_to_swp_entry(folio);
		/*
		 * swapin error entries can be found in the mapping. But they're
		 * deliberately ignored here as we've done everything we can do.
		 */
		if (swp_type(entry) != type)
			continue;

@@ -1671,6 +1675,36 @@ static int shmem_replace_page(struct page **pagep, gfp_t gfp,
	return error;
}

static void shmem_set_folio_swapin_error(struct inode *inode, pgoff_t index,
					 struct folio *folio, swp_entry_t swap)
{
	struct address_space *mapping = inode->i_mapping;
	struct shmem_inode_info *info = SHMEM_I(inode);
	swp_entry_t swapin_error;
	void *old;

	swapin_error = make_swapin_error_entry(&folio->page);
	old = xa_cmpxchg_irq(&mapping->i_pages, index,
			     swp_to_radix_entry(swap),
			     swp_to_radix_entry(swapin_error), 0);
	if (old != swp_to_radix_entry(swap))
		return;

	folio_wait_writeback(folio);
	delete_from_swap_cache(&folio->page);
	spin_lock_irq(&info->lock);
	/*
	 * Don't treat swapin error folio as alloced. Otherwise inode->i_blocks won't
	 * be 0 when inode is released and thus trigger WARN_ON(inode->i_blocks) in
	 * shmem_evict_inode.
	 */
	info->alloced--;
	info->swapped--;
	shmem_recalc_inode(inode);
	spin_unlock_irq(&info->lock);
	swap_free(swap);
}

/*
 * Swap in the page pointed to by *pagep.
 * Caller has to make sure that *pagep contains a valid swapped page.

@@ -1694,6 +1728,9 @@ static int shmem_swapin_folio(struct inode *inode, pgoff_t index,
	swap = radix_to_swp_entry(*foliop);
	*foliop = NULL;

	if (is_swapin_error_entry(swap))
		return -EIO;

	/* Look it up and read it in.. */
	page = lookup_swap_cache(swap, NULL, 0);
	if (!page) {

@@ -1761,6 +1798,8 @@ static int shmem_swapin_folio(struct inode *inode, pgoff_t index,
failed:
	if (!shmem_confirm_swap(mapping, index, swap))
		error = -EEXIST;
	if (error == -EIO)
		shmem_set_folio_swapin_error(inode, index, folio, swap);
unlock:
	if (folio) {
		folio_unlock(folio);

@@ -1906,7 +1945,7 @@ alloc_nohuge:

	spin_lock_irq(&info->lock);
	info->alloced += folio_nr_pages(folio);
	inode->i_blocks += BLOCKS_PER_PAGE << folio_order(folio);
	inode->i_blocks += (blkcnt_t)BLOCKS_PER_PAGE << folio_order(folio);
	shmem_recalc_inode(inode);
	spin_unlock_irq(&info->lock);
	alloced = true;
mm/swap_state.c

@@ -410,6 +410,9 @@ struct page *find_get_incore_page(struct address_space *mapping, pgoff_t index)
		return NULL;

	swp = radix_to_swp_entry(page);
	/* There might be swapin error entries in shmem mapping. */
	if (non_swap_entry(swp))
		return NULL;
	/* Prevent swapoff from happening to us */
	si = get_swap_device(swp);
	if (!si)
mm/swapfile.c

@@ -1775,7 +1775,7 @@ static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd,
{
	struct page *swapcache;
	spinlock_t *ptl;
	pte_t *pte;
	pte_t *pte, new_pte;
	int ret = 1;

	swapcache = page;

@@ -1789,6 +1789,17 @@ static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd,
		goto out;
	}

	if (unlikely(!PageUptodate(page))) {
		pte_t pteval;

		dec_mm_counter(vma->vm_mm, MM_SWAPENTS);
		pteval = swp_entry_to_pte(make_swapin_error_entry(page));
		set_pte_at(vma->vm_mm, addr, pte, pteval);
		swap_free(entry);
		ret = 0;
		goto out;
	}

	/* See do_swap_page() */
	BUG_ON(!PageAnon(page) && PageMappedToDisk(page));
	BUG_ON(PageAnon(page) && PageAnonExclusive(page));

@@ -1813,8 +1824,12 @@ static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd,
		page_add_new_anon_rmap(page, vma, addr);
		lru_cache_add_inactive_or_unevictable(page, vma);
	}
	set_pte_at(vma->vm_mm, addr, pte,
		   pte_mkold(mk_pte(page, vma->vm_page_prot)));
	new_pte = pte_mkold(mk_pte(page, vma->vm_page_prot));
	if (pte_swp_soft_dirty(*pte))
		new_pte = pte_mksoft_dirty(new_pte);
	if (pte_swp_uffd_wp(*pte))
		new_pte = pte_mkuffd_wp(new_pte);
	set_pte_at(vma->vm_mm, addr, pte, new_pte);
	swap_free(entry);
out:
	pte_unmap_unlock(pte, ptl);
mm/z3fold.c

@@ -181,6 +181,7 @@ enum z3fold_page_flags {
	NEEDS_COMPACTING,
	PAGE_STALE,
	PAGE_CLAIMED, /* by either reclaim or free */
	PAGE_MIGRATED, /* page is migrated and soon to be released */
};

/*

@@ -212,10 +213,8 @@ static int size_to_chunks(size_t size)
static inline struct z3fold_buddy_slots *alloc_slots(struct z3fold_pool *pool,
							gfp_t gfp)
{
	struct z3fold_buddy_slots *slots;

	slots = kmem_cache_zalloc(pool->c_handle,
				  (gfp & ~(__GFP_HIGHMEM | __GFP_MOVABLE)));
	struct z3fold_buddy_slots *slots = kmem_cache_zalloc(pool->c_handle,
							     gfp);

	if (slots) {
		/* It will be freed separately in free_handle(). */

@@ -272,8 +271,13 @@ static inline struct z3fold_header *get_z3fold_header(unsigned long handle)
			zhdr = (struct z3fold_header *)(addr & PAGE_MASK);
			locked = z3fold_page_trylock(zhdr);
			read_unlock(&slots->lock);
			if (locked)
				break;
			if (locked) {
				struct page *page = virt_to_page(zhdr);

				if (!test_bit(PAGE_MIGRATED, &page->private))
					break;
				z3fold_page_unlock(zhdr);
			}
			cpu_relax();
		} while (true);
	} else {

@@ -391,6 +395,7 @@ static struct z3fold_header *init_z3fold_page(struct page *page, bool headless,
	clear_bit(NEEDS_COMPACTING, &page->private);
	clear_bit(PAGE_STALE, &page->private);
	clear_bit(PAGE_CLAIMED, &page->private);
	clear_bit(PAGE_MIGRATED, &page->private);
	if (headless)
		return zhdr;

@@ -521,13 +526,6 @@ static void __release_z3fold_page(struct z3fold_header *zhdr, bool locked)
	atomic64_dec(&pool->pages_nr);
}

static void release_z3fold_page(struct kref *ref)
{
	struct z3fold_header *zhdr = container_of(ref, struct z3fold_header,
						refcount);
	__release_z3fold_page(zhdr, false);
}

static void release_z3fold_page_locked(struct kref *ref)
{
	struct z3fold_header *zhdr = container_of(ref, struct z3fold_header,

@@ -940,10 +938,19 @@ lookup:
		}
	}

	if (zhdr && !zhdr->slots)
		zhdr->slots = alloc_slots(pool,
					can_sleep ? GFP_NOIO : GFP_ATOMIC);
	if (zhdr && !zhdr->slots) {
		zhdr->slots = alloc_slots(pool, GFP_ATOMIC);
		if (!zhdr->slots)
			goto out_fail;
	}
	return zhdr;

out_fail:
	if (!kref_put(&zhdr->refcount, release_z3fold_page_locked)) {
		add_to_unbuddied(pool, zhdr);
		z3fold_page_unlock(zhdr);
	}
	return NULL;
}

/*

@@ -1066,7 +1073,7 @@ static int z3fold_alloc(struct z3fold_pool *pool, size_t size, gfp_t gfp,
	enum buddy bud;
	bool can_sleep = gfpflags_allow_blocking(gfp);

	if (!size)
	if (!size || (gfp & __GFP_HIGHMEM))
		return -EINVAL;

	if (size > PAGE_SIZE)

@@ -1093,28 +1100,7 @@ retry:
		bud = FIRST;
	}

	page = NULL;
	if (can_sleep) {
		spin_lock(&pool->stale_lock);
		zhdr = list_first_entry_or_null(&pool->stale,
						struct z3fold_header, buddy);
		/*
		 * Before allocating a page, let's see if we can take one from
		 * the stale pages list. cancel_work_sync() can sleep so we
		 * limit this case to the contexts where we can sleep
		 */
		if (zhdr) {
			list_del(&zhdr->buddy);
			spin_unlock(&pool->stale_lock);
			cancel_work_sync(&zhdr->work);
			page = virt_to_page(zhdr);
		} else {
			spin_unlock(&pool->stale_lock);
		}
	}
	if (!page)
		page = alloc_page(gfp);

	page = alloc_page(gfp);
	if (!page)
		return -ENOMEM;

@@ -1134,10 +1120,9 @@ retry:
		__SetPageMovable(page, pool->inode->i_mapping);
		unlock_page(page);
	} else {
		if (trylock_page(page)) {
			__SetPageMovable(page, pool->inode->i_mapping);
			unlock_page(page);
		}
		WARN_ON(!trylock_page(page));
		__SetPageMovable(page, pool->inode->i_mapping);
		unlock_page(page);
	}
	z3fold_page_lock(zhdr);

@@ -1236,8 +1221,8 @@ static void z3fold_free(struct z3fold_pool *pool, unsigned long handle)
		return;
	}
	if (test_and_set_bit(NEEDS_COMPACTING, &page->private)) {
		put_z3fold_header(zhdr);
		clear_bit(PAGE_CLAIMED, &page->private);
		put_z3fold_header(zhdr);
		return;
	}
	if (zhdr->cpu < 0 || !cpu_online(zhdr->cpu)) {

@@ -1332,12 +1317,7 @@ static int z3fold_reclaim_page(struct z3fold_pool *pool, unsigned int retries)
				break;
			}

			if (kref_get_unless_zero(&zhdr->refcount) == 0) {
				zhdr = NULL;
				break;
			}
			if (!z3fold_page_trylock(zhdr)) {
				kref_put(&zhdr->refcount, release_z3fold_page);
				zhdr = NULL;
				continue; /* can't evict at this point */
			}

@@ -1348,14 +1328,14 @@ static int z3fold_reclaim_page(struct z3fold_pool *pool, unsigned int retries)
			 */
			if (zhdr->foreign_handles ||
			    test_and_set_bit(PAGE_CLAIMED, &page->private)) {
				if (!kref_put(&zhdr->refcount,
						release_z3fold_page_locked))
					z3fold_page_unlock(zhdr);
				z3fold_page_unlock(zhdr);
				zhdr = NULL;
				continue; /* can't evict such page */
			}
			list_del_init(&zhdr->buddy);
			zhdr->cpu = -1;
			/* See comment in __z3fold_alloc. */
			kref_get(&zhdr->refcount);
			break;
		}

@@ -1437,8 +1417,10 @@ next:
			spin_lock(&pool->lock);
			list_add(&page->lru, &pool->lru);
			spin_unlock(&pool->lock);
			z3fold_page_unlock(zhdr);
			if (list_empty(&zhdr->buddy))
				add_to_unbuddied(pool, zhdr);
			clear_bit(PAGE_CLAIMED, &page->private);
			z3fold_page_unlock(zhdr);
		}

		/* We started off locked to we need to lock the pool back */

@@ -1590,8 +1572,8 @@ static int z3fold_page_migrate(struct address_space *mapping, struct page *newpa
	if (!z3fold_page_trylock(zhdr))
		return -EAGAIN;
	if (zhdr->mapped_count != 0 || zhdr->foreign_handles != 0) {
		z3fold_page_unlock(zhdr);
		clear_bit(PAGE_CLAIMED, &page->private);
		z3fold_page_unlock(zhdr);
		return -EBUSY;
	}
	if (work_pending(&zhdr->work)) {

@@ -1601,7 +1583,7 @@ static int z3fold_page_migrate(struct address_space *mapping, struct page *newpa
	new_zhdr = page_address(newpage);
	memcpy(new_zhdr, zhdr, PAGE_SIZE);
	newpage->private = page->private;
	page->private = 0;
	set_bit(PAGE_MIGRATED, &page->private);
	z3fold_page_unlock(zhdr);
	spin_lock_init(&new_zhdr->page_lock);
	INIT_WORK(&new_zhdr->work, compact_page_work);

@@ -1631,7 +1613,8 @@ static int z3fold_page_migrate(struct address_space *mapping, struct page *newpa

	queue_work_on(new_zhdr->cpu, pool->compact_wq, &new_zhdr->work);

	clear_bit(PAGE_CLAIMED, &page->private);
	/* PAGE_CLAIMED and PAGE_MIGRATED are cleared now. */
	page->private = 0;
	put_page(page);
	return 0;
}

@@ -1653,6 +1636,8 @@ static void z3fold_page_putback(struct page *page)
	spin_lock(&pool->lock);
	list_add(&page->lru, &pool->lru);
	spin_unlock(&pool->lock);
	if (list_empty(&zhdr->buddy))
		add_to_unbuddied(pool, zhdr);
	clear_bit(PAGE_CLAIMED, &page->private);
	z3fold_page_unlock(zhdr);
}
tools/testing/selftests/cgroup/memcg_protection.m (new file)

@@ -0,0 +1,89 @@
% SPDX-License-Identifier: GPL-2.0
%
% run as: octave-cli memcg_protection.m
%
% This script simulates reclaim protection behavior on a single level of memcg
% hierarchy to illustrate how overcommitted protection spreads among siblings
% (as it depends also on their current consumption).
%
% Simulation assumes siblings consumed the initial amount of memory (w/out
% reclaim) and then the reclaim starts, all memory is reclaimable, i.e. treated
% same. It simulates only non-low reclaim and assumes all memory.min = 0.
%
% Input configurations
% --------------------
% E      number   parent effective protection
% n      vector   nominal protection of siblings set at the given level (memory.low)
% c      vector   current consumption -,,- (memory.current)

% example from testcase (values in GB)
E = 50 / 1024;
n = [75 25 0 500 ] / 1024;
c = [50 50 50 0] / 1024;

% Reclaim parameters
% ------------------

% Minimal reclaim amount (GB)
cluster = 32*4 / 2**20;

% Reclaim coefficient (think as 0.5^sc->priority)
alpha = .1

% Simulation parameters
% ---------------------
epsilon = 1e-7;
timeout = 1000;

% Simulation loop
% ---------------

ch = [];
eh = [];
rh = [];

for t = 1:timeout
	% low_usage
	u = min(c, n);
	siblings = sum(u);

	% effective_protection()
	protected = min(n, c);                % start with nominal
	e = protected * min(1, E / siblings); % normalize overcommit

	% recursive protection
	unclaimed = max(0, E - siblings);
	parent_overuse = sum(c) - siblings;
	if (unclaimed > 0 && parent_overuse > 0)
		overuse = max(0, c - protected);
		e += unclaimed * (overuse / parent_overuse);
	endif

	% get_scan_count()
	r = alpha * c;		% assume all memory is in a single LRU list

	% commit 1bc63fb1272b ("mm, memcg: make scan aggression always exclude protection")
	sz = max(e, c);
	r .*= (1 - (e+epsilon) ./ (sz+epsilon));

	% uncomment to debug prints
	% e, c, r

	% nothing to reclaim, reached equilibrium
	if max(r) < epsilon
		break;
	endif

	% SWAP_CLUSTER_MAX roundup
	r = max(r, (r > epsilon) .* cluster);
	% XXX here I do parallel reclaim of all siblings
	% in reality reclaim is serialized and each sibling recalculates own residual
	c = max(c - r, 0);

	ch = [ch ; c];
	eh = [eh ; e];
	rh = [rh ; r];
endfor

t
c, e
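The selftest changes below lean on this model for their new expectations (roughly 29M and 21M rather than the naive 33M/17M split). As a cross-check, here is the first iteration's effective_protection() step redone in C; it reproduces only the initial proportional split of the overcommitted protection, and the shift toward 29M/21M comes from iterating the full reclaim loop exactly as the Octave script does. The numbers mirror the script above; everything else in the snippet is illustrative.

#include <stdio.h>

int main(void)
{
	double E = 50;				/* parent effective protection, MB */
	double n[4] = { 75, 25, 0, 500 };	/* memory.low of C, D, E, F */
	double c[4] = { 50, 50, 50, 0 };	/* memory.current of C, D, E, F */
	double protected[4], siblings = 0, e;

	for (int i = 0; i < 4; i++) {
		protected[i] = c[i] < n[i] ? c[i] : n[i];	/* min(n, c) */
		siblings += protected[i];
	}
	for (int i = 0; i < 4; i++) {
		/* normalize overcommitted protection to what the parent offers */
		e = protected[i] * (E / siblings < 1 ? E / siblings : 1);
		printf("child %d: effective protection %.1f MB\n", i, e);
	}
	return 0;	/* prints ~33.3, ~16.7, 0, 0 for this input */
}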
tools/testing/selftests/cgroup/test_memcontrol.c

@@ -190,13 +190,6 @@ cleanup:
	return ret;
}

static int alloc_pagecache_50M(const char *cgroup, void *arg)
{
	int fd = (long)arg;

	return alloc_pagecache(fd, MB(50));
}

static int alloc_pagecache_50M_noexit(const char *cgroup, void *arg)
{
	int fd = (long)arg;

@@ -247,33 +240,39 @@ static int cg_test_proc_killed(const char *cgroup)

/*
 * First, this test creates the following hierarchy:
 * A       memory.min = 50M,  memory.max = 200M
 * A/B     memory.min = 50M,  memory.current = 50M
 * A       memory.min = 0,    memory.max = 200M
 * A/B     memory.min = 50M
 * A/B/C   memory.min = 75M,  memory.current = 50M
 * A/B/D   memory.min = 25M,  memory.current = 50M
 * A/B/E   memory.min = 0,    memory.current = 50M
 * A/B/F   memory.min = 500M, memory.current = 0
 *
 * Usages are pagecache, but the test keeps a running
 * (or memory.low if we test soft protection)
 *
 * Usages are pagecache and the test keeps a running
 * process in every leaf cgroup.
 * Then it creates A/G and creates a significant
 * memory pressure in it.
 * memory pressure in A.
 *
 * Then it checks actual memory usages and expects that:
 * A/B    memory.current ~= 50M
 * A/B/C  memory.current ~= 33M
 * A/B/D  memory.current ~= 17M
 * A/B/F  memory.current ~= 0
 * A/B/C  memory.current ~= 29M
 * A/B/D  memory.current ~= 21M
 * A/B/E  memory.current ~= 0
 * A/B/F  memory.current  = 0
 * (for origin of the numbers, see model in memcg_protection.m.)
 *
 * After that it tries to allocate more than there is
 * unprotected memory in A available, and checks
 * checks that memory.min protects pagecache even
 * in this case.
 * unprotected memory in A available, and checks that:
 * a) memory.min protects pagecache even in this case,
 * b) memory.low allows reclaiming page cache with low events.
 */
static int test_memcg_min(const char *root)
static int test_memcg_protection(const char *root, bool min)
{
	int ret = KSFT_FAIL;
	int ret = KSFT_FAIL, rc;
	char *parent[3] = {NULL};
	char *children[4] = {NULL};
	const char *attribute = min ? "memory.min" : "memory.low";
	long c[4];
	int i, attempts;
	int fd;

@@ -297,8 +296,10 @@ static int test_memcg_min(const char *root)
	if (cg_create(parent[0]))
		goto cleanup;

	if (cg_read_long(parent[0], "memory.min")) {
		ret = KSFT_SKIP;
	if (cg_read_long(parent[0], attribute)) {
		/* No memory.min on older kernels is fine */
		if (min)
			ret = KSFT_SKIP;
		goto cleanup;
	}

@@ -335,17 +336,15 @@ static int test_memcg_min(const char *root)
			       (void *)(long)fd);
	}

	if (cg_write(parent[0], "memory.min", "50M"))
	if (cg_write(parent[1], attribute, "50M"))
		goto cleanup;
	if (cg_write(parent[1], "memory.min", "50M"))
	if (cg_write(children[0], attribute, "75M"))
		goto cleanup;
	if (cg_write(children[0], "memory.min", "75M"))
	if (cg_write(children[1], attribute, "25M"))
		goto cleanup;
	if (cg_write(children[1], "memory.min", "25M"))
	if (cg_write(children[2], attribute, "0"))
		goto cleanup;
	if (cg_write(children[2], "memory.min", "0"))
		goto cleanup;
	if (cg_write(children[3], "memory.min", "500M"))
	if (cg_write(children[3], attribute, "500M"))
		goto cleanup;

	attempts = 0;

@@ -365,170 +364,35 @@ static int test_memcg_min(const char *root)
	for (i = 0; i < ARRAY_SIZE(children); i++)
		c[i] = cg_read_long(children[i], "memory.current");

	if (!values_close(c[0], MB(33), 10))
	if (!values_close(c[0], MB(29), 10))
		goto cleanup;

	if (!values_close(c[1], MB(17), 10))
	if (!values_close(c[1], MB(21), 10))
		goto cleanup;

	if (c[3] != 0)
		goto cleanup;

	if (!cg_run(parent[2], alloc_anon, (void *)MB(170)))
	rc = cg_run(parent[2], alloc_anon, (void *)MB(170));
	if (min && !rc)
		goto cleanup;

	if (!values_close(cg_read_long(parent[1], "memory.current"), MB(50), 3))
		goto cleanup;

	ret = KSFT_PASS;

cleanup:
	for (i = ARRAY_SIZE(children) - 1; i >= 0; i--) {
		if (!children[i])
			continue;

		cg_destroy(children[i]);
		free(children[i]);
	}

	for (i = ARRAY_SIZE(parent) - 1; i >= 0; i--) {
		if (!parent[i])
			continue;

		cg_destroy(parent[i]);
		free(parent[i]);
	}
	close(fd);
	return ret;
}

/*
 * First, this test creates the following hierarchy:
 * A       memory.low = 50M,  memory.max = 200M
 * A/B     memory.low = 50M,  memory.current = 50M
 * A/B/C   memory.low = 75M,  memory.current = 50M
 * A/B/D   memory.low = 25M,  memory.current = 50M
 * A/B/E   memory.low = 0,    memory.current = 50M
 * A/B/F   memory.low = 500M, memory.current = 0
 *
 * Usages are pagecache.
 * Then it creates A/G an creates a significant
 * memory pressure in it.
 *
 * Then it checks actual memory usages and expects that:
 * A/B    memory.current ~= 50M
 * A/B/   memory.current ~= 33M
 * A/B/D  memory.current ~= 17M
 * A/B/F  memory.current ~= 0
 *
 * After that it tries to allocate more than there is
 * unprotected memory in A available,
 * and checks low and oom events in memory.events.
 */
static int test_memcg_low(const char *root)
{
	int ret = KSFT_FAIL;
	char *parent[3] = {NULL};
	char *children[4] = {NULL};
	long low, oom;
	long c[4];
	int i;
	int fd;

	fd = get_temp_fd();
	if (fd < 0)
		goto cleanup;

	parent[0] = cg_name(root, "memcg_test_0");
	if (!parent[0])
		goto cleanup;

	parent[1] = cg_name(parent[0], "memcg_test_1");
	if (!parent[1])
		goto cleanup;

	parent[2] = cg_name(parent[0], "memcg_test_2");
	if (!parent[2])
		goto cleanup;

	if (cg_create(parent[0]))
		goto cleanup;

	if (cg_read_long(parent[0], "memory.low"))
		goto cleanup;

	if (cg_write(parent[0], "cgroup.subtree_control", "+memory"))
		goto cleanup;

	if (cg_write(parent[0], "memory.max", "200M"))
		goto cleanup;

	if (cg_write(parent[0], "memory.swap.max", "0"))
		goto cleanup;

	if (cg_create(parent[1]))
		goto cleanup;

	if (cg_write(parent[1], "cgroup.subtree_control", "+memory"))
		goto cleanup;

	if (cg_create(parent[2]))
		goto cleanup;

	for (i = 0; i < ARRAY_SIZE(children); i++) {
		children[i] = cg_name_indexed(parent[1], "child_memcg", i);
		if (!children[i])
			goto cleanup;

		if (cg_create(children[i]))
			goto cleanup;

		if (i > 2)
			continue;

		if (cg_run(children[i], alloc_pagecache_50M, (void *)(long)fd))
			goto cleanup;
	}

	if (cg_write(parent[0], "memory.low", "50M"))
		goto cleanup;
	if (cg_write(parent[1], "memory.low", "50M"))
		goto cleanup;
	if (cg_write(children[0], "memory.low", "75M"))
		goto cleanup;
	if (cg_write(children[1], "memory.low", "25M"))
		goto cleanup;
	if (cg_write(children[2], "memory.low", "0"))
		goto cleanup;
	if (cg_write(children[3], "memory.low", "500M"))
		goto cleanup;

	if (cg_run(parent[2], alloc_anon, (void *)MB(148)))
		goto cleanup;

	if (!values_close(cg_read_long(parent[1], "memory.current"), MB(50), 3))
		goto cleanup;

	for (i = 0; i < ARRAY_SIZE(children); i++)
		c[i] = cg_read_long(children[i], "memory.current");

	if (!values_close(c[0], MB(33), 10))
		goto cleanup;

	if (!values_close(c[1], MB(17), 10))
		goto cleanup;

	if (c[3] != 0)
		goto cleanup;

	if (cg_run(parent[2], alloc_anon, (void *)MB(166))) {
	else if (!min && rc) {
		fprintf(stderr,
			"memory.low prevents from allocating anon memory\n");
		goto cleanup;
	}

	if (!values_close(cg_read_long(parent[1], "memory.current"), MB(50), 3))
		goto cleanup;

	if (min) {
		ret = KSFT_PASS;
		goto cleanup;
	}

	for (i = 0; i < ARRAY_SIZE(children); i++) {
		int no_low_events_index = has_recursiveprot ? 2 : 1;
		int no_low_events_index = 1;
		long low, oom;

		oom = cg_read_key_long(children[i], "memory.events", "oom ");
		low = cg_read_key_long(children[i], "memory.events", "low ");

@@ -564,6 +428,16 @@ cleanup:
	return ret;
}

static int test_memcg_min(const char *root)
{
	return test_memcg_protection(root, true);
}

static int test_memcg_low(const char *root)
{
	return test_memcg_protection(root, false);
}

static int alloc_pagecache_max_30M(const char *cgroup, void *arg)
{
	size_t size = MB(50);

@@ -1241,7 +1115,16 @@ static int test_memcg_oom_group_leaf_events(const char *root)
	if (cg_read_key_long(child, "memory.events", "oom_kill ") <= 0)
		goto cleanup;

	if (cg_read_key_long(parent, "memory.events", "oom_kill ") <= 0)
	parent_oom_events = cg_read_key_long(
			parent, "memory.events", "oom_kill ");
	/*
	 * If memory_localevents is not enabled (the default), the parent should
	 * count OOM events in its children groups. Otherwise, it should not
	 * have observed any events.
	 */
	if (has_localevents && parent_oom_events != 0)
		goto cleanup;
	else if (!has_localevents && parent_oom_events <= 0)
		goto cleanup;

	ret = KSFT_PASS;

@@ -1349,20 +1232,14 @@ static int test_memcg_oom_group_score_events(const char *root)
	if (!cg_run(memcg, alloc_anon, (void *)MB(100)))
		goto cleanup;

	parent_oom_events = cg_read_key_long(
			parent, "memory.events", "oom_kill ");
	/*
	 * If memory_localevents is not enabled (the default), the parent should
	 * count OOM events in its children groups. Otherwise, it should not
	 * have observed any events.
	 */
	if ((has_localevents && parent_oom_events == 0) ||
			parent_oom_events > 0)
		ret = KSFT_PASS;
	if (cg_read_key_long(memcg, "memory.events", "oom_kill ") != 3)
		goto cleanup;

	if (kill(safe_pid, SIGKILL))
		goto cleanup;

	ret = KSFT_PASS;

cleanup:
	if (memcg)
		cg_destroy(memcg);