mm/mglru: fix ineffective protection calculation

commit 30d77b7eef019fa4422980806e8b7cdc8674493e upstream.

mem_cgroup_calculate_protection() is not stateless and should only be used
as part of a top-down tree traversal.  shrink_one() traverses the per-node
memcg LRU instead of the root_mem_cgroup tree, and therefore it should not
call mem_cgroup_calculate_protection().

The existing misuse in shrink_one() can cause ineffective protection of
sub-trees that are grandchildren of root_mem_cgroup.  Fix it by reusing
lru_gen_age_node(), which already traverses the root_mem_cgroup tree, to
calculate the protection.

Previously lru_gen_age_node() opportunistically skips the first pass,
i.e., when scan_control->priority is DEF_PRIORITY.  On the second pass,
lruvec_is_sizable() uses appropriate scan_control->priority, set by
set_initial_priority() from lru_gen_shrink_node(), to decide whether a
memcg is too small to reclaim from.

Now lru_gen_age_node() unconditionally traverses the root_mem_cgroup tree.
So it should call set_initial_priority() upfront, to make sure
lruvec_is_sizable() uses appropriate scan_control->priority on the first
pass.  Otherwise, lruvec_is_reclaimable() can return false negatives and
result in premature OOM kills when min_ttl_ms is used.

Link: https://lkml.kernel.org/r/20240712232956.1427127-1-yuzhao@google.com
Fixes: e4dde56cd2 ("mm: multi-gen LRU: per-node lru_gen_folio lists")
Signed-off-by: Yu Zhao <yuzhao@google.com>
Reported-by: T.J. Mercier <tjmercier@google.com>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
This commit is contained in:
Yu Zhao 2024-07-12 17:29:56 -06:00 committed by Greg Kroah-Hartman
parent be56dfc9be
commit 4211d065ef
1 changed files with 38 additions and 45 deletions

View File

@ -4546,6 +4546,32 @@ done:
* working set protection
******************************************************************************/
static void set_initial_priority(struct pglist_data *pgdat, struct scan_control *sc)
{
int priority;
unsigned long reclaimable;
if (sc->priority != DEF_PRIORITY || sc->nr_to_reclaim < MIN_LRU_BATCH)
return;
/*
* Determine the initial priority based on
* (total >> priority) * reclaimed_to_scanned_ratio = nr_to_reclaim,
* where reclaimed_to_scanned_ratio = inactive / total.
*/
reclaimable = node_page_state(pgdat, NR_INACTIVE_FILE);
if (can_reclaim_anon_pages(NULL, pgdat->node_id, sc))
reclaimable += node_page_state(pgdat, NR_INACTIVE_ANON);
/* round down reclaimable and round up sc->nr_to_reclaim */
priority = fls_long(reclaimable) - 1 - fls_long(sc->nr_to_reclaim - 1);
/*
* The estimation is based on LRU pages only, so cap it to prevent
* overshoots of shrinker objects by large margins.
*/
sc->priority = clamp(priority, DEF_PRIORITY / 2, DEF_PRIORITY);
}
static bool lruvec_is_sizable(struct lruvec *lruvec, struct scan_control *sc)
{
int gen, type, zone;
@ -4579,19 +4605,17 @@ static bool lruvec_is_reclaimable(struct lruvec *lruvec, struct scan_control *sc
struct mem_cgroup *memcg = lruvec_memcg(lruvec);
DEFINE_MIN_SEQ(lruvec);
/* see the comment on lru_gen_folio */
gen = lru_gen_from_seq(min_seq[LRU_GEN_FILE]);
birth = READ_ONCE(lruvec->lrugen.timestamps[gen]);
if (time_is_after_jiffies(birth + min_ttl))
if (mem_cgroup_below_min(NULL, memcg))
return false;
if (!lruvec_is_sizable(lruvec, sc))
return false;
mem_cgroup_calculate_protection(NULL, memcg);
/* see the comment on lru_gen_folio */
gen = lru_gen_from_seq(min_seq[LRU_GEN_FILE]);
birth = READ_ONCE(lruvec->lrugen.timestamps[gen]);
return !mem_cgroup_below_min(NULL, memcg);
return time_is_before_jiffies(birth + min_ttl);
}
/* to protect the working set of the last N jiffies */
@ -4601,23 +4625,20 @@ static void lru_gen_age_node(struct pglist_data *pgdat, struct scan_control *sc)
{
struct mem_cgroup *memcg;
unsigned long min_ttl = READ_ONCE(lru_gen_min_ttl);
bool reclaimable = !min_ttl;
VM_WARN_ON_ONCE(!current_is_kswapd());
/* check the order to exclude compaction-induced reclaim */
if (!min_ttl || sc->order || sc->priority == DEF_PRIORITY)
return;
set_initial_priority(pgdat, sc);
memcg = mem_cgroup_iter(NULL, NULL, NULL);
do {
struct lruvec *lruvec = mem_cgroup_lruvec(memcg, pgdat);
if (lruvec_is_reclaimable(lruvec, sc, min_ttl)) {
mem_cgroup_iter_break(NULL, memcg);
return;
}
mem_cgroup_calculate_protection(NULL, memcg);
cond_resched();
if (!reclaimable)
reclaimable = lruvec_is_reclaimable(lruvec, sc, min_ttl);
} while ((memcg = mem_cgroup_iter(NULL, memcg, NULL)));
/*
@ -4625,7 +4646,7 @@ static void lru_gen_age_node(struct pglist_data *pgdat, struct scan_control *sc)
* younger than min_ttl. However, another possibility is all memcgs are
* either too small or below min.
*/
if (mutex_trylock(&oom_lock)) {
if (!reclaimable && mutex_trylock(&oom_lock)) {
struct oom_control oc = {
.gfp_mask = sc->gfp_mask,
};
@ -5424,8 +5445,7 @@ static int shrink_one(struct lruvec *lruvec, struct scan_control *sc)
struct mem_cgroup *memcg = lruvec_memcg(lruvec);
struct pglist_data *pgdat = lruvec_pgdat(lruvec);
mem_cgroup_calculate_protection(NULL, memcg);
/* lru_gen_age_node() called mem_cgroup_calculate_protection() */
if (mem_cgroup_below_min(NULL, memcg))
return MEMCG_LRU_YOUNG;
@ -5565,33 +5585,6 @@ static void lru_gen_shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc
#endif
static void set_initial_priority(struct pglist_data *pgdat, struct scan_control *sc)
{
int priority;
unsigned long reclaimable;
struct lruvec *lruvec = mem_cgroup_lruvec(NULL, pgdat);
if (sc->priority != DEF_PRIORITY || sc->nr_to_reclaim < MIN_LRU_BATCH)
return;
/*
* Determine the initial priority based on
* (total >> priority) * reclaimed_to_scanned_ratio = nr_to_reclaim,
* where reclaimed_to_scanned_ratio = inactive / total.
*/
reclaimable = node_page_state(pgdat, NR_INACTIVE_FILE);
if (get_swappiness(lruvec, sc))
reclaimable += node_page_state(pgdat, NR_INACTIVE_ANON);
/* round down reclaimable and round up sc->nr_to_reclaim */
priority = fls_long(reclaimable) - 1 - fls_long(sc->nr_to_reclaim - 1);
/*
* The estimation is based on LRU pages only, so cap it to prevent
* overshoots of shrinker objects by large margins.
*/
sc->priority = clamp(priority, DEF_PRIORITY / 2, DEF_PRIORITY);
}
static void lru_gen_shrink_node(struct pglist_data *pgdat, struct scan_control *sc)
{
struct blk_plug plug;