mm/mglru: fix ineffective protection calculation
commit 30d77b7eef019fa4422980806e8b7cdc8674493e upstream.
mem_cgroup_calculate_protection() is not stateless and should only be used
as part of a top-down tree traversal. shrink_one() traverses the per-node
memcg LRU instead of the root_mem_cgroup tree, and therefore it should not
call mem_cgroup_calculate_protection().
The existing misuse in shrink_one() can cause ineffective protection of
sub-trees that are grandchildren of root_mem_cgroup. Fix it by reusing
lru_gen_age_node(), which already traverses the root_mem_cgroup tree, to
calculate the protection.
Previously lru_gen_age_node() opportunistically skips the first pass,
i.e., when scan_control->priority is DEF_PRIORITY. On the second pass,
lruvec_is_sizable() uses appropriate scan_control->priority, set by
set_initial_priority() from lru_gen_shrink_node(), to decide whether a
memcg is too small to reclaim from.
Now lru_gen_age_node() unconditionally traverses the root_mem_cgroup tree.
So it should call set_initial_priority() upfront, to make sure
lruvec_is_sizable() uses appropriate scan_control->priority on the first
pass. Otherwise, lruvec_is_reclaimable() can return false negatives and
result in premature OOM kills when min_ttl_ms is used.
Link: https://lkml.kernel.org/r/20240712232956.1427127-1-yuzhao@google.com
Fixes: e4dde56cd2
("mm: multi-gen LRU: per-node lru_gen_folio lists")
Signed-off-by: Yu Zhao <yuzhao@google.com>
Reported-by: T.J. Mercier <tjmercier@google.com>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
This commit is contained in:
parent
be56dfc9be
commit
4211d065ef
83
mm/vmscan.c
83
mm/vmscan.c
|
@ -4546,6 +4546,32 @@ done:
|
|||
* working set protection
|
||||
******************************************************************************/
|
||||
|
||||
static void set_initial_priority(struct pglist_data *pgdat, struct scan_control *sc)
|
||||
{
|
||||
int priority;
|
||||
unsigned long reclaimable;
|
||||
|
||||
if (sc->priority != DEF_PRIORITY || sc->nr_to_reclaim < MIN_LRU_BATCH)
|
||||
return;
|
||||
/*
|
||||
* Determine the initial priority based on
|
||||
* (total >> priority) * reclaimed_to_scanned_ratio = nr_to_reclaim,
|
||||
* where reclaimed_to_scanned_ratio = inactive / total.
|
||||
*/
|
||||
reclaimable = node_page_state(pgdat, NR_INACTIVE_FILE);
|
||||
if (can_reclaim_anon_pages(NULL, pgdat->node_id, sc))
|
||||
reclaimable += node_page_state(pgdat, NR_INACTIVE_ANON);
|
||||
|
||||
/* round down reclaimable and round up sc->nr_to_reclaim */
|
||||
priority = fls_long(reclaimable) - 1 - fls_long(sc->nr_to_reclaim - 1);
|
||||
|
||||
/*
|
||||
* The estimation is based on LRU pages only, so cap it to prevent
|
||||
* overshoots of shrinker objects by large margins.
|
||||
*/
|
||||
sc->priority = clamp(priority, DEF_PRIORITY / 2, DEF_PRIORITY);
|
||||
}
|
||||
|
||||
static bool lruvec_is_sizable(struct lruvec *lruvec, struct scan_control *sc)
|
||||
{
|
||||
int gen, type, zone;
|
||||
|
@ -4579,19 +4605,17 @@ static bool lruvec_is_reclaimable(struct lruvec *lruvec, struct scan_control *sc
|
|||
struct mem_cgroup *memcg = lruvec_memcg(lruvec);
|
||||
DEFINE_MIN_SEQ(lruvec);
|
||||
|
||||
/* see the comment on lru_gen_folio */
|
||||
gen = lru_gen_from_seq(min_seq[LRU_GEN_FILE]);
|
||||
birth = READ_ONCE(lruvec->lrugen.timestamps[gen]);
|
||||
|
||||
if (time_is_after_jiffies(birth + min_ttl))
|
||||
if (mem_cgroup_below_min(NULL, memcg))
|
||||
return false;
|
||||
|
||||
if (!lruvec_is_sizable(lruvec, sc))
|
||||
return false;
|
||||
|
||||
mem_cgroup_calculate_protection(NULL, memcg);
|
||||
/* see the comment on lru_gen_folio */
|
||||
gen = lru_gen_from_seq(min_seq[LRU_GEN_FILE]);
|
||||
birth = READ_ONCE(lruvec->lrugen.timestamps[gen]);
|
||||
|
||||
return !mem_cgroup_below_min(NULL, memcg);
|
||||
return time_is_before_jiffies(birth + min_ttl);
|
||||
}
|
||||
|
||||
/* to protect the working set of the last N jiffies */
|
||||
|
@ -4601,23 +4625,20 @@ static void lru_gen_age_node(struct pglist_data *pgdat, struct scan_control *sc)
|
|||
{
|
||||
struct mem_cgroup *memcg;
|
||||
unsigned long min_ttl = READ_ONCE(lru_gen_min_ttl);
|
||||
bool reclaimable = !min_ttl;
|
||||
|
||||
VM_WARN_ON_ONCE(!current_is_kswapd());
|
||||
|
||||
/* check the order to exclude compaction-induced reclaim */
|
||||
if (!min_ttl || sc->order || sc->priority == DEF_PRIORITY)
|
||||
return;
|
||||
set_initial_priority(pgdat, sc);
|
||||
|
||||
memcg = mem_cgroup_iter(NULL, NULL, NULL);
|
||||
do {
|
||||
struct lruvec *lruvec = mem_cgroup_lruvec(memcg, pgdat);
|
||||
|
||||
if (lruvec_is_reclaimable(lruvec, sc, min_ttl)) {
|
||||
mem_cgroup_iter_break(NULL, memcg);
|
||||
return;
|
||||
}
|
||||
mem_cgroup_calculate_protection(NULL, memcg);
|
||||
|
||||
cond_resched();
|
||||
if (!reclaimable)
|
||||
reclaimable = lruvec_is_reclaimable(lruvec, sc, min_ttl);
|
||||
} while ((memcg = mem_cgroup_iter(NULL, memcg, NULL)));
|
||||
|
||||
/*
|
||||
|
@ -4625,7 +4646,7 @@ static void lru_gen_age_node(struct pglist_data *pgdat, struct scan_control *sc)
|
|||
* younger than min_ttl. However, another possibility is all memcgs are
|
||||
* either too small or below min.
|
||||
*/
|
||||
if (mutex_trylock(&oom_lock)) {
|
||||
if (!reclaimable && mutex_trylock(&oom_lock)) {
|
||||
struct oom_control oc = {
|
||||
.gfp_mask = sc->gfp_mask,
|
||||
};
|
||||
|
@ -5424,8 +5445,7 @@ static int shrink_one(struct lruvec *lruvec, struct scan_control *sc)
|
|||
struct mem_cgroup *memcg = lruvec_memcg(lruvec);
|
||||
struct pglist_data *pgdat = lruvec_pgdat(lruvec);
|
||||
|
||||
mem_cgroup_calculate_protection(NULL, memcg);
|
||||
|
||||
/* lru_gen_age_node() called mem_cgroup_calculate_protection() */
|
||||
if (mem_cgroup_below_min(NULL, memcg))
|
||||
return MEMCG_LRU_YOUNG;
|
||||
|
||||
|
@ -5565,33 +5585,6 @@ static void lru_gen_shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc
|
|||
|
||||
#endif
|
||||
|
||||
static void set_initial_priority(struct pglist_data *pgdat, struct scan_control *sc)
|
||||
{
|
||||
int priority;
|
||||
unsigned long reclaimable;
|
||||
struct lruvec *lruvec = mem_cgroup_lruvec(NULL, pgdat);
|
||||
|
||||
if (sc->priority != DEF_PRIORITY || sc->nr_to_reclaim < MIN_LRU_BATCH)
|
||||
return;
|
||||
/*
|
||||
* Determine the initial priority based on
|
||||
* (total >> priority) * reclaimed_to_scanned_ratio = nr_to_reclaim,
|
||||
* where reclaimed_to_scanned_ratio = inactive / total.
|
||||
*/
|
||||
reclaimable = node_page_state(pgdat, NR_INACTIVE_FILE);
|
||||
if (get_swappiness(lruvec, sc))
|
||||
reclaimable += node_page_state(pgdat, NR_INACTIVE_ANON);
|
||||
|
||||
/* round down reclaimable and round up sc->nr_to_reclaim */
|
||||
priority = fls_long(reclaimable) - 1 - fls_long(sc->nr_to_reclaim - 1);
|
||||
|
||||
/*
|
||||
* The estimation is based on LRU pages only, so cap it to prevent
|
||||
* overshoots of shrinker objects by large margins.
|
||||
*/
|
||||
sc->priority = clamp(priority, DEF_PRIORITY / 2, DEF_PRIORITY);
|
||||
}
|
||||
|
||||
static void lru_gen_shrink_node(struct pglist_data *pgdat, struct scan_control *sc)
|
||||
{
|
||||
struct blk_plug plug;
|
||||
|
|
Loading…
Reference in New Issue