OpenCloudOS-Kernel/mm/memcontrol.c

11046 lines
281 KiB
C

// SPDX-License-Identifier: GPL-2.0-or-later
/* memcontrol.c - Memory Controller
*
* Copyright IBM Corporation, 2007
* Author Balbir Singh <balbir@linux.vnet.ibm.com>
*
* Copyright 2007 OpenVZ SWsoft Inc
* Author: Pavel Emelianov <xemul@openvz.org>
*
* Memory thresholds
* Copyright (C) 2009 Nokia Corporation
* Author: Kirill A. Shutemov
*
* Kernel Memory Controller
* Copyright (C) 2012 Parallels Inc. and Google Inc.
* Authors: Glauber Costa and Suleiman Souhlal
*
* Native page reclaim
* Charge lifetime sanitation
* Lockless page tracking & accounting
* Unified hierarchy configuration model
* Copyright (C) 2015 Red Hat, Inc., Johannes Weiner
*
* Per memcg lru locking
* Copyright (C) 2020 Alibaba, Inc, Alex Shi
*/
#include <linux/page_counter.h>
#include <linux/memcontrol.h>
#include <linux/cgroup.h>
#include <linux/pagewalk.h>
#include <linux/sched/mm.h>
#include <linux/psi.h>
#include <linux/shmem_fs.h>
#include <linux/hugetlb.h>
#include <linux/pagemap.h>
#include <linux/vm_event_item.h>
#include <linux/smp.h>
#include <linux/page-flags.h>
#include <linux/backing-dev.h>
#include <linux/bit_spinlock.h>
#include <linux/rcupdate.h>
#include <linux/limits.h>
#include <linux/export.h>
#include <linux/mutex.h>
#include <linux/rbtree.h>
#include <linux/slab.h>
#include <linux/swap.h>
#include <linux/swapops.h>
#include <linux/spinlock.h>
#include <linux/eventfd.h>
#include <linux/poll.h>
#include <linux/sort.h>
#include <linux/fs.h>
#include <linux/seq_file.h>
#include <linux/vmpressure.h>
#include <linux/mm_inline.h>
#include <linux/swap_cgroup.h>
#include <linux/cpu.h>
#include <linux/oom.h>
#include <linux/lockdep.h>
#include <linux/file.h>
#include <linux/tracehook.h>
#include <linux/psi.h>
#include <linux/seq_buf.h>
#include <linux/namei.h>
#include "internal.h"
#include <net/sock.h>
#include <net/ip.h>
#include "slab.h"
#include <linux/sli.h>
#include <linux/uaccess.h>
#include <linux/sli.h>
#include <trace/events/vmscan.h>
/* Subsystem descriptor of the memory controller; exported via EXPORT_SYMBOL. */
struct cgroup_subsys memory_cgrp_subsys __read_mostly;
EXPORT_SYMBOL(memory_cgrp_subsys);
/* Root memcg; code in this file substitutes it when no memcg is available. */
struct mem_cgroup *root_mem_cgroup __read_mostly;
/* Retry limits; the code consuming them is outside this part of the file. */
#define MEMCG_PAGECACHE_RETRIES 20
#define MEM_CGROUP_RECLAIM_RETRIES 5
#ifdef CONFIG_MEM_QOS
#define DEFAULT_PAGE_RECLAIM_RATIO 5
#endif
/* Bounds for the page cache max ratio - presumably a percentage, TODO confirm. */
#define PAGECACHE_MAX_RATIO_MIN 5
#define PAGECACHE_MAX_RATIO_MAX 100
/* Socket memory accounting disabled? */
static bool cgroup_memory_nosocket;
/* Kernel memory accounting disabled? */
bool cgroup_memory_nokmem = IS_ENABLED(CONFIG_MEMCG_KMEM_DEFAULT_OFF);
/*
 * Non-zero when swap accounting is off (see do_memsw_account()). Without
 * CONFIG_MEMCG_SWAP the name is a macro that expands to the constant 1.
 */
#ifdef CONFIG_MEMCG_SWAP
bool cgroup_memory_noswap __read_mostly;
#else
#define cgroup_memory_noswap 1
#endif
#ifdef CONFIG_CGROUP_WRITEBACK
static DECLARE_WAIT_QUEUE_HEAD(memcg_cgwb_frn_waitq);
#endif
/*
 * State that only exists with CONFIG_MEM_QOS. NOTE(review): the consumers of
 * these variables are not in this part of the file, so the per-variable
 * meaning below is inferred from the names only.
 */
#ifdef CONFIG_MEM_QOS
unsigned int sysctl_clean_dying_memcg_async;
unsigned int sysctl_clean_dying_memcg_threshold = 100;
static struct task_struct *kclean_dying_memcg;
DECLARE_WAIT_QUEUE_HEAD(kclean_dying_memcg_wq);
int sysctl_vm_memory_qos;
/* Defaults to CGROUP_PRIORITY_MAX, i.e. no reclaim priority is configured */
int sysctl_vm_qos_highest_reclaim_prio = CGROUP_PRIORITY_MAX;
static unsigned long rmem_wmark_limit;
static unsigned long rmem_wmark_setpoint;
static unsigned long rmem_wmark_freerun;
static long memcg_pos_ratio;
static atomic_long_t memcg_allocated_count;
static atomic_long_t memcg_reclaimed_count;
static unsigned long memcg_reclaim_goal;
static int memcg_cur_reclaim_prio = CGROUP_PRIORITY_MAX;
static DEFINE_SPINLOCK(memcg_reclaim_prio_lock);
/* workqueue for async reclaim */
struct workqueue_struct *memcg_async_reclaim_wq;
#define ASYNC_DISTANCE_DIV 1000000
#define ASYNC_RATIO_DIV 100
#define ASYNC_DISTANCE_DEF 1
#else
/* Without CONFIG_MEM_QOS the knob is the compile-time constant false. */
#define sysctl_vm_memory_qos false
#endif
/* Whether legacy memory+swap accounting is active */
static bool do_memsw_account(void)
{
return !cgroup_subsys_on_dfl(memory_cgrp_subsys) && !cgroup_memory_noswap;
}
/*
 * Page-event intervals that mem_cgroup_event_ratelimit() adds to the per-cpu
 * target for the threshold, soft limit and NUMA info checks respectively.
 */
#define THRESHOLDS_EVENTS_TARGET 128
#define SOFTLIMIT_EVENTS_TARGET 1024
#define NUMAINFO_EVENTS_TARGET 1024
/*
 * Cgroups above their limits are maintained in a RB-Tree, independent of
 * their hierarchy representation.
 *
 * One tree exists per node. Entries are keyed by usage_in_excess, with
 * larger values to the right (see __mem_cgroup_insert_exceeded()).
 */
struct mem_cgroup_tree_per_node {
/* Root of the per-node tree of mem_cgroup_per_node entries */
struct rb_root rb_root;
/* Cached rightmost node, i.e. the entry with the largest excess; NULL if empty */
struct rb_node *rb_rightmost;
/* Held around every insert/remove on this tree in this file */
spinlock_t lock;
};
/* Per-node tree pointers, indexed by node id; a slot may be NULL. */
struct mem_cgroup_tree {
struct mem_cgroup_tree_per_node *rb_tree_per_node[MAX_NUMNODES];
};
/* The single global soft limit tree set. */
static struct mem_cgroup_tree soft_limit_tree __read_mostly;
/* for OOM: one list entry per eventfd context */
struct mem_cgroup_eventfd_list {
/* Linkage into the owning list */
struct list_head list;
/* eventfd context associated with this entry */
struct eventfd_ctx *eventfd;
};
/*
 * mem_cgroup_event represents an event which userspace wants to receive.
 */
struct mem_cgroup_event {
/*
 * memcg which the event belongs to.
 */
struct mem_cgroup *memcg;
/*
 * eventfd to signal userspace about the event.
 */
struct eventfd_ctx *eventfd;
/*
 * Each of these is stored in a list by the cgroup.
 */
struct list_head list;
/*
 * register_event() callback will be used to add a new userspace
 * waiter for changes related to this event. Use eventfd_signal()
 * on eventfd to send a notification to userspace.
 */
int (*register_event)(struct mem_cgroup *memcg,
struct eventfd_ctx *eventfd, const char *args);
/*
 * unregister_event() callback will be called when userspace closes
 * the eventfd or when the cgroup is removed. This callback must be set
 * if you want to provide notification functionality.
 */
void (*unregister_event)(struct mem_cgroup *memcg,
struct eventfd_ctx *eventfd);
/*
 * All fields below are needed to unregister the event when
 * userspace closes the eventfd.
 */
poll_table pt;
wait_queue_head_t *wqh;
wait_queue_entry_t wait;
struct work_struct remove;
};
/*
 * Forward declarations of static helpers that are defined later in this
 * file; memcg_check_events() below calls mem_cgroup_threshold().
 */
static void mem_cgroup_threshold(struct mem_cgroup *memcg);
static void mem_cgroup_oom_notify(struct mem_cgroup *memcg);
/* State for moving charges at task migration. */
/*
 * Types of charges to be moved (bit flags).
 */
#define MOVE_ANON 0x1U
#define MOVE_FILE 0x2U
#define MOVE_MASK (MOVE_ANON | MOVE_FILE)
/*
 * "mc" and its members are protected by cgroup_mutex.
 * A single static instance exists; its spinlock and wait queue are
 * initialized statically below.
 */
static struct move_charge_struct {
spinlock_t lock; /* for from, to */
struct mm_struct *mm;
struct mem_cgroup *from;
struct mem_cgroup *to;
unsigned long flags;
unsigned long precharge;
unsigned long moved_charge;
unsigned long moved_swap;
struct task_struct *moving_task; /* a task moving charges */
wait_queue_head_t waitq; /* a waitq for other context */
} mc = {
.lock = __SPIN_LOCK_UNLOCKED(mc.lock),
.waitq = __WAIT_QUEUE_HEAD_INITIALIZER(mc.waitq),
};
/*
 * Maximum loops in mem_cgroup_hierarchical_reclaim(), used for soft
 * limit reclaim to prevent infinite loops, if they ever occur.
 */
#define MEM_CGROUP_MAX_RECLAIM_LOOPS 100
#define MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS 2
/* Resource type, used for encoding the cft->private value of a file */
enum res_type {
_MEM,
_MEMSWAP,
_OOM_TYPE,
_KMEM,
_TCP,
};
/*
 * cft->private packs the type into the bits above bit 15 and the attribute
 * into the low 16 bits; MEMFILE_TYPE()/MEMFILE_ATTR() extract them again.
 */
#define MEMFILE_PRIVATE(x, val) ((x) << 16 | (val))
#define MEMFILE_TYPE(val) ((val) >> 16 & 0xffff)
#define MEMFILE_ATTR(val) ((val) & 0xffff)
/* Used for the OOM notifier */
#define OOM_CONTROL (0)
/*
 * NOTE(review): these macros are only defined here when CONFIG_MEM_QOS is
 * off; with CONFIG_MEM_QOS they presumably come from a header - confirm.
 */
#ifndef CONFIG_MEM_QOS
/*
 * Iteration constructs for visiting all cgroups (under a tree). If
 * loops are exited prematurely (break), mem_cgroup_iter_break() must
 * be used for reference counting.
 *
 * Both macros drive mem_cgroup_iter() without a reclaim cookie, i.e. they
 * perform a full walk; for_each_mem_cgroup() passes a NULL root.
 */
#define for_each_mem_cgroup_tree(iter, root) \
for (iter = mem_cgroup_iter(root, NULL, NULL); \
iter != NULL; \
iter = mem_cgroup_iter(root, iter, NULL))
#define for_each_mem_cgroup(iter) \
for (iter = mem_cgroup_iter(NULL, NULL, NULL); \
iter != NULL; \
iter = mem_cgroup_iter(NULL, iter, NULL))
#endif
static inline bool should_force_charge(void)
{
return tsk_is_oom_victim(current) || fatal_signal_pending(current) ||
(current->flags & PF_EXITING);
}
/* Some nice accessors for the vmpressure. */

/*
 * Returns the vmpressure embedded in @memcg, or the one of root_mem_cgroup
 * when @memcg is NULL.
 */
struct vmpressure *memcg_to_vmpressure(struct mem_cgroup *memcg)
{
	struct mem_cgroup *target = memcg ? memcg : root_mem_cgroup;

	return &target->vmpressure;
}
struct cgroup_subsys_state *vmpressure_to_css(struct vmpressure *vmpr)
{
return &container_of(vmpr, struct mem_cgroup, vmpressure)->css;
}
#ifdef CONFIG_MEMCG_KMEM
/* Serializes objcg list manipulation and objcg->memcg retargeting. */
static DEFINE_SPINLOCK(objcg_lock);
/*
 * Release callback of objcg->refcnt (registered in obj_cgroup_alloc()).
 * Uncharges any whole pages still recorded in nr_charged_bytes, unlinks the
 * objcg from its list, drops the memcg reference and frees the objcg after
 * an RCU grace period.
 */
static void obj_cgroup_release(struct percpu_ref *ref)
{
struct obj_cgroup *objcg = container_of(ref, struct obj_cgroup, refcnt);
struct mem_cgroup *memcg;
unsigned int nr_bytes;
unsigned int nr_pages;
unsigned long flags;
/*
 * At this point all allocated objects are freed, and
 * objcg->nr_charged_bytes can't have an arbitrary byte value.
 * However, it can be PAGE_SIZE or (x * PAGE_SIZE).
 *
 * The following sequence can lead to it:
 * 1) CPU0: objcg == stock->cached_objcg
 * 2) CPU1: we do a small allocation (e.g. 92 bytes),
 *          PAGE_SIZE bytes are charged
 * 3) CPU1: a process from another memcg is allocating something,
 *          the stock is flushed,
 *          objcg->nr_charged_bytes = PAGE_SIZE - 92
 * 4) CPU0: we do release this object,
 *          92 bytes are added to stock->nr_bytes
 * 5) CPU0: stock is flushed,
 *          92 bytes are added to objcg->nr_charged_bytes
 *
 * In the result, nr_charged_bytes == PAGE_SIZE.
 * This page will be uncharged in obj_cgroup_release().
 */
nr_bytes = atomic_read(&objcg->nr_charged_bytes);
/* Only whole pages may be left over; warn once otherwise. */
WARN_ON_ONCE(nr_bytes & (PAGE_SIZE - 1));
nr_pages = nr_bytes >> PAGE_SHIFT;
spin_lock_irqsave(&objcg_lock, flags);
memcg = obj_cgroup_memcg(objcg);
if (nr_pages)
__memcg_kmem_uncharge(memcg, nr_pages);
list_del(&objcg->list);
spin_unlock_irqrestore(&objcg_lock, flags);
/* The memcg reference is dropped only after objcg_lock is released. */
mem_cgroup_put(memcg);
percpu_ref_exit(ref);
kfree_rcu(objcg, rcu);
}
/*
 * Allocates a zeroed obj_cgroup, initializes its percpu refcount with
 * obj_cgroup_release() as the release callback and sets up its list head.
 *
 * Returns the new objcg, or NULL if either allocation step fails.
 */
static struct obj_cgroup *obj_cgroup_alloc(void)
{
	struct obj_cgroup *new_objcg;

	new_objcg = kzalloc(sizeof(struct obj_cgroup), GFP_KERNEL);
	if (!new_objcg)
		return NULL;

	if (percpu_ref_init(&new_objcg->refcnt, obj_cgroup_release, 0,
			    GFP_KERNEL)) {
		/* The refcount could not be set up: undo the allocation. */
		kfree(new_objcg);
		return NULL;
	}

	INIT_LIST_HEAD(&new_objcg->list);
	return new_objcg;
}
/*
 * Hands all obj_cgroups of @memcg over to @parent.
 *
 * The active objcg is detached from @memcg (memcg->objcg becomes NULL),
 * retargeted to @parent and added to the parent's objcg_list. Every objcg
 * already sitting on memcg->objcg_list is retargeted as well, and the list
 * is spliced onto the parent's. Finally the formerly active objcg's percpu
 * refcount is killed.
 */
static void memcg_reparent_objcgs(struct mem_cgroup *memcg,
struct mem_cgroup *parent)
{
struct obj_cgroup *objcg, *iter;
objcg = rcu_replace_pointer(memcg->objcg, NULL, true);
spin_lock_irq(&objcg_lock);
/* Move active objcg to the parent's list */
xchg(&objcg->memcg, parent);
/* The retargeted objcg now pins @parent. */
css_get(&parent->css);
list_add(&objcg->list, &parent->objcg_list);
/* Move already reparented objcgs to the parent's list */
list_for_each_entry(iter, &memcg->objcg_list, list) {
/* Swap the css reference: take one on @parent, drop the one on @memcg. */
css_get(&parent->css);
xchg(&iter->memcg, parent);
css_put(&memcg->css);
}
list_splice(&memcg->objcg_list, &parent->objcg_list);
spin_unlock_irq(&objcg_lock);
percpu_ref_kill(&objcg->refcnt);
}
/*
 * This will be used as a shrinker list's index.
 * The main reason for not using cgroup id for this:
 * this works better in sparse environments, where we have a lot of memcgs,
 * but only a few kmem-limited. Or also, if we have, for instance, 200
 * memcgs, and none but the 200th is kmem-limited, we'd have to have a
 * 200 entry array for that.
 *
 * The current size of the caches array is stored in memcg_nr_cache_ids. It
 * will double each time we have to increase it.
 */
static DEFINE_IDA(memcg_cache_ida);
int memcg_nr_cache_ids;
/* Protects memcg_nr_cache_ids */
static DECLARE_RWSEM(memcg_cache_ids_sem);
/*
 * Takes memcg_cache_ids_sem for reading; pair with memcg_put_cache_ids().
 */
void memcg_get_cache_ids(void)
{
down_read(&memcg_cache_ids_sem);
}
/* Releases the read side taken by memcg_get_cache_ids(). */
void memcg_put_cache_ids(void)
{
up_read(&memcg_cache_ids_sem);
}
/*
 * MIN_SIZE is different than 1, because we would like to avoid going through
 * the alloc/free process all the time. In a small machine, 4 kmem-limited
 * cgroups is a reasonable guess. In the future, it could be a parameter or
 * tunable, but that is strictly not necessary.
 *
 * MAX_SIZE should be as large as the number of cgrp_ids. Ideally, we could get
 * this constant directly from cgroup, but it is understandable that this is
 * better kept as an internal representation in cgroup.c. In any case, the
 * cgrp_id space is not getting any smaller, and we don't have to necessarily
 * increase ours as well if it increases.
 */
#define MEMCG_CACHES_MIN_SIZE 4
#define MEMCG_CACHES_MAX_SIZE MEM_CGROUP_ID_MAX
/*
 * A lot of the calls to the cache allocation functions are expected to be
 * inlined by the compiler. Since the calls to memcg_slab_pre_alloc_hook() are
 * conditional on this static branch, we have to allow modules that do
 * kmem_cache_alloc and the like to see this symbol as well.
 */
DEFINE_STATIC_KEY_FALSE(memcg_kmem_enabled_key);
EXPORT_SYMBOL(memcg_kmem_enabled_key);
/* Closes the CONFIG_MEMCG_KMEM section opened before objcg_lock. */
#endif
/* Size in bytes of the bitmap part of every memcg shrinker map. */
static int memcg_shrinker_map_size;
/* Held while shrinker maps are allocated or expanded. */
static DEFINE_MUTEX(memcg_shrinker_map_mutex);

/*
 * RCU callback: frees the memcg_shrinker_map that embeds @head.
 */
static void memcg_free_shrinker_map_rcu(struct rcu_head *head)
{
	struct memcg_shrinker_map *stale;

	stale = container_of(head, struct memcg_shrinker_map, rcu);
	kvfree(stale);
}
static int memcg_expand_one_shrinker_map(struct mem_cgroup *memcg,
int size, int old_size)
{
struct memcg_shrinker_map *new, *old;
int nid;
lockdep_assert_held(&memcg_shrinker_map_mutex);
for_each_node(nid) {
old = rcu_dereference_protected(
mem_cgroup_nodeinfo(memcg, nid)->shrinker_map, true);
/* Not yet online memcg */
if (!old)
return 0;
new = kvmalloc(sizeof(*new) + size, GFP_KERNEL);
if (!new)
return -ENOMEM;
/* Set all old bits, clear all new bits */
memset(new->map, (int)0xff, old_size);
memset((void *)new->map + old_size, 0, size - old_size);
rcu_assign_pointer(memcg->nodeinfo[nid]->shrinker_map, new);
call_rcu(&old->rcu, memcg_free_shrinker_map_rcu);
}
return 0;
}
/*
 * Frees the shrinker map of @memcg on every node and clears the pointers.
 * Does nothing for the root memcg.
 *
 * The maps are freed immediately (no RCU grace period), so the caller must
 * guarantee there are no concurrent readers - presumably because the memcg
 * is not, or no longer, reachable; verify against the callers.
 */
static void memcg_free_shrinker_maps(struct mem_cgroup *memcg)
{
	struct mem_cgroup_per_node *pn;
	struct memcg_shrinker_map *map;
	int nid;

	if (mem_cgroup_is_root(memcg))
		return;

	for_each_node(nid) {
		pn = mem_cgroup_nodeinfo(memcg, nid);
		map = rcu_dereference_protected(pn->shrinker_map, true);
		/* kvfree() accepts NULL, so a missing map needs no guard. */
		kvfree(map);
		rcu_assign_pointer(pn->shrinker_map, NULL);
	}
}
/*
 * Allocates a zeroed shrinker map for @memcg on every node, sized according
 * to the current memcg_shrinker_map_size. Nothing is done for the root
 * memcg.
 *
 * Returns 0 on success. If an allocation fails, the maps of @memcg are
 * released again through memcg_free_shrinker_maps() and -ENOMEM is returned.
 */
static int memcg_alloc_shrinker_maps(struct mem_cgroup *memcg)
{
	struct memcg_shrinker_map *fresh;
	int map_bytes;
	int err = 0;
	int nid;

	if (mem_cgroup_is_root(memcg))
		return 0;

	mutex_lock(&memcg_shrinker_map_mutex);
	map_bytes = memcg_shrinker_map_size;
	for_each_node(nid) {
		fresh = kvzalloc(sizeof(*fresh) + map_bytes, GFP_KERNEL);
		if (!fresh) {
			memcg_free_shrinker_maps(memcg);
			err = -ENOMEM;
			goto unlock;
		}
		rcu_assign_pointer(memcg->nodeinfo[nid]->shrinker_map, fresh);
	}
unlock:
	mutex_unlock(&memcg_shrinker_map_mutex);

	return err;
}
/*
 * Makes sure every non-root memcg's shrinker maps can hold bit @new_id.
 *
 * The required size is @new_id + 1 bits rounded up to whole unsigned longs.
 * If that exceeds memcg_shrinker_map_size, all non-root memcgs are expanded
 * under memcg_shrinker_map_mutex, and the global size is updated unless an
 * expansion failed. Before root_mem_cgroup exists only the size is updated.
 *
 * Returns 0 on success or the error of the failing expansion.
 */
int memcg_expand_shrinker_maps(int new_id)
{
	struct mem_cgroup *iter;
	int want_bytes, cur_bytes;
	int err = 0;

	want_bytes = DIV_ROUND_UP(new_id + 1, BITS_PER_LONG) * sizeof(unsigned long);
	cur_bytes = memcg_shrinker_map_size;
	if (want_bytes <= cur_bytes)
		return 0;

	mutex_lock(&memcg_shrinker_map_mutex);
	if (root_mem_cgroup) {
		for_each_mem_cgroup(iter) {
			if (mem_cgroup_is_root(iter))
				continue;

			err = memcg_expand_one_shrinker_map(iter, want_bytes,
							    cur_bytes);
			if (err) {
				/* Leaving the walk early: drop the iterator's ref. */
				mem_cgroup_iter_break(NULL, iter);
				goto unlock;
			}
		}
	}
unlock:
	if (!err)
		memcg_shrinker_map_size = want_bytes;
	mutex_unlock(&memcg_shrinker_map_mutex);

	return err;
}
/*
 * Sets bit @shrinker_id in the shrinker map of @memcg on node @nid.
 *
 * Nothing happens for a negative @shrinker_id, a NULL @memcg or the root
 * memcg. The map is looked up and modified under rcu_read_lock().
 */
void memcg_set_shrinker_bit(struct mem_cgroup *memcg, int nid, int shrinker_id)
{
	struct memcg_shrinker_map *map;

	if (shrinker_id < 0 || !memcg || mem_cgroup_is_root(memcg))
		return;

	rcu_read_lock();
	map = rcu_dereference(memcg->nodeinfo[nid]->shrinker_map);
	/* Pairs with smp mb in shrink_slab() */
	smp_mb__before_atomic();
	set_bit(shrinker_id, map->map);
	rcu_read_unlock();
}
/**
 * mem_cgroup_css_from_page - css of the memcg associated with a page
 * @page: page of interest
 *
 * Returns the css of the memcg recorded in @page->mem_cgroup, or the css of
 * root_mem_cgroup when the page has no memcg recorded.
 *
 * NOTE(review): this function does not check whether the memory controller
 * is on the default hierarchy, so the same lookup is done for traditional
 * hierarchies too - confirm that this is intended.
 */
struct cgroup_subsys_state *mem_cgroup_css_from_page(struct page *page)
{
	struct mem_cgroup *owner = page->mem_cgroup;

	return owner ? &owner->css : &root_mem_cgroup->css;
}
/**
 * page_cgroup_ino - return inode number of the memcg a page is charged to
 * @page: the page
 *
 * Walks from the memcg recorded in @page up through its ancestors until an
 * online one is found and returns that cgroup's inode number, or 0 if there
 * is none. It is safe to call this function without holding a reference to
 * @page.
 *
 * Note, this function is inherently racy, because there is nothing to prevent
 * the cgroup inode from getting torn down and potentially reallocated a moment
 * after page_cgroup_ino() returns, so it only should be used by callers that
 * do not care (such as procfs interfaces).
 */
ino_t page_cgroup_ino(struct page *page)
{
	struct mem_cgroup *owner;
	unsigned long inode_nr = 0;

	rcu_read_lock();
	owner = page->mem_cgroup;

	/*
	 * A set lowest bit marks the value as an obj_cgroups pointer rather
	 * than a memcg pointer: the page is shared and does not belong to
	 * any specific memory cgroup.
	 */
	if ((unsigned long)owner & 0x1UL)
		owner = NULL;

	for (; owner; owner = parent_mem_cgroup(owner)) {
		if (owner->css.flags & CSS_ONLINE) {
			inode_nr = cgroup_ino(owner->css.cgroup);
			break;
		}
	}
	rcu_read_unlock();

	return inode_nr;
}
/* Returns @memcg's per-node info for the node that @page resides on. */
static struct mem_cgroup_per_node *
mem_cgroup_page_nodeinfo(struct mem_cgroup *memcg, struct page *page)
{
	return memcg->nodeinfo[page_to_nid(page)];
}
/* Returns the soft limit tree of node @nid (the slot may be NULL). */
static struct mem_cgroup_tree_per_node *
soft_limit_tree_node(int nid)
{
	struct mem_cgroup_tree_per_node *tree;

	tree = soft_limit_tree.rb_tree_per_node[nid];
	return tree;
}
/* Returns the soft limit tree of the node that @page resides on. */
static struct mem_cgroup_tree_per_node *
soft_limit_tree_from_page(struct page *page)
{
	return soft_limit_tree.rb_tree_per_node[page_to_nid(page)];
}
/*
 * Inserts @mz into the soft limit tree @mctz, keyed by @new_usage_in_excess.
 *
 * Nothing is done if @mz is already on the tree. Otherwise
 * mz->usage_in_excess is updated first and, if it is zero, @mz stays off
 * the tree. Entries with an equal excess are placed to the right of the
 * existing ones. The cached rightmost node is updated when the new entry
 * ends up rightmost. All callers in this file hold mctz->lock.
 */
static void __mem_cgroup_insert_exceeded(struct mem_cgroup_per_node *mz,
					 struct mem_cgroup_tree_per_node *mctz,
					 unsigned long new_usage_in_excess)
{
	struct rb_node **link = &mctz->rb_root.rb_node;
	struct rb_node *parent = NULL;
	bool is_rightmost = true;

	if (mz->on_tree)
		return;

	mz->usage_in_excess = new_usage_in_excess;
	if (!mz->usage_in_excess)
		return;

	while (*link) {
		struct mem_cgroup_per_node *cur;

		parent = *link;
		cur = rb_entry(parent, struct mem_cgroup_per_node, tree_node);
		if (mz->usage_in_excess < cur->usage_in_excess) {
			link = &parent->rb_left;
			is_rightmost = false;
		} else {
			/*
			 * We can't avoid mem cgroups that are over their
			 * soft limit by the same amount: ties go right.
			 */
			link = &parent->rb_right;
		}
	}

	if (is_rightmost)
		mctz->rb_rightmost = &mz->tree_node;

	rb_link_node(&mz->tree_node, parent, link);
	rb_insert_color(&mz->tree_node, &mctz->rb_root);
	mz->on_tree = true;
}
/*
 * Removes @mz from the soft limit tree @mctz if it is on the tree. When @mz
 * is the cached rightmost node, the cache moves to its predecessor first.
 * All callers in this file hold mctz->lock.
 */
static void __mem_cgroup_remove_exceeded(struct mem_cgroup_per_node *mz,
					 struct mem_cgroup_tree_per_node *mctz)
{
	struct rb_node *node = &mz->tree_node;

	if (!mz->on_tree)
		return;

	if (mctz->rb_rightmost == node)
		mctz->rb_rightmost = rb_prev(node);

	rb_erase(node, &mctz->rb_root);
	mz->on_tree = false;
}
/*
 * Locked variant of __mem_cgroup_remove_exceeded(): takes mctz->lock with
 * interrupts saved and disabled around the removal.
 */
static void mem_cgroup_remove_exceeded(struct mem_cgroup_per_node *mz,
				       struct mem_cgroup_tree_per_node *mctz)
{
	unsigned long irq_flags;

	spin_lock_irqsave(&mctz->lock, irq_flags);
	__mem_cgroup_remove_exceeded(mz, mctz);
	spin_unlock_irqrestore(&mctz->lock, irq_flags);
}
/*
 * Returns by how many pages @memcg's memory counter exceeds its soft limit,
 * or 0 if the usage is at or below the limit.
 */
static unsigned long soft_limit_excess(struct mem_cgroup *memcg)
{
	unsigned long usage = page_counter_read(&memcg->memory);
	unsigned long limit = READ_ONCE(memcg->soft_limit);

	return usage > limit ? usage - limit : 0;
}
/*
 * Re-positions @memcg and all of its ancestors in the soft limit tree of
 * the node that @page resides on, according to their current excess.
 * Does nothing if that node has no soft limit tree.
 */
static void mem_cgroup_update_tree(struct mem_cgroup *memcg, struct page *page)
{
	struct mem_cgroup_tree_per_node *mctz = soft_limit_tree_from_page(page);
	struct mem_cgroup_per_node *mz;
	unsigned long excess;
	unsigned long irq_flags;

	if (!mctz)
		return;

	/*
	 * Necessary to update all ancestors when hierarchy is used,
	 * because their event counter is not touched.
	 */
	for (; memcg; memcg = parent_mem_cgroup(memcg)) {
		mz = mem_cgroup_page_nodeinfo(memcg, page);
		excess = soft_limit_excess(memcg);

		/*
		 * The tree only needs updating if mz is on it or the memcg
		 * is over its soft limit.
		 */
		if (!excess && !mz->on_tree)
			continue;

		spin_lock_irqsave(&mctz->lock, irq_flags);
		/* if on-tree, remove it */
		if (mz->on_tree)
			__mem_cgroup_remove_exceeded(mz, mctz);
		/*
		 * Insert again. mz->usage_in_excess will be updated.
		 * If excess is 0, no tree ops.
		 */
		__mem_cgroup_insert_exceeded(mz, mctz, excess);
		spin_unlock_irqrestore(&mctz->lock, irq_flags);
	}
}
/*
 * Takes @memcg's per-node entries off the soft limit tree of every node
 * that has such a tree.
 */
static void mem_cgroup_remove_from_trees(struct mem_cgroup *memcg)
{
	int nid;

	for_each_node(nid) {
		struct mem_cgroup_per_node *mz = mem_cgroup_nodeinfo(memcg, nid);
		struct mem_cgroup_tree_per_node *mctz = soft_limit_tree_node(nid);

		if (!mctz)
			continue;

		mem_cgroup_remove_exceeded(mz, mctz);
	}
}
/*
 * Pops the rightmost (largest excess) entry off @mctz and returns it with a
 * css reference held on its memcg. Entries whose memcg no longer exceeds
 * its soft limit, or whose css cannot be pinned online, are dropped from
 * the tree and the next rightmost entry is tried.
 *
 * Returns NULL when the tree runs empty. The caller in this file holds
 * mctz->lock.
 */
static struct mem_cgroup_per_node *
__mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_node *mctz)
{
	struct mem_cgroup_per_node *candidate;

	for (;;) {
		if (!mctz->rb_rightmost)
			return NULL;	/* Nothing to reclaim from */

		candidate = rb_entry(mctz->rb_rightmost,
				     struct mem_cgroup_per_node, tree_node);
		/*
		 * Remove the node now, but someone else can add it back;
		 * it will be added back at the end of reclaim to its
		 * correct position in the tree.
		 */
		__mem_cgroup_remove_exceeded(candidate, mctz);

		if (soft_limit_excess(candidate->memcg) &&
		    css_tryget_online(&candidate->memcg->css))
			return candidate;
	}
}
/*
 * Locked variant of __mem_cgroup_largest_soft_limit_node(): runs the lookup
 * under mctz->lock with interrupts disabled.
 */
static struct mem_cgroup_per_node *
mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_node *mctz)
{
	struct mem_cgroup_per_node *largest;

	spin_lock_irq(&mctz->lock);
	largest = __mem_cgroup_largest_soft_limit_node(mctz);
	spin_unlock_irq(&mctz->lock);

	return largest;
}
/**
 * __mod_memcg_state - update cgroup memory statistics
 * @memcg: the memory cgroup
 * @idx: the stat item - can be enum memcg_stat_item or enum node_stat_item
 * @val: delta to add to the counter, can be negative
 *
 * The delta is accumulated in a per-cpu counter. Once its magnitude exceeds
 * MEMCG_CHARGE_BATCH (scaled by PAGE_SHIFT for byte-sized items) it is
 * flushed to the local counter and to the atomic counters of @memcg and
 * all of its ancestors. With CONFIG_MEM_QOS, NR_FILE_PAGES updates are
 * additionally mirrored into the memcg's pagecache page counter.
 */
void __mod_memcg_state(struct mem_cgroup *memcg, int idx, int val)
{
	long batch = MEMCG_CHARGE_BATCH;
	long pending;

	if (mem_cgroup_disabled())
		return;

	if (vmstat_item_in_bytes(idx))
		batch <<= PAGE_SHIFT;

	pending = val + __this_cpu_read(memcg->vmstats_percpu->stat[idx]);
	if (unlikely(abs(pending) > batch)) {
		struct mem_cgroup *ancestor = memcg;

		/*
		 * Batch local counters to keep them in sync with
		 * the hierarchical ones.
		 */
		__this_cpu_add(memcg->vmstats_local->stat[idx], pending);
		while (ancestor) {
			atomic_long_add(pending, &ancestor->vmstats[idx]);
			ancestor = parent_mem_cgroup(ancestor);
		}
		pending = 0;
	}
	__this_cpu_write(memcg->vmstats_percpu->stat[idx], pending);

#ifdef CONFIG_MEM_QOS
	if (idx == NR_FILE_PAGES) {
		if (val > 0)
			page_counter_charge(&memcg->pagecache, val);
		else
			page_counter_uncharge(&memcg->pagecache, -val);
	}
#endif
}
/*
 * Returns the per-node info for node @nid of the parent of @pn's memcg, or
 * NULL when that memcg has no parent.
 */
static struct mem_cgroup_per_node *
parent_nodeinfo(struct mem_cgroup_per_node *pn, int nid)
{
	struct mem_cgroup *up = parent_mem_cgroup(pn->memcg);

	return up ? mem_cgroup_nodeinfo(up, nid) : NULL;
}
/*
 * Updates the memcg and lruvec statistics for stat item @idx by @val.
 *
 * The memcg counters are updated through __mod_memcg_state(). The lruvec
 * delta is added to the local per-cpu counter right away and batched in a
 * second per-cpu counter, which is flushed to the atomic counters of this
 * per-node info and those of all ancestors once its magnitude exceeds
 * MEMCG_CHARGE_BATCH (scaled by PAGE_SHIFT for byte-sized items).
 */
void __mod_memcg_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx,
			      int val)
{
	struct mem_cgroup_per_node *pn =
		container_of(lruvec, struct mem_cgroup_per_node, lruvec);
	long batch = MEMCG_CHARGE_BATCH;
	long pending;

	/* Update memcg */
	__mod_memcg_state(pn->memcg, idx, val);

	/* Update lruvec */
	__this_cpu_add(pn->lruvec_stat_local->count[idx], val);

	if (vmstat_item_in_bytes(idx))
		batch <<= PAGE_SHIFT;

	pending = val + __this_cpu_read(pn->lruvec_stat_cpu->count[idx]);
	if (unlikely(abs(pending) > batch)) {
		pg_data_t *pgdat = lruvec_pgdat(lruvec);
		struct mem_cgroup_per_node *level = pn;

		while (level) {
			atomic_long_add(pending, &level->lruvec_stat[idx]);
			level = parent_nodeinfo(level, pgdat->node_id);
		}
		pending = 0;
	}
	__this_cpu_write(pn->lruvec_stat_cpu->count[idx], pending);
}
/**
 * __mod_lruvec_state - update lruvec memory statistics
 * @lruvec: the lruvec
 * @idx: the stat item
 * @val: delta to add to the counter, can be negative
 *
 * The lruvec is the intersection of the NUMA node and a cgroup. This
 * function updates all three counters that are affected by a change of
 * state at this level: per-node, per-cgroup, per-lruvec. The latter two
 * are skipped when the memory controller is disabled.
 */
void __mod_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx,
			int val)
{
	/* Update node */
	__mod_node_page_state(lruvec_pgdat(lruvec), idx, val);

	if (mem_cgroup_disabled())
		return;

	/* Update memcg and lruvec */
	__mod_memcg_lruvec_state(lruvec, idx, val);
}
/*
 * Updates stat item @idx by @val for the object at address @p.
 *
 * The memcg is looked up from the object under rcu_read_lock(). Objects
 * without a memcg have no lruvec either, so only the node counter of the
 * object's page is updated for them. Any other object goes through
 * __mod_lruvec_state(); the root memcg is not special-cased here.
 */
void __mod_lruvec_slab_state(void *p, enum node_stat_item idx, int val)
{
	pg_data_t *pgdat = page_pgdat(virt_to_page(p));
	struct mem_cgroup *owner;

	rcu_read_lock();
	owner = mem_cgroup_from_obj(p);
	if (owner)
		__mod_lruvec_state(mem_cgroup_lruvec(owner, pgdat), idx, val);
	else
		__mod_node_page_state(pgdat, idx, val);
	rcu_read_unlock();
}
/*
 * Updates memcg stat item @idx by @val for the memcg that the object at
 * address @p belongs to; objects without a memcg are ignored. The lookup
 * and the update happen under rcu_read_lock().
 */
void mod_memcg_obj_state(void *p, int idx, int val)
{
	struct mem_cgroup *owner;

	rcu_read_lock();
	owner = mem_cgroup_from_obj(p);
	if (owner)
		mod_memcg_state(owner, idx, val);
	rcu_read_unlock();
}
/*
 * Updates lruvec stat item @idx by @nr for the lruvec formed by @objcg's
 * memcg and @pgdat.
 *
 * mod_objcg_mlstate() may be called with irq enabled, so
 * mod_memcg_lruvec_state() should be used.
 */
static inline void mod_objcg_mlstate(struct obj_cgroup *objcg,
				     struct pglist_data *pgdat,
				     enum node_stat_item idx, int nr)
{
	struct lruvec *target;

	rcu_read_lock();
	target = mem_cgroup_lruvec(obj_cgroup_memcg(objcg), pgdat);
	mod_memcg_lruvec_state(target, idx, nr);
	rcu_read_unlock();
}
/**
 * __count_memcg_events - account VM events in a cgroup
 * @memcg: the memory cgroup
 * @idx: the event item
 * @count: the number of events that occurred
 *
 * Events are batched per cpu and flushed to the local counter and to the
 * atomic counters of @memcg and all of its ancestors once more than
 * MEMCG_CHARGE_BATCH of them are pending.
 */
void __count_memcg_events(struct mem_cgroup *memcg, enum vm_event_item idx,
			  unsigned long count)
{
	unsigned long pending;

	if (mem_cgroup_disabled())
		return;

	pending = count + __this_cpu_read(memcg->vmstats_percpu->events[idx]);
	if (unlikely(pending > MEMCG_CHARGE_BATCH)) {
		struct mem_cgroup *ancestor = memcg;

		/*
		 * Batch local counters to keep them in sync with
		 * the hierarchical ones.
		 */
		__this_cpu_add(memcg->vmstats_local->events[idx], pending);
		while (ancestor) {
			atomic_long_add(pending, &ancestor->vmevents[idx]);
			ancestor = parent_mem_cgroup(ancestor);
		}
		pending = 0;
	}
	__this_cpu_write(memcg->vmstats_percpu->events[idx], pending);
}
/* Returns the flushed hierarchical count of @event for @memcg. */
static unsigned long memcg_events(struct mem_cgroup *memcg, int event)
{
	unsigned long total = atomic_long_read(&memcg->vmevents[event]);

	return total;
}
static unsigned long memcg_events_local(struct mem_cgroup *memcg, int event)
{
long x = 0;
int cpu;
for_each_possible_cpu(cpu)
x += per_cpu(memcg->vmstats_local->events[event], cpu);
return x;
}
static void mem_cgroup_charge_statistics(struct mem_cgroup *memcg,
int nr_pages)
{
/* pagein of a big page is an event. So, ignore page size */
if (nr_pages > 0)
__count_memcg_events(memcg, PGPGIN, 1);
else {
__count_memcg_events(memcg, PGPGOUT, 1);
nr_pages = -nr_pages; /* for event */
}
__this_cpu_add(memcg->vmstats_percpu->nr_page_events, nr_pages);
}
/*
 * Checks whether the per-cpu page event counter of @memcg has passed the
 * stored target for @target. If so, the target is moved forward by the
 * interval belonging to @target and true is returned; otherwise the target
 * is left alone and false is returned.
 */
static bool mem_cgroup_event_ratelimit(struct mem_cgroup *memcg,
				       enum mem_cgroup_events_target target)
{
	unsigned long seen, deadline;

	seen = __this_cpu_read(memcg->vmstats_percpu->nr_page_events);
	deadline = __this_cpu_read(memcg->vmstats_percpu->targets[target]);

	/* from time_after() in jiffies.h */
	if ((long)(deadline - seen) >= 0)
		return false;

	switch (target) {
	case MEM_CGROUP_TARGET_THRESH:
		deadline = seen + THRESHOLDS_EVENTS_TARGET;
		break;
	case MEM_CGROUP_TARGET_SOFTLIMIT:
		deadline = seen + SOFTLIMIT_EVENTS_TARGET;
		break;
	case MEM_CGROUP_TARGET_NUMAINFO:
		deadline = seen + NUMAINFO_EVENTS_TARGET;
		break;
	default:
		break;
	}
	__this_cpu_write(memcg->vmstats_percpu->targets[target], deadline);

	return true;
}
/*
 * Check events in order.
 *
 * Only when the threshold rate limit fires are the soft limit and (with
 * more than one possible node) NUMA info rate limits consulted. The
 * thresholds are then evaluated, the soft limit tree is updated and the
 * NUMA info event counter is bumped as required.
 */
static void memcg_check_events(struct mem_cgroup *memcg, struct page *page)
{
	bool softlimit_due;
	bool numainfo_due __maybe_unused;

	/* threshold event is triggered in finer grain than soft limit */
	if (likely(!mem_cgroup_event_ratelimit(memcg,
					       MEM_CGROUP_TARGET_THRESH)))
		return;

	softlimit_due = mem_cgroup_event_ratelimit(memcg,
					MEM_CGROUP_TARGET_SOFTLIMIT);
#if MAX_NUMNODES > 1
	numainfo_due = mem_cgroup_event_ratelimit(memcg,
					MEM_CGROUP_TARGET_NUMAINFO);
#endif
	mem_cgroup_threshold(memcg);

	if (unlikely(softlimit_due))
		mem_cgroup_update_tree(memcg, page);
#if MAX_NUMNODES > 1
	if (unlikely(numainfo_due))
		atomic_inc(&memcg->numainfo_events);
#endif
}
/*
 * Returns the memcg that task @p belongs to, or NULL when @p is NULL.
 *
 * mm_update_next_owner() may clear mm->owner to NULL if it races with
 * swapoff, page migration, etc., so this can be called with p == NULL.
 */
struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p)
{
	return unlikely(!p) ? NULL :
		mem_cgroup_from_css(task_css(p, memory_cgrp_id));
}
EXPORT_SYMBOL(mem_cgroup_from_task);
/**
 * get_mem_cgroup_from_mm: Obtain a reference on given mm_struct's memcg.
 * @mm: mm from which memcg should be extracted. It can be NULL.
 *
 * Returns the memcg of @mm's owner with a css reference held. If @mm is
 * NULL or no memcg can be derived from its owner, root_mem_cgroup is used
 * instead. The lookup is retried until a reference is obtained. If the
 * memory controller is disabled, NULL is returned.
 */
struct mem_cgroup *get_mem_cgroup_from_mm(struct mm_struct *mm)
{
	struct mem_cgroup *memcg;

	if (mem_cgroup_disabled())
		return NULL;

	rcu_read_lock();
	for (;;) {
		/*
		 * Page cache insertions can happen without an
		 * actual mm context, e.g. during disk probing
		 * on boot, loopback IO, acct() writes etc.
		 */
		memcg = NULL;
		if (likely(mm))
			memcg = mem_cgroup_from_task(rcu_dereference(mm->owner));
		if (unlikely(!memcg))
			memcg = root_mem_cgroup;

		if (css_tryget(&memcg->css))
			break;
	}
	rcu_read_unlock();

	return memcg;
}
EXPORT_SYMBOL(get_mem_cgroup_from_mm);
/**
 * get_mem_cgroup_from_page: Obtain a reference on given page's memcg.
 * @page: page from which memcg should be extracted.
 *
 * Returns the memcg recorded in @page with a css reference held if that
 * memcg exists and is online; otherwise root_mem_cgroup is returned.
 * Returns NULL if the memory controller is disabled.
 */
struct mem_cgroup *get_mem_cgroup_from_page(struct page *page)
{
	struct mem_cgroup *owner = page->mem_cgroup;

	if (mem_cgroup_disabled())
		return NULL;

	rcu_read_lock();
	if (!(owner && css_tryget_online(&owner->css)))
		owner = root_mem_cgroup;
	rcu_read_unlock();

	return owner;
}
EXPORT_SYMBOL(get_mem_cgroup_from_page);
/*
 * Returns the memcg to charge for the current task.
 *
 * If current->active_memcg is non-NULL, do not fall back to
 * current->mm->memcg: the active memcg is returned when an online css
 * reference can be taken on it, root_mem_cgroup otherwise. Without an
 * active memcg the result of get_mem_cgroup_from_mm(current->mm) is used.
 */
static __always_inline struct mem_cgroup *get_mem_cgroup_from_current(void)
{
	struct mem_cgroup *memcg;

	if (likely(!current->active_memcg))
		return get_mem_cgroup_from_mm(current->mm);

	memcg = root_mem_cgroup;
	rcu_read_lock();
	if (css_tryget_online(&current->active_memcg->css))
		memcg = current->active_memcg;
	rcu_read_unlock();

	return memcg;
}
/**
 * mem_cgroup_iter - iterate over memory cgroup hierarchy
 * @root: hierarchy root
 * @prev: previously returned memcg, NULL on first invocation
 * @reclaim: cookie for shared reclaim walks, NULL for full walks
 *
 * Returns references to children of the hierarchy below @root, or
 * @root itself, or %NULL after a full round-trip.
 *
 * Caller must pass the return value in @prev on subsequent
 * invocations for reference counting, or use mem_cgroup_iter_break()
 * to cancel a hierarchy walk before the round-trip is complete.
 *
 * Reclaimers can specify a node and a priority level in @reclaim to
 * divide up the memcgs in the hierarchy among all concurrent
 * reclaimers operating on the same node and priority.
 *
 * A NULL @root means root_mem_cgroup. Returns %NULL if the memory
 * controller is disabled.
 */
struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root,
struct mem_cgroup *prev,
struct mem_cgroup_reclaim_cookie *reclaim)
{
struct mem_cgroup_reclaim_iter *uninitialized_var(iter);
struct cgroup_subsys_state *css = NULL;
struct mem_cgroup *memcg = NULL;
struct mem_cgroup *pos = NULL;
if (mem_cgroup_disabled())
return NULL;
if (!root)
root = root_mem_cgroup;
/* Full walks resume right after the previously returned memcg. */
if (prev && !reclaim)
pos = prev;
/*
 * A non-hierarchical, non-root @root is walked as a single element:
 * it is returned once, and the follow-up call ends the walk.
 */
if (!root->use_hierarchy && root != root_mem_cgroup) {
if (prev)
goto out;
return root;
}
rcu_read_lock();
if (reclaim) {
struct mem_cgroup_per_node *mz;
/* Shared walks resume from the per-node, per-priority iterator. */
mz = mem_cgroup_nodeinfo(root, reclaim->pgdat->node_id);
iter = &mz->iter[reclaim->priority];
/* The shared walk moved on to another generation: end this walk. */
if (prev && reclaim->generation != iter->generation)
goto out_unlock;
while (1) {
pos = READ_ONCE(iter->position);
if (!pos || css_tryget(&pos->css))
break;
/*
 * css reference reached zero, so iter->position will
 * be cleared by ->css_released. However, we should not
 * rely on this happening soon, because ->css_released
 * is called from a work queue, and by busy-waiting we
 * might block it. So we clear iter->position right
 * away.
 */
(void)cmpxchg(&iter->position, pos, NULL);
}
}
if (pos)
css = &pos->css;
for (;;) {
css = css_next_descendant_pre(css, &root->css);
if (!css) {
/*
 * Reclaimers share the hierarchy walk, and a
 * new one might jump in right at the end of
 * the hierarchy - make sure they see at least
 * one group and restart from the beginning.
 */
if (!prev)
continue;
break;
}
/*
 * Verify the css and acquire a reference. The root
 * is provided by the caller, so we know it's alive
 * and kicking, and don't take an extra reference.
 */
memcg = mem_cgroup_from_css(css);
if (css == &root->css)
break;
if (css_tryget(css))
break;
/* The reference could not be taken: skip this css and keep looking. */
memcg = NULL;
}
if (reclaim) {
/*
 * The position could have already been updated by a competing
 * thread, so check that the value hasn't changed since we read
 * it to avoid reclaiming from the same cgroup twice.
 */
(void)cmpxchg(&iter->position, pos, memcg);
/* Drop the reference taken on the old position above. */
if (pos)
css_put(&pos->css);
/* A NULL result ends a round-trip: start a new generation. */
if (!memcg)
iter->generation++;
else if (!prev)
reclaim->generation = iter->generation;
}
out_unlock:
rcu_read_unlock();
out:
/* The reference on @prev is always consumed, except for @root itself. */
if (prev && prev != root)
css_put(&prev->css);
return memcg;
}
/**
* mem_cgroup_iter_break - abort a hierarchy walk prematurely
* @root: hierarchy root
* @prev: last visited hierarchy member as returned by mem_cgroup_iter()
*/
void mem_cgroup_iter_break(struct mem_cgroup *root,
			   struct mem_cgroup *prev)
{
	/* A NULL @root stands for the global root_mem_cgroup. */
	struct mem_cgroup *top = root ? root : root_mem_cgroup;

	/* Nothing was visited yet, so there is no reference to release. */
	if (!prev)
		return;

	/* The hierarchy root itself is never released here. */
	if (prev != top)
		css_put(&prev->css);
}
/*
 * Clear every per-node, per-priority reclaim iterator of @from whose cached
 * position is @dead_memcg. Iterators pointing elsewhere are left untouched
 * because the update is a compare-and-exchange against @dead_memcg.
 */
static void __invalidate_reclaim_iterators(struct mem_cgroup *from,
					   struct mem_cgroup *dead_memcg)
{
	int node;

	for_each_node(node) {
		struct mem_cgroup_per_node *pn = mem_cgroup_nodeinfo(from, node);
		int prio = 0;

		while (prio <= DEF_PRIORITY) {
			cmpxchg(&pn->iter[prio].position, dead_memcg, NULL);
			prio++;
		}
	}
}
/*
 * Drop @dead_memcg from the reclaim iterators of itself and of every
 * ancestor reachable through parent_mem_cgroup().
 */
static void invalidate_reclaim_iterators(struct mem_cgroup *dead_memcg)
{
	struct mem_cgroup *cur = dead_memcg;
	struct mem_cgroup *topmost;

	for (;;) {
		struct mem_cgroup *up;

		__invalidate_reclaim_iterators(cur, dead_memcg);
		topmost = cur;

		up = parent_mem_cgroup(cur);
		if (!up)
			break;
		cur = up;
	}

	/*
	 * When cgroup1 non-hierarchy mode is used,
	 * parent_mem_cgroup() does not walk all the way up to the
	 * cgroup root (root_mem_cgroup). So we have to handle
	 * dead_memcg from cgroup root separately.
	 */
	if (topmost != root_mem_cgroup)
		__invalidate_reclaim_iterators(root_mem_cgroup, dead_memcg);
}
/*
* Test whether @memcg has children, dead or alive. Note that this
* function doesn't care whether @memcg has use_hierarchy enabled and
* returns %true if there are child csses according to the cgroup
* hierarchy. Testing use_hierarchy is the caller's responsibility.
*/
static inline bool memcg_has_children(struct mem_cgroup *memcg)
{
	bool has_child;

	/* The child lookup is done inside an RCU read-side section. */
	rcu_read_lock();
	has_child = css_next_child(NULL, &memcg->css) != NULL;
	rcu_read_unlock();

	return has_child;
}
#ifdef CONFIG_MEM_QOS
/* memcg oom priority */
/*
* do_mem_cgroup_account_oom_skip - account the memcg with OOM-unkillable task
* @memcg: mem_cgroup struct with OOM-unkillable task
* @oc: oom_control struct
*
* Account an OOM-unkillable task to its cgroup and to every ancestor up to
* the OOMing cgroup via @num_oom_skip. If all the tasks of a cgroup hierarchy
* are OOM-unkillable, we skip that hierarchy when selecting the victim cgroup.
*
* The @num_oom_skip must be reset when bad process selection has finished,
* since before the next round bad process selection, these OOM-unkillable
* tasks might become killable.
*
*/
static void do_mem_cgroup_account_oom_skip(struct mem_cgroup *memcg,
					   struct oom_control *oc)
{
	struct mem_cgroup *stop;
	struct cgroup_subsys_state *pos;

	/* Only meaningful for priority based selection with a valid memcg. */
	if (!oc->priority_select || unlikely(!memcg))
		return;

	/* The walk ends at the OOMing cgroup, or at the root for global OOM. */
	stop = oc->memcg ? oc->memcg : root_mem_cgroup;

	for (pos = &memcg->css; pos; pos = pos->parent) {
		struct mem_cgroup *cur = mem_cgroup_from_css(pos);

		cur->num_oom_skip++;

		/*
		 * Chain the group onto oc->reset_list, pinned by a css
		 * reference, so that resetting @num_oom_skip later only has
		 * to visit the groups that were actually touched.
		 */
		if (!cur->next_reset) {
			css_get(&cur->css);
			cur->next_reset = oc->reset_list;
			oc->reset_list = cur;
		}

		if (cur == stop)
			break;
	}
}
/*
 * Account @task as OOM-unkillable against the memory cgroup it belongs to
 * (see do_mem_cgroup_account_oom_skip()).
 */
void mem_cgroup_account_oom_skip(struct task_struct *task,
				 struct oom_control *oc)
{
	struct mem_cgroup *task_memcg = mem_cgroup_from_task(task);

	do_mem_cgroup_account_oom_skip(task_memcg, oc);
}
/*
* __mem_cgroup_select_victim - select the victim memcg based on the base_prio
* @parent: corresponding cgroup_subsys_state struct of memcg
* @base_prio: lowest priority.
*
* Note:
* a. Rules of comparison: priority first, then page counter
* b. The smaller the number, the higher the priority
*
*/
static struct cgroup_subsys_state *
__mem_cgroup_select_victim(struct cgroup_subsys_state *parent,
int base_prio)
{
struct cgroup_subsys_state *chosen = NULL;
int chosen_priority;
struct mem_cgroup *iter, *memcg, *chosen_memcg;
/* no proc or all unkillable */
if (!parent->nr_procs ||
parent->nr_procs <= mem_cgroup_from_css(parent)->num_oom_skip)
return NULL;
chosen_priority = base_prio;
/* chosen = parent when all tasks are in parent */
if (cgroup_priority(parent) >= chosen_priority)
chosen = parent;
memcg = mem_cgroup_from_css(parent);
for_each_mem_cgroup_tree(iter, memcg) {
struct cgroup_subsys_state *css = &iter->css;
int prio = cgroup_priority(css);
if (css->nr_procs <= iter->num_oom_skip)
continue;
if (prio < chosen_priority)
continue;
else if (prio > chosen_priority || !chosen) {
chosen_priority = prio;
chosen = css;
continue;
}
chosen_memcg = mem_cgroup_from_css(chosen);
/* equal priority check memory usage */
if (do_memsw_account()) {
if (page_counter_read(&iter->memsw) >
page_counter_read(&chosen_memcg->memsw))
chosen = css;
} else if (page_counter_read(&iter->memory) >
page_counter_read(&chosen_memcg->memory)) {
chosen = css;
}
}
return chosen;
}
static struct mem_cgroup *
mem_cgroup_select_victim_cgroup(struct mem_cgroup *memcg, int base_prio)
{
struct cgroup_subsys_state *parent, *victim;
/* if priority reclaim's target priority larger than max return null */
if (base_prio >= CGROUP_PRIORITY_MAX)
return NULL;
if (!memcg->use_hierarchy) {
/* memcg higher priority, return NULL */
if (cgroup_priority(&memcg->css) < base_prio)
return NULL;
css_get(&memcg->css);
return memcg;
}
again:
rcu_read_lock();
parent = &memcg->css;
victim = __mem_cgroup_select_victim(parent, base_prio);
if (likely(victim)) {
rcu_read_unlock();
if (!css_tryget(victim))
goto again;
return mem_cgroup_from_css(victim);
}
rcu_read_unlock();
return NULL;
}
#endif /* CONFIG_MEM_QOS */
/**
* mem_cgroup_scan_tasks - iterate over tasks of a memory cgroup hierarchy
* @memcg: hierarchy root
* @fn: function to call for each task
* @arg: argument passed to @fn
*
* This function iterates over tasks attached to @memcg or to any of its
* descendants and calls @fn for each task. If @fn returns a non-zero
* value, the function breaks the iteration loop and returns the value.
* Otherwise, it will iterate over all tasks and return 0.
*
*/
int mem_cgroup_scan_tasks(struct mem_cgroup *memcg,
			  int (*fn)(struct task_struct *, void *), void *arg)
{
	struct mem_cgroup *pos;
	int rc = 0;

#ifndef CONFIG_MEM_QOS
	BUG_ON(memcg == root_mem_cgroup);
#endif

	for_each_mem_cgroup_tree(pos, memcg) {
		struct css_task_iter walker;
		struct task_struct *tsk;

		css_task_iter_start(&pos->css, CSS_TASK_ITER_PROCS, &walker);
		for (tsk = css_task_iter_next(&walker); tsk;
		     tsk = css_task_iter_next(&walker)) {
			rc = fn(tsk, arg);
			if (rc)
				break;
		}
		css_task_iter_end(&walker);

		if (!rc)
			continue;

		/* @fn asked to stop: end the hierarchy walk early. */
		mem_cgroup_iter_break(memcg, pos);
		break;
	}

	return rc;
}
#ifdef CONFIG_MEM_QOS
/*
* mem_cgroup_select_bad_process - evaluate tasks to find an OOM victim
* @oc: oom_control of the request; the result is left in oc->chosen
*
* Without oc->priority_select, every task under oc->memcg (root_mem_cgroup
* when oc->memcg is NULL) is passed to oom_evaluate_task(). With
* oc->priority_select, a victim cgroup is picked first through
* mem_cgroup_select_victim_cgroup() and only its tasks are evaluated; when
* no task was chosen there, that cgroup is accounted as skipped and the
* selection is retried.
*/
void mem_cgroup_select_bad_process(struct oom_control *oc)
{
struct mem_cgroup *memcg, *victim, *iter;
memcg = oc->memcg;
if (!memcg)
memcg = root_mem_cgroup;
victim = memcg;
retry:
if (oc->priority_select) {
/* On success the returned cgroup carries a css reference. */
victim = mem_cgroup_select_victim_cgroup(memcg,
oc->base_priority);
if (!victim) {
/* root_memcg and being killed with MMF_OOM_SKIP */
if (mem_cgroup_is_root(memcg) && oc->num_skip)
oc->chosen = (void *)-1UL;
goto out;
}
}
mem_cgroup_scan_tasks(victim, oom_evaluate_task, oc);
if (oc->priority_select) {
/* Drop the reference taken by mem_cgroup_select_victim_cgroup(). */
css_put(&victim->css);
if (oc->chosen == (void *)-1UL)
goto out;
if (!oc->chosen && victim != memcg) {
/*
* NOTE(review): victim is used here after its reference was
* dropped above - confirm something else keeps it alive.
*/
do_mem_cgroup_account_oom_skip(victim, oc);
goto retry;
}
}
out:
/* See comments in mem_cgroup_account_oom_skip() */
/* Unlink every queued cgroup, zero its skip count and drop its reference. */
while (oc->reset_list) {
iter = oc->reset_list;
iter->num_oom_skip = 0;
oc->reset_list = iter->next_reset;
iter->next_reset = NULL;
css_put(&iter->css);
}
}
static int memcg_get_prio(struct mem_cgroup *memcg);

/*
 * Entry point for OOM victim selection: derive the priority-selection
 * settings from the OOMing memcg (root_mem_cgroup for a global OOM) and run
 * mem_cgroup_select_bad_process().
 */
void mem_cgroup_oom_select_bad_process(struct oom_control *oc)
{
	struct mem_cgroup *target = oc->memcg ? oc->memcg : root_mem_cgroup;

	/* Priority based selection needs the global memory QoS switch. */
	if (sysctl_vm_memory_qos)
		oc->priority_select = target->use_priority_oom;
	else
		oc->priority_select = false;

	oc->base_priority = memcg_get_prio(target);
	mem_cgroup_select_bad_process(oc);
}
static void priority_kill_process(struct task_struct *victim,
struct oom_control *oc)
{
struct task_struct *p;
struct mm_struct *mm;
struct mem_cgroup *memcg;
p = find_lock_task_mm(victim);
if (!p) {
put_task_struct(victim);
return;
} else if (victim != p) {
get_task_struct(p);
put_task_struct(victim);
victim = p;
}
if ((victim->signal->flags & SIGNAL_GROUP_EXIT) || (thread_group_empty(victim) &&
(victim->flags & PF_EXITING))) {
task_unlock(victim);
put_task_struct(victim);
return;
}
/* Now we select [ victim ] to kill, just record it to mbuf */
memcg = mem_cgroup_from_task(victim);
mbuf_print(memcg->css.cgroup, "memqos: Killing process [ %s ] pid [ %d ] for memory reclaim",
victim->comm, victim->pid);
/* Get a reference to safely compare mm after task_unlock(victim) */
mm = victim->mm;
mmgrab(mm);
do_send_sig_info(SIGKILL, SEND_SIG_PRIV, victim, PIDTYPE_TGID);
victim->exit_timestamp = ktime_to_ns(ktime_get());
task_unlock(victim);
mmdrop(mm);
exp_thread_group_exit(victim);
put_task_struct(victim);
}
/*
 * mem_cgroup_scan_tasks() callback: kill @task. Always returns 0 so the
 * scan continues over the remaining tasks.
 */
static int priority_kill_memcg_member(struct task_struct *task, void *arg)
{
	struct oom_control *oc = arg;

	/* priority_kill_process() drops one task reference; provide it. */
	get_task_struct(task);
	priority_kill_process(task, oc);

	return 0;
}
/*
* mem_cgroup_priority_reclaim - kill a task selected by cgroup priority
* @oc: oom_control of the charging context; oc->memcg is the charging memcg
*
* Searches from root_mem_cgroup for a victim, using the priority of
* oc->memcg's cgroup plus one as the base priority, and kills either the
* chosen task or, when its memcg has oom_group set, every process of that
* memcg. Does nothing for the root memcg, for allocations without
* __GFP_DIRECT_RECLAIM, or when oom_lock is contended.
*
* NOTE(review): oc->memcg is overwritten with root_mem_cgroup and not
* restored before returning - confirm callers do not rely on it afterwards.
*/
void mem_cgroup_priority_reclaim(struct oom_control *oc)
{
struct mem_cgroup *memcg;
/* memcg priority is only used in non-root memory cgroup */
if (!oc->memcg || oc->memcg == root_mem_cgroup)
return;
/*
* Atomic memory allocation may be used in the interrupt context,
* the priority reclaim may take a long time. Therefore we skip
* this case.
*/
if (!(oc->gfp_mask & __GFP_DIRECT_RECLAIM))
return;
/* Do not wait for oom_lock: give up if it is already held. */
if (!mutex_trylock(&oom_lock))
return;
oc->priority_select = true;
oc->base_priority = oc->memcg->css.cgroup->priority + 1;
oc->memcg = root_mem_cgroup;
oc->chosen = NULL;
/* Select the bad process */
mem_cgroup_select_bad_process(oc);
/* (void *)-1UL is a marker value, not a task pointer. */
if (oc->chosen && oc->chosen != (void *)-1UL) {
rcu_read_lock();
memcg = mem_cgroup_from_task(oc->chosen);
if (memcg == root_mem_cgroup || !memcg) {
/* No usable memcg for the task: drop its reference and bail out. */
put_task_struct(oc->chosen);
rcu_read_unlock();
mutex_unlock(&oom_lock);
return;
}
/* Pin the memcg so it can be used after leaving the RCU section. */
css_get(&memcg->css);
rcu_read_unlock();
if (memcg->oom_group) {
/* Kill every process in the memcg, then release the chosen task. */
mem_cgroup_scan_tasks(memcg,
priority_kill_memcg_member, oc);
put_task_struct(oc->chosen);
} else {
/* priority_kill_process() drops the reference on oc->chosen. */
priority_kill_process(oc->chosen, oc);
}
mem_cgroup_put(memcg);
}
mutex_unlock(&oom_lock);
}
#endif
#ifdef CONFIG_DEBUG_VM
/*
 * Debug check: @lruvec must belong to the memcg @page is charged to, or to
 * root_mem_cgroup when the page has no memcg.
 */
void lruvec_memcg_debug(struct lruvec *lruvec, struct page *page)
{
	struct mem_cgroup *memcg;

	if (mem_cgroup_disabled())
		return;

	memcg = page_memcg(page);
	if (memcg) {
		VM_BUG_ON_PAGE(lruvec_memcg(lruvec) != memcg, page);
		return;
	}

	VM_BUG_ON_PAGE(lruvec_memcg(lruvec) != root_mem_cgroup, page);
}
#endif
/**
* mem_cgroup_page_lruvec - return lruvec for isolating/putting an LRU page
* @page: the page
* @pgdat: pgdat of the page
*
* This function is only safe when following the LRU page isolation
* and putback protocol: the LRU lock must be held, and the page must
* either be PageLRU() or the caller must have isolated/allocated it.
*/
struct lruvec *mem_cgroup_page_lruvec(struct page *page, struct pglist_data *pgdat)
{
	struct lruvec *lruvec;

	if (mem_cgroup_disabled()) {
		/* Without memcg there is only the node-wide lruvec. */
		lruvec = &pgdat->__lruvec;
	} else {
		struct mem_cgroup_per_node *mz;
		struct mem_cgroup *memcg = page_memcg(page);

		/* A page without a memcg is treated as owned by the root. */
		VM_WARN_ON_ONCE_PAGE(!memcg, page);
		if (!memcg)
			memcg = root_mem_cgroup;

		mz = mem_cgroup_page_nodeinfo(memcg, page);
		lruvec = &mz->lruvec;
	}

	/*
	 * Since a node can be onlined after the mem_cgroup was created,
	 * we have to be prepared to initialize lruvec->pgdat here;
	 * and if offlined then reonlined, we need to reinitialize it.
	 */
	if (unlikely(lruvec->pgdat != pgdat))
		lruvec->pgdat = pgdat;

	return lruvec;
}
/**
* lock_page_lruvec - lock and return lruvec for a given page.
* @page: the page
*
* This series of functions should be used under any of these conditions:
* PageLRU is cleared or unset
* or page->_refcount is zero
* or page is locked.
*/
struct lruvec *lock_page_lruvec(struct page *page)
{
	struct lruvec *lv;

	/* Look up and lock the lruvec within one RCU read-side section. */
	rcu_read_lock();
	lv = mem_cgroup_page_lruvec(page, page_pgdat(page));
	spin_lock(&lv->lru_lock);
	rcu_read_unlock();

	lruvec_memcg_debug(lv, page);

	return lv;
}
/* Same as lock_page_lruvec(), but takes lru_lock with spin_lock_irq(). */
struct lruvec *lock_page_lruvec_irq(struct page *page)
{
	struct lruvec *lv;

	/* Look up and lock the lruvec within one RCU read-side section. */
	rcu_read_lock();
	lv = mem_cgroup_page_lruvec(page, page_pgdat(page));
	spin_lock_irq(&lv->lru_lock);
	rcu_read_unlock();

	lruvec_memcg_debug(lv, page);

	return lv;
}
/*
 * Same as lock_page_lruvec(), but takes lru_lock with spin_lock_irqsave()
 * and stores the saved interrupt state in *@flags.
 */
struct lruvec *lock_page_lruvec_irqsave(struct page *page, unsigned long *flags)
{
	struct lruvec *lv;

	/* Look up and lock the lruvec within one RCU read-side section. */
	rcu_read_lock();
	lv = mem_cgroup_page_lruvec(page, page_pgdat(page));
	spin_lock_irqsave(&lv->lru_lock, *flags);
	rcu_read_unlock();

	lruvec_memcg_debug(lv, page);

	return lv;
}
/**
* mem_cgroup_update_lru_size - account for adding or removing an lru page
* @lruvec: mem_cgroup per zone lru vector
* @lru: index of lru list the page is sitting on
* @zid: zone id of the accounted pages
* @nr_pages: positive when adding or negative when removing
*
* This function must be called under lru_lock, just before a page is added
* to or just after a page is removed from an lru list (that ordering being
* so as to allow it to check that lru_size 0 is consistent with list_empty).
*/
void mem_cgroup_update_lru_size(struct lruvec *lruvec, enum lru_list lru,
				int zid, int nr_pages)
{
	struct mem_cgroup_per_node *pn;
	unsigned long *counter;
	long snapshot;

	if (mem_cgroup_disabled())
		return;

	pn = container_of(lruvec, struct mem_cgroup_per_node, lruvec);
	counter = &pn->lru_zone_size[zid][lru];

	/* Removals are applied before the sanity check ... */
	if (nr_pages < 0)
		*counter += nr_pages;

	snapshot = *counter;
	if (WARN_ONCE(snapshot < 0,
		      "%s(%p, %d, %d): lru_size %ld\n",
		      __func__, lruvec, lru, nr_pages, snapshot)) {
		VM_BUG_ON(1);
		*counter = 0;
	}

	/* ... and additions after it. */
	if (nr_pages > 0)
		*counter += nr_pages;
}
/**
* mem_cgroup_margin - calculate chargeable space of a memory cgroup
* @memcg: the memory cgroup
*
* Returns the maximum amount of memory @memcg can be charged with, in
* pages.
*/
static unsigned long mem_cgroup_margin(struct mem_cgroup *memcg)
{
	unsigned long usage = page_counter_read(&memcg->memory);
	unsigned long cap = READ_ONCE(memcg->memory.max);
	unsigned long room = usage < cap ? cap - usage : 0;

	if (!do_memsw_account())
		return room;

	/* With memsw accounting, the memory+swap counter bounds it too. */
	usage = page_counter_read(&memcg->memsw);
	cap = READ_ONCE(memcg->memsw.max);
	if (usage > cap)
		return 0;

	return min(room, cap - usage);
}
/*
* A routine for checking whether @memcg is under move_account() or not.
*
* Checking a cgroup is mc.from or mc.to or under hierarchy of
* moving cgroups. This is for waiting at high-memory pressure
* caused by "move".
*/
static bool mem_cgroup_under_move(struct mem_cgroup *memcg)
{
	struct mem_cgroup *src, *dst;
	bool moving = false;

	/*
	 * Unlike task_move routines, we access mc.to, mc.from not under
	 * mutual exclusion by cgroup_mutex. Here, we take spinlock instead.
	 */
	spin_lock(&mc.lock);
	src = mc.from;
	dst = mc.to;
	/* No source group recorded: no move is in flight. */
	if (src)
		moving = mem_cgroup_is_descendant(src, memcg) ||
			 mem_cgroup_is_descendant(dst, memcg);
	spin_unlock(&mc.lock);

	return moving;
}
/*
 * If another task is moving charges that involve @memcg, sleep on mc.waitq
 * and return true; return false without waiting otherwise.
 */
static bool mem_cgroup_wait_acct_move(struct mem_cgroup *memcg)
{
	DEFINE_WAIT(wait);

	/* No move in progress, or we are the mover ourselves. */
	if (!mc.moving_task || current == mc.moving_task)
		return false;

	if (!mem_cgroup_under_move(memcg))
		return false;

	prepare_to_wait(&mc.waitq, &wait, TASK_INTERRUPTIBLE);
	/* moving charge context might have finished. */
	if (mc.moving_task)
		schedule();
	finish_wait(&mc.waitq, &wait);

	return true;
}
#ifdef CONFIG_MEM_SPEED_THROTTLE
static void mem_cgroup_mst_msc_reset(struct mem_cgroup *memcg)
{
struct mem_spd_ctl *msc;
if (mem_cgroup_is_root(memcg))
return;
msc = &memcg->msc;
/* Make sure reset succeed */
while (msc->prev_thl_jifs ||
msc->prev_chg ||
msc->slice_lmt ||
msc->mem_spd_lmt ||
memcg->has_msc_lmt ||
atomic_long_read(&msc->nr_throttled)) {
atomic_long_set(&msc->nr_throttled, 0);
memcg->has_msc_lmt = 0;
msc->mem_spd_lmt = 0;
msc->slice_lmt = 0;
msc->prev_chg = 0;
msc->prev_thl_jifs = 0;
}
}
/*
 * Set @memcg->has_msc_lmt when @memcg or any of its non-root ancestors has
 * a memory speed limit (msc.mem_spd_lmt != 0) configured.
 */
static void mem_cgroup_mst_has_lmt_init(struct mem_cgroup *memcg)
{
	struct mem_cgroup *iter = memcg;

	rcu_read_lock();
	while (!mem_cgroup_is_root(iter)) {
		if (iter->msc.mem_spd_lmt != 0) {
			memcg->has_msc_lmt = 1;
			/*
			 * Leave through the common exit: returning from here
			 * would skip rcu_read_unlock() and leave the RCU
			 * read-side section open.
			 */
			break;
		}
		iter = mem_cgroup_from_css(iter->css.parent);
	}
	rcu_read_unlock();
}
static u64 mem_cgroup_mem_spd_lmt_read(struct cgroup_subsys_state *css,
struct cftype *cft)
{
struct mem_cgroup *memcg = mem_cgroup_from_css(css);
return (u64)(memcg->msc.mem_spd_lmt << PAGE_SHIFT);
}
/*
 * cgroup file write handler: set the memory speed limit of @css's memcg.
 * @val is in bytes; it is stored in pages together with the derived
 * per-slice limit. A non-zero limit flags the whole subtree as limited.
 */
static int mem_cgroup_mem_spd_lmt_write(struct cgroup_subsys_state *css,
					struct cftype *cft, u64 val)
{
	struct mem_cgroup *memcg = mem_cgroup_from_css(css);
	struct mem_cgroup *pos;
	unsigned long pages = val >> PAGE_SHIFT;

	memcg->msc.mem_spd_lmt = pages;
	memcg->msc.slice_lmt = pages * MST_SLICE / HZ;

	/* Sync with mst_has_lmt_init and mem_cgroup_mst_overlmt_tree */
	synchronize_rcu();

	if (!pages)
		return 0;

	for_each_mem_cgroup_tree(pos, memcg)
		pos->has_msc_lmt = 1;

	return 0;
}
/*
* Update the memory speed throttle slice window.
* Return 0 when we are still in the previous slice window.
* Return 1 when we start a new slice window.
*/
static int mem_cgroup_update_mst_slice(struct mem_cgroup *memcg)
{
unsigned long total_charge;
struct mem_spd_ctl *msc = &memcg->msc;
if (msc->prev_thl_jifs &&
time_before(jiffies, (msc->prev_thl_jifs + MST_SLICE)))
return 0;
/* Bail out if other's updating */
if (atomic_cmpxchg(&msc->updating, 0, 1) != 0)
return 0;
total_charge = atomic_long_read(&memcg->memory.total_chg);
msc->prev_chg = total_charge;
msc->prev_thl_jifs = jiffies;
atomic_set(&msc->updating, 0);
return 1;
}
/*
* Check whether the memory allocation speed exceeds the limit.
* Return 0 when it is within the limit.
* Return the time frame that should actually have been used for the
* allocation when it exceeds the limit.
*/
static int mem_cgroup_mst_overspd(struct mem_cgroup *memcg)
{
	struct mem_spd_ctl *msc = &memcg->msc;
	unsigned long total_charge;
	unsigned long slice_lmt;
	unsigned long usage;

	/* A fresh slice window has just been started: nothing charged yet. */
	if (mem_cgroup_update_mst_slice(memcg))
		return 0;

	/*
	 * Sample the per-slice limit once. It can be zero when the configured
	 * speed limit rounds down to less than one page per slice, or when
	 * the limit is being reset concurrently; report "within limits" in
	 * that case instead of dividing by zero below.
	 */
	slice_lmt = READ_ONCE(msc->slice_lmt);
	if (!slice_lmt)
		return 0;

	/* Pages charged since the start of the current slice window. */
	total_charge = atomic_long_read(&memcg->memory.total_chg);
	usage = total_charge <= msc->prev_chg ?
		0 : total_charge - msc->prev_chg;

	if (usage < slice_lmt)
		return 0;

	/* Scale the slice length by how far the usage is over the limit. */
	return (usage * MST_SLICE) / slice_lmt;
}
static int mem_cgroup_mst_overspd_tree(struct mem_cgroup *memcg)
{
struct cgroup_subsys_state *css = &memcg->css;
struct mem_cgroup *iter = memcg;
int no_lmt = 1;
int ret = 0;
rcu_read_lock();
while (!mem_cgroup_is_root(iter)) {
if (iter->msc.mem_spd_lmt) {
no_lmt = 0;
ret = mem_cgroup_mst_overspd(iter);
if (ret) {
rcu_read_unlock();
return ret;
}
}
css = css->parent;
iter = mem_cgroup_from_css(css);
}
/* Mst has been disabled */
if (no_lmt)
mem_cgroup_mst_msc_reset(memcg);
rcu_read_unlock();
return ret;
}
/*
 * Compare the free pages of ZONE_NORMAL on each online node with that
 * zone's watermarks and report the first node found at or below them.
 */
static enum mst_wmark_stat mem_cgroup_mst_wmark_ok(struct mem_cgroup *memcg)
{
	int nid;

	for_each_online_node(nid) {
		struct zone *zone = &NODE_DATA(nid)->node_zones[ZONE_NORMAL];
		unsigned long nr_free = zone_page_state(zone, NR_FREE_PAGES);
		unsigned long wm_low = low_wmark_pages(zone);
		unsigned long wm_min = min_wmark_pages(zone);
		/* min watermark plus a quarter of the min..low distance */
		unsigned long floor = wm_min + ((wm_low - wm_min) >> 2);

		if (nr_free <= floor)
			return WMARK_REACH_MIN;
		if (nr_free <= wm_low)
			return WMARK_REACH_LOW;
	}

	return WMARK_OK;
}
/*
* Throttle the current task when @memcg is subject to a memory speed limit
* and free memory is at or below the watermarks checked by
* mem_cgroup_mst_wmark_ok().
*/
static void mem_cgroup_mst_spd_throttle(struct mem_cgroup *memcg)
{
struct mem_spd_ctl *msc = &memcg->msc;
long timeout;
int ret = 0;
enum mst_wmark_stat stat;
/* Throttling may sleep, so skip contexts where that is not possible. */
if (!memcg->has_msc_lmt || in_interrupt() || in_atomic() ||
irqs_disabled() || oops_in_progress)
return;
stat = mem_cgroup_mst_wmark_ok(memcg);
if (stat == WMARK_OK)
return;
ret = mem_cgroup_mst_overspd_tree(memcg);
/* Only at the low watermark and within the speed limit: no throttling. */
if (stat == WMARK_REACH_LOW && !ret)
return;
atomic_long_inc(&msc->nr_throttled);
/*
* Throttle the allocation for amount of jiffies according to
* the fraction between the actual memory usage and allowed
* allocation speed
*/
/* Subtract the jiffies already elapsed since the slice window began. */
timeout = ret - (jiffies - msc->prev_thl_jifs);
if (timeout > 0)
schedule_timeout_interruptible(timeout);
}
/*
 * Return the recorded mem_spd_max of @memcg converted from pages to bytes.
 * Reading clears the recorded value; a value whose timestamp is more than
 * five seconds old is discarded first.
 */
static unsigned long mem_cgroup_mst_get_mem_spd_max(struct mem_cgroup *memcg)
{
	struct page_counter *counter = &memcg->memory;
	unsigned long expiry;
	long peak;

	/* Reset speed if it is too old */
	expiry = atomic_long_read(&counter->prev_spd_jifs) + 5 * HZ;
	if (time_after(jiffies, expiry))
		atomic_long_set(&counter->mem_spd_max, 0);

	/* Clear after read */
	peak = atomic_long_xchg(&counter->mem_spd_max, 0);

	return peak << PAGE_SHIFT;
}
/* Print the mst_mem_spd_max line of @memcg to @m; nothing for the root. */
static void mem_cgroup_mst_show_mem_spd_max(struct mem_cgroup *memcg,
					    struct seq_file *m)
{
	if (!mem_cgroup_is_root(memcg))
		seq_printf(m, "mst_mem_spd_max %lu\n",
			   mem_cgroup_mst_get_mem_spd_max(memcg));
}
/* Print the mst_nr_throttled counter of @memcg to @m. */
static void mem_cgroup_mst_show_nr_throttled(struct mem_cgroup *memcg,
					     struct seq_file *m)
{
	long throttled = atomic_long_read(&memcg->msc.nr_throttled);

	seq_printf(m, "mst_nr_throttled %lu\n", throttled);
}
#else /* CONFIG_MEM_SPEED_THROTTLE */
/* No-op stub used when CONFIG_MEM_SPEED_THROTTLE is not enabled. */
static void mem_cgroup_mst_has_lmt_init(struct mem_cgroup *memcg)
{
}
/* No-op stub used when CONFIG_MEM_SPEED_THROTTLE is not enabled. */
static void mem_cgroup_mst_spd_throttle(struct mem_cgroup *memcg)
{
}
/* No-op stub used when CONFIG_MEM_SPEED_THROTTLE is not enabled. */
static void mem_cgroup_mst_msc_reset(struct mem_cgroup *memcg)
{
}
/* No-op stub used when CONFIG_MEM_SPEED_THROTTLE is not enabled. */
static void mem_cgroup_mst_show_nr_throttled(struct mem_cgroup *memcg,
struct seq_file *m)
{
}
/* No-op stub used when CONFIG_MEM_SPEED_THROTTLE is not enabled. */
static void mem_cgroup_mst_show_mem_spd_max(struct mem_cgroup *memcg,
struct seq_file *m)
{
}
#endif
/*
 * memory_stat_format - render the statistics of @memcg as text
 * @memcg: memory cgroup to report on
 *
 * Returns a kmalloc()ed, PAGE_SIZE buffer holding one "name value" pair per
 * line, or NULL if the buffer could not be allocated. The caller must
 * kfree() the result.
 */
static char *memory_stat_format(struct mem_cgroup *memcg)
{
	struct seq_buf s;
	int i;

	seq_buf_init(&s, kmalloc(PAGE_SIZE, GFP_KERNEL), PAGE_SIZE);
	if (!s.buffer)
		return NULL;

	/*
	 * Provide statistics on the state of the memory subsystem as
	 * well as cumulative event counters that show past behavior.
	 *
	 * This list is ordered following a combination of these gradients:
	 * 1) generic big picture -> specifics and details
	 * 2) reflecting userspace activity -> reflecting kernel heuristics
	 *
	 * Current memory state:
	 */

	seq_buf_printf(&s, "anon %llu\n",
		       (u64)memcg_page_state(memcg, NR_ANON_MAPPED) *
		       PAGE_SIZE);
	seq_buf_printf(&s, "file %llu\n",
		       (u64)memcg_page_state(memcg, NR_FILE_PAGES) *
		       PAGE_SIZE);
	seq_buf_printf(&s, "kernel_stack %llu\n",
		       (u64)memcg_page_state(memcg, MEMCG_KERNEL_STACK_KB) *
		       1024);
	seq_buf_printf(&s, "slab %llu\n",
		       (u64)(memcg_page_state(memcg, NR_SLAB_RECLAIMABLE_B) +
			     memcg_page_state(memcg, NR_SLAB_UNRECLAIMABLE_B)));
	seq_buf_printf(&s, "sock %llu\n",
		       (u64)memcg_page_state(memcg, MEMCG_SOCK) *
		       PAGE_SIZE);

	seq_buf_printf(&s, "shmem %llu\n",
		       (u64)memcg_page_state(memcg, NR_SHMEM) *
		       PAGE_SIZE);
#if defined(CONFIG_NEED_MEMCG_ZRAM)
	seq_buf_printf(&s, "zram %llu\n",
		       (u64)memcg_page_state(memcg, MEMCG_ZRAM_B));
	seq_buf_printf(&s, "zrammed %llu\n",
		       (u64)memcg_page_state(memcg, MEMCG_ZRAMED) * PAGE_SIZE);
#endif
	seq_buf_printf(&s, "file_mapped %llu\n",
		       (u64)memcg_page_state(memcg, NR_FILE_MAPPED) *
		       PAGE_SIZE);
	seq_buf_printf(&s, "file_dirty %llu\n",
		       (u64)memcg_page_state(memcg, NR_FILE_DIRTY) *
		       PAGE_SIZE);
	seq_buf_printf(&s, "file_writeback %llu\n",
		       (u64)memcg_page_state(memcg, NR_WRITEBACK) *
		       PAGE_SIZE);

#ifdef CONFIG_TRANSPARENT_HUGEPAGE
	seq_buf_printf(&s, "anon_thp %llu\n",
		       (u64)memcg_page_state(memcg, NR_ANON_THPS) *
		       HPAGE_PMD_SIZE);
#endif

	for (i = 0; i < NR_LRU_LISTS; i++)
		seq_buf_printf(&s, "%s %llu\n", lru_list_name(i),
			       (u64)memcg_page_state(memcg, NR_LRU_BASE + i) *
			       PAGE_SIZE);

	seq_buf_printf(&s, "slab_reclaimable %llu\n",
		       (u64)memcg_page_state(memcg, NR_SLAB_RECLAIMABLE_B));
	seq_buf_printf(&s, "slab_unreclaimable %llu\n",
		       (u64)memcg_page_state(memcg, NR_SLAB_UNRECLAIMABLE_B));

	/* Accumulated memory events */

	seq_buf_printf(&s, "%s %lu\n", vm_event_name(PGFAULT),
		       memcg_events(memcg, PGFAULT));
	seq_buf_printf(&s, "%s %lu\n", vm_event_name(PGMAJFAULT),
		       memcg_events(memcg, PGMAJFAULT));

	seq_buf_printf(&s, "workingset_refault_anon %lu\n",
		       memcg_page_state(memcg, WORKINGSET_REFAULT_ANON));
	seq_buf_printf(&s, "workingset_refault_file %lu\n",
		       memcg_page_state(memcg, WORKINGSET_REFAULT_FILE));
	seq_buf_printf(&s, "workingset_activate_anon %lu\n",
		       memcg_page_state(memcg, WORKINGSET_ACTIVATE_ANON));
	seq_buf_printf(&s, "workingset_activate_file %lu\n",
		       memcg_page_state(memcg, WORKINGSET_ACTIVATE_FILE));
	/*
	 * The anon and file restore counters get distinct keys, like the
	 * refault/activate pairs above; emitting both under the same
	 * "workingset_restore" name made the output ambiguous to parse.
	 */
	seq_buf_printf(&s, "workingset_restore_anon %lu\n",
		       memcg_page_state(memcg, WORKINGSET_RESTORE_ANON));
	seq_buf_printf(&s, "workingset_restore_file %lu\n",
		       memcg_page_state(memcg, WORKINGSET_RESTORE_FILE));
	seq_buf_printf(&s, "workingset_nodereclaim %lu\n",
		       memcg_page_state(memcg, WORKINGSET_NODERECLAIM));

	seq_buf_printf(&s, "%s %lu\n", vm_event_name(PGREFILL),
		       memcg_events(memcg, PGREFILL));
	seq_buf_printf(&s, "pgscan %lu\n",
		       memcg_events(memcg, PGSCAN_KSWAPD) +
		       memcg_events(memcg, PGSCAN_DIRECT));
	seq_buf_printf(&s, "pgsteal %lu\n",
		       memcg_events(memcg, PGSTEAL_KSWAPD) +
		       memcg_events(memcg, PGSTEAL_DIRECT));
#ifdef CONFIG_MEM_QOS
	seq_buf_printf(&s, "pgscan_in_background %lu\n",
		       memcg_events(memcg, PGSCAN_KSWAPD));
	seq_buf_printf(&s, "pgsteal_in_background %lu\n",
		       memcg_events(memcg, PGSTEAL_KSWAPD));
#endif
	seq_buf_printf(&s, "%s %lu\n", vm_event_name(PGACTIVATE),
		       memcg_events(memcg, PGACTIVATE));
	seq_buf_printf(&s, "%s %lu\n", vm_event_name(PGDEACTIVATE),
		       memcg_events(memcg, PGDEACTIVATE));
	seq_buf_printf(&s, "%s %lu\n", vm_event_name(PGLAZYFREE),
		       memcg_events(memcg, PGLAZYFREE));
	seq_buf_printf(&s, "%s %lu\n", vm_event_name(PGLAZYFREED),
		       memcg_events(memcg, PGLAZYFREED));

#ifdef CONFIG_TRANSPARENT_HUGEPAGE
	seq_buf_printf(&s, "%s %lu\n", vm_event_name(THP_FAULT_ALLOC),
		       memcg_events(memcg, THP_FAULT_ALLOC));
	seq_buf_printf(&s, "%s %lu\n", vm_event_name(THP_COLLAPSE_ALLOC),
		       memcg_events(memcg, THP_COLLAPSE_ALLOC));
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */

#ifdef CONFIG_MEM_SPEED_THROTTLE
	/* Note: reading mst_mem_spd_max also clears the recorded value. */
	seq_buf_printf(&s, "mst_mem_spd_max %lu\n",
		       mem_cgroup_mst_get_mem_spd_max(memcg));
	seq_buf_printf(&s, "mst_nr_throttled %lu\n",
		       atomic_long_read(&memcg->msc.nr_throttled));
#endif

	/* The above should easily fit into one page */
	WARN_ON_ONCE(seq_buf_has_overflowed(&s));

	return s.buffer;
}
/* Convert a page count into kilobytes (1 kB = 2^10 bytes). */
#define K(x) ((x) << (PAGE_SHIFT-10))
/**
* mem_cgroup_print_oom_context: Print OOM information relevant to
* memory controller.
* @memcg: The memory cgroup that went over limit
* @p: Task that is going to be killed
*
* NOTE: @memcg and @p's mem_cgroup can be different when hierarchy is
* enabled
*/
void mem_cgroup_print_oom_context(struct mem_cgroup *memcg, struct task_struct *p)
{
	rcu_read_lock();

	/* A NULL @memcg is reported as a global OOM. */
	if (!memcg) {
		pr_cont(",global_oom");
	} else {
		pr_cont(",oom_memcg=");
		pr_cont_cgroup_path(memcg->css.cgroup);
	}

	/* The task's own memory cgroup is appended only when @p is given. */
	if (p) {
		pr_cont(",task_memcg=");
		pr_cont_cgroup_path(task_cgroup(p, memory_cgrp_id));
	}

	rcu_read_unlock();
}
/**
* mem_cgroup_print_oom_meminfo: Print OOM memory information relevant to
* memory controller.
* @memcg: The memory cgroup that went over limit
*/
void mem_cgroup_print_oom_meminfo(struct mem_cgroup *memcg)
{
	char *stats;

	pr_info("memory: usage %llukB, limit %llukB, failcnt %lu\n",
		K((u64)page_counter_read(&memcg->memory)),
		K((u64)READ_ONCE(memcg->memory.max)), memcg->memory.failcnt);

	if (!cgroup_subsys_on_dfl(memory_cgrp_subsys)) {
		/* Not on the default hierarchy: memory+swap and kmem counters. */
		pr_info("memory+swap: usage %llukB, limit %llukB, failcnt %lu\n",
			K((u64)page_counter_read(&memcg->memsw)),
			K((u64)memcg->memsw.max), memcg->memsw.failcnt);
		pr_info("kmem: usage %llukB, limit %llukB, failcnt %lu\n",
			K((u64)page_counter_read(&memcg->kmem)),
			K((u64)memcg->kmem.max), memcg->kmem.failcnt);
	} else {
		/* Default hierarchy: a separate swap counter. */
		pr_info("swap: usage %llukB, limit %llukB, failcnt %lu\n",
			K((u64)page_counter_read(&memcg->swap)),
			K((u64)READ_ONCE(memcg->swap.max)), memcg->swap.failcnt);
	}

	pr_info("Memory cgroup stats for ");
	pr_cont_cgroup_path(memcg->css.cgroup);
	pr_cont(":");

	/* The detailed statistics are skipped if the buffer allocation fails. */
	stats = memory_stat_format(memcg);
	if (stats) {
		pr_info("%s", stats);
		kfree(stats);
	}
}
/*
* Return the memory (and swap, if configured) limit for a memcg.
*/
unsigned long mem_cgroup_get_max(struct mem_cgroup *memcg)
{
	unsigned long limit = READ_ONCE(memcg->memory.max);
	unsigned long swap_room;

	if (cgroup_subsys_on_dfl(memory_cgrp_subsys)) {
		/* Swap only counts when the group is allowed to swap. */
		if (!mem_cgroup_swappiness(memcg))
			return limit;
		swap_room = READ_ONCE(memcg->swap.max);
	} else { /* v1 */
		if (!mem_cgroup_swappiness(memcg))
			return limit;
		/* Calculate swap excess capacity from memsw limit */
		swap_room = READ_ONCE(memcg->memsw.max) - limit;
	}

	/* Never count more swap than the system actually has. */
	return limit + min(swap_room, (unsigned long)total_swap_pages);
}
/* Return the current value of @memcg's memory page counter. */
unsigned long mem_cgroup_size(struct mem_cgroup *memcg)
{
	unsigned long pages = page_counter_read(&memcg->memory);

	return pages;
}
/*
 * Run the OOM killer for @memcg under oom_lock. Returns true if waiting for
 * the lock was interrupted by a fatal signal, if should_force_charge() says
 * so, or if out_of_memory() returns true.
 */
static bool mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask,
				     int order)
{
	struct oom_control oc = {
		.memcg = memcg,
		.gfp_mask = gfp_mask,
		.order = order,
		.zonelist = NULL,
		.nodemask = NULL,
	};
	bool handled = true;

	if (!mutex_lock_killable(&oom_lock)) {
		/*
		 * A few threads which were not waiting at
		 * mutex_lock_killable() can fail to bail out. Therefore,
		 * check again after holding oom_lock.
		 */
		handled = should_force_charge() || out_of_memory(&oc);
		mutex_unlock(&oom_lock);
	}

	return handled;
}
#if MAX_NUMNODES > 1
/**
* test_mem_cgroup_node_reclaimable
* @memcg: the target memcg
* @nid: the node ID to be checked.
* @noswap : specify true here if the user wants file only information.
*
* This function returns whether the specified memcg contains any
* reclaimable pages on a node. Returns true if there are any reclaimable
* pages in the node.
*/
static bool test_mem_cgroup_node_reclaimable(struct mem_cgroup *memcg,
					     int nid, bool noswap)
{
	struct lruvec *lruvec = mem_cgroup_lruvec(memcg, NODE_DATA(nid));

	/* Any file page on the node's LRU lists counts as reclaimable. */
	if (lruvec_page_state(lruvec, NR_INACTIVE_FILE) ||
	    lruvec_page_state(lruvec, NR_ACTIVE_FILE))
		return true;

	/* Anon pages only count when swap exists and the caller allows it. */
	if (noswap || !total_swap_pages)
		return false;

	return lruvec_page_state(lruvec, NR_INACTIVE_ANON) ||
	       lruvec_page_state(lruvec, NR_ACTIVE_ANON);
}
/*
* Always updating the nodemask is not very good - even if we have an empty
* list or the wrong list here, we can start from some node and traverse all
* nodes based on the zonelist. So update the list loosely once per 10 secs.
*
*/
static void mem_cgroup_may_update_nodemask(struct mem_cgroup *memcg)
{
	int node;

	/*
	 * numainfo_events > 0 means there was at least NUMAINFO_EVENTS_TARGET
	 * pagein/pageout changes since the last update.
	 */
	if (!atomic_read(&memcg->numainfo_events))
		return;

	/* Someone else is already rebuilding the mask. */
	if (atomic_inc_return(&memcg->numainfo_updating) > 1)
		return;

	/* make a nodemask where this memcg uses memory from */
	memcg->scan_nodes = node_states[N_MEMORY];

	for_each_node_mask(node, node_states[N_MEMORY]) {
		if (test_mem_cgroup_node_reclaimable(memcg, node, false))
			continue;
		node_clear(node, memcg->scan_nodes);
	}

	atomic_set(&memcg->numainfo_events, 0);
	atomic_set(&memcg->numainfo_updating, 0);
}
/*
* Selecting a node where we start reclaim from. Because what we need is just
* reducing the usage counter, starting from anywhere is OK. Considering
* memory reclaim from current node, there are pros. and cons.
*
* Freeing memory from current node means freeing memory from a node which
* we'll use or we've used. So, it may make LRU bad. And if several threads
* hit limits, it will see a contention on a node. But freeing from remote
* node means more costs for memory reclaim because of memory latency.
*
* Now, we use round-robin. Better algorithm is welcomed.
*/
int mem_cgroup_select_victim_node(struct mem_cgroup *memcg)
{
	int node;

	mem_cgroup_may_update_nodemask(memcg);

	/* Round-robin: continue after the node used last time. */
	node = next_node_in(memcg->last_scanned_node, memcg->scan_nodes);

	/*
	 * mem_cgroup_may_update_nodemask might have seen no reclaimable pages
	 * last time it really checked all the LRUs due to rate limiting.
	 * Fallback to the current node in that case for simplicity.
	 */
	if (unlikely(node == MAX_NUMNODES))
		node = numa_node_id();

	memcg->last_scanned_node = node;

	return node;
}
#else
/* Variant built when MAX_NUMNODES is 1: node 0 is always returned. */
int mem_cgroup_select_victim_node(struct mem_cgroup *memcg)
{
return 0;
}
#endif
/*
* mem_cgroup_soft_reclaim - reclaim from a hierarchy over its soft limit
* @root_memcg: root of the hierarchy to reclaim from
* @pgdat: node to reclaim on
* @gfp_mask: allocation mask passed on to mem_cgroup_shrink_node()
* @total_scanned: incremented by the number of pages scanned
*
* Walks the hierarchy with the shared reclaim iterator and shrinks each
* visited group until @root_memcg has no soft limit excess left or one of
* the loop limits below is hit. Returns the sum of the values returned by
* mem_cgroup_shrink_node().
*/
static int mem_cgroup_soft_reclaim(struct mem_cgroup *root_memcg,
pg_data_t *pgdat,
gfp_t gfp_mask,
unsigned long *total_scanned)
{
struct mem_cgroup *victim = NULL;
int total = 0;
int loop = 0;
unsigned long excess;
unsigned long nr_scanned;
struct mem_cgroup_reclaim_cookie reclaim = {
.pgdat = pgdat,
.priority = 0,
};
excess = soft_limit_excess(root_memcg);
while (1) {
victim = mem_cgroup_iter(root_memcg, victim, &reclaim);
/* A NULL victim marks the end of one pass over the hierarchy. */
if (!victim) {
loop++;
if (loop >= 2) {
/*
* If we have not been able to reclaim
* anything, it might because there are
* no reclaimable pages under this hierarchy
*/
if (!total)
break;
/*
* We want to do more targeted reclaim.
* excess >> 2 is not to excessive so as to
* reclaim too much, nor too less that we keep
* coming back to reclaim from this cgroup
*/
if (total >= (excess >> 2) ||
(loop > MEM_CGROUP_MAX_RECLAIM_LOOPS))
break;
}
continue;
}
total += mem_cgroup_shrink_node(victim, gfp_mask, false,
pgdat, &nr_scanned);
*total_scanned += nr_scanned;
/* Stop as soon as the hierarchy is back within its soft limit. */
if (!soft_limit_excess(root_memcg))
break;
}
/* Release the reference held on the last visited group, if any. */
mem_cgroup_iter_break(root_memcg, victim);
return total;
}
#ifdef CONFIG_LOCKDEP
/*
* Lockdep map passed to mutex_acquire()/mutex_release() by
* mem_cgroup_oom_trylock() and mem_cgroup_oom_unlock().
*/
static struct lockdep_map memcg_oom_lock_dep_map = {
.name = "memcg_oom_lock",
};
#endif
/* Held while the per-memcg ->oom_lock and ->under_oom fields are updated. */
static DEFINE_SPINLOCK(memcg_oom_lock);
/*
 * Try to mark every memcg in the subtree of @memcg as OOM-locked.
 *
 * Returns true when the whole subtree was marked.  Returns false when
 * some member was already marked (an OOM is being handled there); in
 * that case every mark set by this call is rolled back.
 */
static bool mem_cgroup_oom_trylock(struct mem_cgroup *memcg)
{
	struct mem_cgroup *pos, *busy = NULL;

	spin_lock(&memcg_oom_lock);

	for_each_mem_cgroup_tree(pos, memcg) {
		if (!pos->oom_lock) {
			pos->oom_lock = true;
			continue;
		}
		/*
		 * This part of the hierarchy is already locked, so the
		 * lock cannot be granted.
		 */
		busy = pos;
		mem_cgroup_iter_break(memcg, pos);
		break;
	}

	if (!busy) {
		mutex_acquire(&memcg_oom_lock_dep_map, 0, 1, _RET_IP_);
	} else {
		/*
		 * Undo the marks set above: walk the tree again and stop
		 * at the member that made us fail.
		 */
		for_each_mem_cgroup_tree(pos, memcg) {
			if (pos == busy) {
				mem_cgroup_iter_break(memcg, pos);
				break;
			}
			pos->oom_lock = false;
		}
	}

	spin_unlock(&memcg_oom_lock);

	return !busy;
}
static void mem_cgroup_oom_unlock(struct mem_cgroup *memcg)
{
struct mem_cgroup *iter;
spin_lock(&memcg_oom_lock);
mutex_release(&memcg_oom_lock_dep_map, 1, _RET_IP_);
for_each_mem_cgroup_tree(iter, memcg)
iter->oom_lock = false;
spin_unlock(&memcg_oom_lock);
}
static void mem_cgroup_mark_under_oom(struct mem_cgroup *memcg)
{
struct mem_cgroup *iter;
spin_lock(&memcg_oom_lock);
for_each_mem_cgroup_tree(iter, memcg)
iter->under_oom++;
spin_unlock(&memcg_oom_lock);
}
static void mem_cgroup_unmark_under_oom(struct mem_cgroup *memcg)
{
struct mem_cgroup *iter;
/*
* When a new child is created while the hierarchy is under oom,
* mem_cgroup_oom_lock() may not be called. Watch for underflow.
*/
spin_lock(&memcg_oom_lock);
for_each_mem_cgroup_tree(iter, memcg)
if (iter->under_oom > 0)
iter->under_oom--;
spin_unlock(&memcg_oom_lock);
}
/* Wait queue that tasks sleeping on a memcg OOM are queued on. */
static DECLARE_WAIT_QUEUE_HEAD(memcg_oom_waitq);
/*
* Per-sleeper wait record: @wait is the queue entry, @memcg is the cgroup
* the sleeper is waiting on (compared against the waker's memcg in
* memcg_oom_wake_function()).
*/
struct oom_wait_info {
struct mem_cgroup *memcg;
wait_queue_entry_t wait;
};
/*
 * Wake callback for entries on memcg_oom_waitq.
 *
 * @arg is the memcg the wakeup is issued for.  The sleeper is only woken
 * when its memcg and the waking memcg are related by descent in either
 * direction; otherwise 0 is returned and the entry stays queued.
 */
static int memcg_oom_wake_function(wait_queue_entry_t *wait,
				   unsigned mode, int sync, void *arg)
{
	struct mem_cgroup *waker = (struct mem_cgroup *)arg;
	struct oom_wait_info *info = container_of(wait, struct oom_wait_info, wait);
	struct mem_cgroup *sleeper = info->memcg;

	if (mem_cgroup_is_descendant(waker, sleeper) ||
	    mem_cgroup_is_descendant(sleeper, waker))
		return autoremove_wake_function(wait, mode, sync, arg);

	return 0;
}
/* Wake sleepers on memcg_oom_waitq if @memcg is non-NULL and under OOM. */
static void memcg_oom_recover(struct mem_cgroup *memcg)
{
	/*
	 * The ->under_oom test is lockless.  The only guarantee it needs is
	 * that it observes the state set by an OOM when this function runs
	 * as a consequence of userland reacting to the OOM notification.
	 * That holds because mem_cgroup_mark_under_oom() is invoked before
	 * the notification is sent.
	 */
	if (!memcg || !memcg->under_oom)
		return;

	__wake_up(&memcg_oom_waitq, TASK_NORMAL, 0, memcg);
}
/* Result of mem_cgroup_oom(), consumed by try_charge(). */
enum oom_status {
/* mem_cgroup_out_of_memory() reported success */
OOM_SUCCESS,
/* mem_cgroup_out_of_memory() reported failure */
OOM_FAILED,
/* OOM handling was recorded in current and deferred */
OOM_ASYNC,
/* no OOM handling was attempted */
OOM_SKIPPED
};
/*
 * Handle an OOM condition hit while charging @memcg.
 *
 * Returns OOM_SKIPPED for costly orders or when the OOM killer is
 * disabled outside of a user fault, OOM_ASYNC when handling was deferred
 * to mem_cgroup_oom_synchronize(), and OOM_SUCCESS / OOM_FAILED according
 * to the result of mem_cgroup_out_of_memory().
 */
static enum oom_status mem_cgroup_oom(struct mem_cgroup *memcg, gfp_t mask, int order)
{
	bool killed;
	bool locked;

	if (order > PAGE_ALLOC_COSTLY_ORDER)
		return OOM_SKIPPED;

	memcg_memory_event(memcg, MEMCG_OOM);

	/*
	 * We are in the middle of the charge context and do not want to
	 * block while possibly sitting on a callstack that holds all kinds
	 * of filesystem and mm locks.
	 *
	 * cgroup1 allows disabling the OOM killer and waiting for outside
	 * handling until the charge can succeed: remember the context here
	 * and put the task to sleep at the end of the page fault, once all
	 * locks are released.
	 */
	if (memcg->oom_kill_disable) {
		if (current->in_user_fault) {
			css_get(&memcg->css);
			current->memcg_in_oom = memcg;
			current->memcg_oom_gfp_mask = mask;
			current->memcg_oom_order = order;
			return OOM_ASYNC;
		}
		return OOM_SKIPPED;
	}

	/*
	 * The in-kernel OOM killer allows asynchronous reclaim of the
	 * victim's memory (oom_reaper), so forward progress does not depend
	 * solely on the victim and the killer can be invoked right here.
	 *
	 * mem_cgroup_out_of_memory() may fail to find a victim, in which
	 * case the caller has to bail out of the charge path.
	 */
	mem_cgroup_mark_under_oom(memcg);

	locked = mem_cgroup_oom_trylock(memcg);
	if (locked)
		mem_cgroup_oom_notify(memcg);

	mem_cgroup_unmark_under_oom(memcg);

	killed = mem_cgroup_out_of_memory(memcg, mask, order);

	if (locked)
		mem_cgroup_oom_unlock(memcg);

	return killed ? OOM_SUCCESS : OOM_FAILED;
}
/**
* mem_cgroup_oom_synchronize - complete memcg OOM handling
* @handle: actually kill/wait or just clean up the OOM state
*
* This has to be called at the end of a page fault if the memcg OOM
* handler was enabled.
*
* Memcg supports userspace OOM handling where failed allocations must
* sleep on a waitqueue until the userspace task resolves the
* situation. Sleeping directly in the charge context with all kinds
* of locks held is not a good idea, instead we remember an OOM state
* in the task and mem_cgroup_oom_synchronize() has to be called at
* the end of the page fault to complete the OOM handling.
*
* Returns %true if an ongoing memcg OOM situation was detected and
* completed, %false otherwise.
*/
bool mem_cgroup_oom_synchronize(bool handle)
{
/* memcg recorded by mem_cgroup_oom() when it returned OOM_ASYNC */
struct mem_cgroup *memcg = current->memcg_in_oom;
struct oom_wait_info owait;
bool locked;
/* OOM is global, do not handle */
if (!memcg)
return false;
/* Caller only wants the recorded OOM state dropped. */
if (!handle)
goto cleanup;
/* Queue ourselves before marking/locking so that no wakeup is missed. */
owait.memcg = memcg;
owait.wait.flags = 0;
owait.wait.func = memcg_oom_wake_function;
owait.wait.private = current;
INIT_LIST_HEAD(&owait.wait.entry);
prepare_to_wait(&memcg_oom_waitq, &owait.wait, TASK_KILLABLE);
mem_cgroup_mark_under_oom(memcg);
locked = mem_cgroup_oom_trylock(memcg);
if (locked)
mem_cgroup_oom_notify(memcg);
if (locked && !memcg->oom_kill_disable) {
/* We own the OOM lock and killing is allowed: run the killer now. */
mem_cgroup_unmark_under_oom(memcg);
finish_wait(&memcg_oom_waitq, &owait.wait);
mem_cgroup_out_of_memory(memcg, current->memcg_oom_gfp_mask,
current->memcg_oom_order);
} else {
/* Otherwise sleep until woken through memcg_oom_waitq or killed. */
schedule();
mem_cgroup_unmark_under_oom(memcg);
finish_wait(&memcg_oom_waitq, &owait.wait);
}
if (locked) {
mem_cgroup_oom_unlock(memcg);
/*
* There is no guarantee that an OOM-lock contender
* sees the wakeups triggered by the OOM kill
* uncharges. Wake any sleepers explicitly.
*/
memcg_oom_recover(memcg);
}
cleanup:
/* Drop the state and the css reference taken in mem_cgroup_oom(). */
current->memcg_in_oom = NULL;
css_put(&memcg->css);
return true;
}
/**
* mem_cgroup_get_oom_group - get a memory cgroup to clean up after OOM
* @victim: task to be killed by the OOM killer
* @oom_domain: memcg in case of memcg OOM, NULL in case of system-wide OOM
*
* Returns a pointer to a memory cgroup, which has to be cleaned up
* by killing all belonging OOM-killable tasks.
*
* Caller has to call mem_cgroup_put() on the returned non-NULL memcg.
*/
struct mem_cgroup *mem_cgroup_get_oom_group(struct task_struct *victim,
					    struct mem_cgroup *oom_domain)
{
	struct mem_cgroup *group = NULL;
	struct mem_cgroup *pos;

#ifndef CONFIG_MEM_QOS
	if (!cgroup_subsys_on_dfl(memory_cgrp_subsys))
		return NULL;
#endif

	/* A system-wide OOM is treated as an OOM of the root memcg. */
	if (!oom_domain)
		oom_domain = root_mem_cgroup;

	rcu_read_lock();

	pos = mem_cgroup_from_task(victim);
	if (pos != root_mem_cgroup) {
		/*
		 * Walk from the victim's memcg up to the OOMing memcg (or
		 * the root) and keep the last, i.e. highest-level, memcg
		 * that has oom_group set.
		 */
		while (pos) {
			if (pos->oom_group)
				group = pos;
			if (pos == oom_domain)
				break;
			pos = parent_mem_cgroup(pos);
		}

		/* Pin the result for the caller. */
		if (group)
			css_get(&group->css);
	}

	rcu_read_unlock();

	return group;
}
/* Log that all tasks in @memcg's cgroup are about to be killed. */
void mem_cgroup_print_oom_group(struct mem_cgroup *memcg)
{
	struct cgroup *cgrp = memcg->css.cgroup;

	pr_info("Tasks in ");
	pr_cont_cgroup_path(cgrp);
	pr_cont(" are going to be killed due to memory.oom.group set\n");
}
/**
* lock_page_memcg - lock a page->mem_cgroup binding
* @page: the page
*
* This function protects unlocked LRU pages from being moved to
* another cgroup.
*
* It ensures lifetime of the returned memcg. Caller is responsible
* for the lifetime of the page; __unlock_page_memcg() is available
* when @page might get freed inside the locked section.
*/
struct mem_cgroup *lock_page_memcg(struct page *page)
{
struct page *head = compound_head(page); /* rmap on tail pages */
struct mem_cgroup *memcg;
unsigned long flags;
/*
* The RCU lock is held throughout the transaction. The fast
* path can get away without acquiring the memcg->move_lock
* because page moving starts with an RCU grace period.
*
* The RCU lock also protects the memcg from being freed when
* the page state that is going to change is the only thing
* preventing the page itself from being freed. E.g. writeback
* doesn't hold a page reference and relies on PG_writeback to
* keep off truncation, migration and so forth.
*/
rcu_read_lock();
/*
* Note: every return below, including the NULL ones, leaves the RCU
* read lock held; __unlock_page_memcg() drops it unconditionally.
*/
if (mem_cgroup_disabled())
return NULL;
again:
memcg = head->mem_cgroup;
if (unlikely(!memcg))
return NULL;
#ifdef CONFIG_PROVE_LOCKING
/* Let lockdep see the potential move_lock acquisition on every call. */
local_irq_save(flags);
might_lock(&memcg->move_lock);
local_irq_restore(flags);
#endif
/* Fast path: no charge moving in progress, move_lock is not needed. */
if (atomic_read(&memcg->moving_account) <= 0)
return memcg;
spin_lock_irqsave(&memcg->move_lock, flags);
/* The page changed memcg before we got the lock: start over. */
if (memcg != head->mem_cgroup) {
spin_unlock_irqrestore(&memcg->move_lock, flags);
goto again;
}
/*
* When charge migration first begins, we can have locked and
* unlocked page stat updates happening concurrently. Track
* the task who has the lock for unlock_page_memcg().
*/
memcg->move_lock_task = current;
memcg->move_lock_flags = flags;
return memcg;
}
EXPORT_SYMBOL(lock_page_memcg);
/**
* __unlock_page_memcg - unlock and unpin a memcg
* @memcg: the memcg
*
* Unlock and unpin a memcg returned by lock_page_memcg().
*/
void __unlock_page_memcg(struct mem_cgroup *memcg)
{
	/*
	 * Only the task recorded as move_lock owner took the slow path in
	 * lock_page_memcg() and has a spinlock to release.
	 */
	if (memcg && memcg->move_lock_task == current) {
		unsigned long saved_flags = memcg->move_lock_flags;

		memcg->move_lock_task = NULL;
		memcg->move_lock_flags = 0;

		spin_unlock_irqrestore(&memcg->move_lock, saved_flags);
	}

	/* The RCU read lock is dropped in every case. */
	rcu_read_unlock();
}
/**
* unlock_page_memcg - unlock a page->mem_cgroup binding
* @page: the page
*/
void unlock_page_memcg(struct page *page)
{
struct page *head = compound_head(page);
__unlock_page_memcg(head->mem_cgroup);
}
EXPORT_SYMBOL(unlock_page_memcg);
/*
* Per-context object stock embedded in struct memcg_stock_pcp (once as
* task_obj, once as irq_obj). The fields are only present with
* CONFIG_MEMCG_KMEM; their users are outside this part of the file.
*/
struct obj_stock {
#ifdef CONFIG_MEMCG_KMEM
struct obj_cgroup *cached_objcg;
struct pglist_data *cached_pgdat;
unsigned int nr_bytes;
int nr_slab_reclaimable_b;
int nr_slab_unreclaimable_b;
#else
/* zero-length placeholder when the kmem fields are compiled out */
int dummy[0];
#endif
};
/* Per-cpu cache of pages pre-charged to one memcg (see consume_stock()). */
struct memcg_stock_pcp {
struct mem_cgroup *cached; /* this never be root cgroup */
/* pre-charged pages available for @cached */
unsigned int nr_pages;
/* object stock handed out by get_obj_stock() when in_task() */
struct obj_stock task_obj;
/* object stock handed out by get_obj_stock() otherwise */
struct obj_stock irq_obj;
/* queued by drain_all_stock() to drain this stock on its own cpu */
struct work_struct work;
unsigned long flags;
/* bit in @flags: a drain has been requested and not yet completed */
#define FLUSHING_CACHED_CHARGE 0
};
/* One charge stock per cpu. */
static DEFINE_PER_CPU(struct memcg_stock_pcp, memcg_stock);
/* Taken with mutex_trylock() by drain_all_stock() so only one drain runs. */
static DEFINE_MUTEX(percpu_charge_mutex);
#ifdef CONFIG_MEMCG_KMEM
/* Defined later in the file. */
static void drain_obj_stock(struct obj_stock *stock);
static bool obj_stock_flush_required(struct memcg_stock_pcp *stock,
struct mem_cgroup *root_memcg);
#else
/* Without CONFIG_MEMCG_KMEM there is no object stock to drain. */
static inline void drain_obj_stock(struct obj_stock *stock)
{
}
/* Without CONFIG_MEMCG_KMEM the object stock never requires a flush. */
static bool obj_stock_flush_required(struct memcg_stock_pcp *stock,
struct mem_cgroup *root_memcg)
{
return false;
}
#endif
/*
* Most kmem_cache_alloc() calls are from user context. The irq disable/enable
* sequence used in this case to access content from object stock is slow.
* To optimize for user context access, there are now two object stocks for
* task context and interrupt context access respectively.
*
* The task context object stock can be accessed by disabling preemption only
* which is cheap in non-preempt kernel. The interrupt context object stock
* can only be accessed after disabling interrupt. User context code can
* access interrupt object stock, but not vice versa.
*/
static inline struct obj_stock *get_obj_stock(unsigned long *pflags)
{
	struct memcg_stock_pcp *pcp;

	if (unlikely(!in_task())) {
		/* Not in task context: IRQs off, use the irq stock. */
		local_irq_save(*pflags);
		pcp = this_cpu_ptr(&memcg_stock);
		return &pcp->irq_obj;
	}

	/* Task context: disabling preemption is enough. */
	*pflags = 0UL;
	preempt_disable();
	pcp = this_cpu_ptr(&memcg_stock);
	return &pcp->task_obj;
}
/* Undo get_obj_stock(); @flags is the value it stored through @pflags. */
static inline void put_obj_stock(unsigned long flags)
{
	if (unlikely(!in_task())) {
		local_irq_restore(flags);
		return;
	}

	preempt_enable();
}
/**
* consume_stock: Try to consume stocked charge on this cpu.
* @memcg: memcg to consume from.
* @nr_pages: how many pages to charge.
*
* The charges will only happen if @memcg matches the current cpu's memcg
* stock, and at least @nr_pages are available in that stock. Failure to
* service an allocation will refill the stock.
*
* returns true if successful, false otherwise.
*/
static bool consume_stock(struct mem_cgroup *memcg, unsigned int nr_pages)
{
	struct memcg_stock_pcp *pcp;
	unsigned long irq_flags;
	bool consumed = false;

	/* Requests larger than a batch never come out of the stock. */
	if (nr_pages > MEMCG_CHARGE_BATCH)
		return consumed;

	local_irq_save(irq_flags);

	pcp = this_cpu_ptr(&memcg_stock);
	if (pcp->cached == memcg && pcp->nr_pages >= nr_pages) {
		pcp->nr_pages -= nr_pages;
		consumed = true;
	}

	local_irq_restore(irq_flags);

	return consumed;
}
/*
 * Give the pages cached in @stock back to the page counters of the memcg
 * they were charged to, drop the reference held on that memcg and leave
 * the stock empty.
 */
static void drain_stock(struct memcg_stock_pcp *stock)
{
	struct mem_cgroup *owner = stock->cached;

	if (!owner)
		return;

	if (stock->nr_pages) {
		page_counter_uncharge(&owner->memory, stock->nr_pages);
		if (do_memsw_account())
			page_counter_uncharge(&owner->memsw, stock->nr_pages);
		stock->nr_pages = 0;
	}

	css_put(&owner->css);
	stock->cached = NULL;
}
/*
 * Drain the charge stock of the cpu we are running on.  Used as the work
 * function of memcg_stock_pcp::work and called directly by
 * drain_all_stock(); @dummy is not used.
 */
static void drain_local_stock(struct work_struct *dummy)
{
	struct memcg_stock_pcp *pcp;
	unsigned long irq_flags;

	/*
	 * The only protection against memory hotplug vs. drain_stock races
	 * is that we always operate on the local CPU stock with IRQs
	 * disabled.
	 */
	local_irq_save(irq_flags);

	pcp = this_cpu_ptr(&memcg_stock);

	drain_obj_stock(&pcp->irq_obj);
	/* The task stock is only drained when running in task context. */
	if (in_task()) {
		drain_obj_stock(&pcp->task_obj);
	}
	drain_stock(pcp);
	clear_bit(FLUSHING_CACHED_CHARGE, &pcp->flags);

	local_irq_restore(irq_flags);
}
/*
 * Park @nr_pages already-charged pages of @memcg in this cpu's stock so
 * that consume_stock() can hand them out later.
 */
static void refill_stock(struct mem_cgroup *memcg, unsigned int nr_pages)
{
	struct memcg_stock_pcp *pcp;
	unsigned long irq_flags;

	local_irq_save(irq_flags);

	pcp = this_cpu_ptr(&memcg_stock);

	/* The stock caches one memcg at a time: switch owner if needed. */
	if (pcp->cached != memcg) {
		drain_stock(pcp);
		css_get(&memcg->css);
		pcp->cached = memcg;
	}

	pcp->nr_pages += nr_pages;

	/* Do not hoard more than one batch. */
	if (pcp->nr_pages > MEMCG_CHARGE_BATCH)
		drain_stock(pcp);

	local_irq_restore(irq_flags);
}
/*
 * Drain the per-cpu charge stocks that cache charges for @root_memcg or
 * for a memcg in the hierarchy below it.
 */
void drain_all_stock(struct mem_cgroup *root_memcg)
{
	int cpu, this_cpu;

	/* Somebody else is already draining: do not pile up more workers. */
	if (!mutex_trylock(&percpu_charge_mutex))
		return;

	/*
	 * Notify other cpus that a system-wide drain is running.
	 * Races with cpu hotplug do not matter: cpu down as well as the
	 * workers queued here always operate on the local per-cpu data,
	 * and cpu up does not touch memcg_stock at all.
	 */
	this_cpu = get_cpu();

	for_each_online_cpu(cpu) {
		struct memcg_stock_pcp *pcp = &per_cpu(memcg_stock, cpu);
		struct mem_cgroup *cached;
		bool need_flush;

		rcu_read_lock();
		cached = pcp->cached;
		need_flush = cached && pcp->nr_pages &&
			     mem_cgroup_is_descendant(cached, root_memcg);
		if (obj_stock_flush_required(pcp, root_memcg))
			need_flush = true;
		rcu_read_unlock();

		/* Skip stocks with nothing relevant or already being flushed. */
		if (!need_flush ||
		    test_and_set_bit(FLUSHING_CACHED_CHARGE, &pcp->flags))
			continue;

		if (cpu == this_cpu)
			drain_local_stock(&pcp->work);
		else
			schedule_work_on(cpu, &pcp->work);
	}

	put_cpu();
	mutex_unlock(&percpu_charge_mutex);
}
/*
 * CPU hotplug "dead" callback.
 *
 * Returns the charges cached in @cpu's stock and folds the statistics
 * deltas still sitting in @cpu's per-cpu counters into the shared atomic
 * counters, so nothing is lost when the cpu goes away.
 *
 * Two things matter here:
 *  - the counters to drain are those of @cpu, not of the cpu running
 *    this callback, so they are addressed with per_cpu_ptr(..., @cpu);
 *  - a memcg's delta has to reach the memcg itself and each of its
 *    ancestors, so the upward walk must add to the walked memcg (@mi).
 *
 * Always returns 0.
 */
static int memcg_hotplug_cpu_dead(unsigned int cpu)
{
	struct memcg_stock_pcp *stock;
	struct mem_cgroup *memcg, *mi;

	stock = &per_cpu(memcg_stock, cpu);
	drain_stock(stock);

	for_each_mem_cgroup(memcg) {
		int i;

		for (i = 0; i < MEMCG_NR_STAT; i++) {
			int nid;
			long x;

			/* Take the dead cpu's pending delta and reset it. */
			x = xchg(&per_cpu_ptr(memcg->vmstats_percpu, cpu)->stat[i], 0);
			if (x)
				for (mi = memcg; mi; mi = parent_mem_cgroup(mi))
					atomic_long_add(x, &mi->vmstats[i]);

			/* Only the node stat items have per-lruvec counters. */
			if (i >= NR_VM_NODE_STAT_ITEMS)
				continue;

			for_each_node(nid) {
				struct mem_cgroup_per_node *pn;

				pn = mem_cgroup_nodeinfo(memcg, nid);
				x = xchg(&per_cpu_ptr(pn->lruvec_stat_cpu, cpu)->count[i], 0);
				if (x)
					do {
						atomic_long_add(x, &pn->lruvec_stat[i]);
					} while ((pn = parent_nodeinfo(pn, nid)));
			}
		}

		for (i = 0; i < NR_VM_EVENT_ITEMS; i++) {
			long x;

			x = xchg(&per_cpu_ptr(memcg->vmstats_percpu, cpu)->events[i], 0);
			if (x)
				for (mi = memcg; mi; mi = parent_mem_cgroup(mi))
					atomic_long_add(x, &mi->vmevents[i]);
		}
	}

	return 0;
}
#ifdef CONFIG_MEM_QOS
/*
 * True when memory QoS is enabled and @memcg's usage is above its async
 * high watermark.
 */
static bool need_memcg_async_reclaim(struct mem_cgroup *memcg)
{
	return sysctl_vm_memory_qos &&
	       page_counter_read(&memcg->memory) > memcg->memory.async_high;
}
/*
 * Work function of mem_cgroup::async_work: reclaim @memcg down towards its
 * async low watermark.
 *
 * The amount to reclaim is the distance between the current usage and the
 * async low watermark, capped at the distance between the two async
 * watermarks.  Nothing is done when usage is at or below the low
 * watermark.
 */
static void async_reclaim_func(struct work_struct *work)
{
	struct mem_cgroup *memcg;
	unsigned long usage, low;
	unsigned long nr_pages;

	memcg = container_of(work, struct mem_cgroup, async_work);

	usage = page_counter_read(&memcg->memory);
	low = memcg->memory.async_low;

	/*
	 * Compare before subtracting: all values are unsigned, so
	 * "usage - low" would wrap to a huge number instead of going
	 * negative when usage is already below the low watermark.
	 */
	if (usage <= low)
		return;

	nr_pages = usage - low;
	nr_pages = min(nr_pages, (memcg->memory.async_high - memcg->memory.async_low));

	memcg_memory_event(memcg, MEMCG_HIGH);
	try_to_free_mem_cgroup_pages(memcg, nr_pages, GFP_KERNEL, true);
}
#endif
/*
 * Walk from @memcg up through its ancestors and reclaim @nr_pages from
 * every level whose usage is above its high limit.
 */
static void reclaim_high(struct mem_cgroup *memcg,
			 unsigned int nr_pages,
			 gfp_t gfp_mask)
{
	do {
		if (page_counter_read(&memcg->memory) > READ_ONCE(memcg->high)) {
			memcg_memory_event(memcg, MEMCG_HIGH);
			try_to_free_mem_cgroup_pages(memcg, nr_pages,
						     gfp_mask, true);
		}
	} while ((memcg = parent_mem_cgroup(memcg)));
}
/* Work function of mem_cgroup::high_work: one batch of high-limit reclaim. */
static void high_work_func(struct work_struct *work)
{
	reclaim_high(container_of(work, struct mem_cgroup, high_work),
		     MEMCG_CHARGE_BATCH, GFP_KERNEL);
}
/*
* Clamp the maximum sleep time per allocation batch to 2 seconds. This is
* enough to still cause a significant slowdown in most cases, while still
* allowing diagnostics and tracing to proceed without becoming stuck.
*
* The value is expressed in jiffies (2 * HZ) and is used as the upper
* bound of the delay returned by calculate_high_delay().
*/
#define MEMCG_MAX_HIGH_DELAY_JIFFIES (2UL*HZ)
/*
* When calculating the delay, we use these either side of the exponentiation to
* maintain precision and scale to a reasonable number of jiffies (see the table
* below).
*
* - MEMCG_DELAY_PRECISION_SHIFT: Extra precision bits while translating the
* overage ratio to a delay.
* - MEMCG_DELAY_SCALING_SHIFT: The number of bits to scale down the
* proposed penalty in order to reduce to a reasonable number of jiffies, and
* to produce a reasonable delay curve.
*
* MEMCG_DELAY_SCALING_SHIFT just happens to be a number that produces a
* reasonable delay curve compared to precision-adjusted overage, not
* penalising heavily at first, but still making sure that growth beyond the
* limit penalises misbehaviour cgroups by slowing them down exponentially. For
* example, with a high of 100 megabytes:
*
* +-------+------------------------+
* | usage | time to allocate in ms |
* +-------+------------------------+
* | 100M | 0 |
* | 101M | 6 |
* | 102M | 25 |
* | 103M | 57 |
* | 104M | 102 |
* | 105M | 159 |
* | 106M | 230 |
* | 107M | 313 |
* | 108M | 409 |
* | 109M | 518 |
* | 110M | 639 |
* | 111M | 774 |
* | 112M | 921 |
* | 113M | 1081 |
* | 114M | 1254 |
* | 115M | 1439 |
* | 116M | 1638 |
* | 117M | 1849 |
* | 118M | 2000 |
* | 119M | 2000 |
* | 120M | 2000 |
* +-------+------------------------+
*/
/* Left shift applied to the overage before it is divided by the high limit. */
#define MEMCG_DELAY_PRECISION_SHIFT 20
/* Additional right shift applied to the squared overage to get jiffies. */
#define MEMCG_DELAY_SCALING_SHIFT 14
/*
 * Work out how many jiffies a task charging @nr_pages to @memcg should be
 * put to sleep because @memcg or one of its ancestors is above its
 * memory.high.  Returns 0 when no level is over its limit.
 */
static unsigned long calculate_high_delay(struct mem_cgroup *memcg,
					  unsigned int nr_pages)
{
	unsigned long penalty_jiffies;
	u64 max_overage = 0;

	/* Find the worst relative overage in the hierarchy. */
	do {
		unsigned long usage = page_counter_read(&memcg->memory);
		unsigned long high = READ_ONCE(memcg->high);
		u64 overage;

		if (usage > high) {
			/*
			 * Avoid dividing by zero below by treating a zero
			 * limit as a limit of one page.
			 */
			high = max(high, 1UL);

			overage = usage - high;
			overage <<= MEMCG_DELAY_PRECISION_SHIFT;
			overage = div64_u64(overage, high);

			if (overage > max_overage)
				max_overage = overage;
		}
	} while ((memcg = parent_mem_cgroup(memcg)) &&
		 !mem_cgroup_is_root(memcg));

	if (!max_overage)
		return 0;

	/*
	 * The penalty grows with the square of the overage: lenient for
	 * small overages, increasingly harsh for a memcg that keeps
	 * growing past its memory.high.
	 */
	penalty_jiffies = max_overage * max_overage * HZ;
	penalty_jiffies >>= MEMCG_DELAY_PRECISION_SHIFT;
	penalty_jiffies >>= MEMCG_DELAY_SCALING_SHIFT;

	/*
	 * Factor in the task's own contribution, so that four N-sized
	 * allocations are throttled about the same as one 4N-sized
	 * allocation.  MEMCG_CHARGE_BATCH pages is the nominal size; scale
	 * by how much smaller or larger the current charge batch is.
	 */
	penalty_jiffies = penalty_jiffies * nr_pages / MEMCG_CHARGE_BATCH;

	/*
	 * Clamp the delay per return to usermode so the application still
	 * moves forward, however slowly, and diagnostics remain possible.
	 */
	return min(penalty_jiffies, MEMCG_MAX_HIGH_DELAY_JIFFIES);
}
/*
 * Scheduled by try_charge() to run on the return-to-userland path:
 * reclaims the pages recorded in current->memcg_nr_pages_over_high and,
 * if the hierarchy is still over memory.high, throttles the task.
 */
void mem_cgroup_handle_over_high(void)
{
	unsigned int nr_pages = current->memcg_nr_pages_over_high;
	unsigned long penalty_jiffies;
	unsigned long pflags;
	struct mem_cgroup *memcg;
#ifdef CONFIG_CGROUP_SLI
	u64 start;
#endif

	if (likely(!nr_pages))
		return;

#ifdef CONFIG_CGROUP_SLI
	sli_memlat_stat_start(&start);
#endif

	memcg = get_mem_cgroup_from_mm(current->mm);
	reclaim_high(memcg, nr_pages, GFP_KERNEL);
	current->memcg_nr_pages_over_high = 0;

	/*
	 * memory.high is breached and reclaim is unable to keep up.
	 * Throttle allocators proactively to slow down excessive growth.
	 */
	penalty_jiffies = calculate_high_delay(memcg, nr_pages);

	/*
	 * Only sleep when the owed delay is worth it.  This is nice to
	 * those who are only slightly over memory.high and may simply not
	 * have been reclaimed aggressively enough yet.
	 *
	 * If the sleep ends early the task is guaranteed to die
	 * (schedule_timeout_killable() sets TASK_KILLABLE), so there is no
	 * need to track unpaid jiffies for later.
	 */
	if (penalty_jiffies > HZ / 100) {
		psi_memstall_enter(&pflags);
		schedule_timeout_killable(penalty_jiffies);
		psi_memstall_leave(&pflags);
	}

#ifdef CONFIG_CGROUP_SLI
	sli_memlat_stat_end(MEM_LAT_MEMCG_DIRECT_RECLAIM, start);
#endif
	css_put(&memcg->css);
}
#if defined(CONFIG_MEM_QOS)
/*
 * Recompute the async reclaim watermarks of @memcg and store them in its
 * memory page counter.
 *
 * The base limit is memcg->high on the default hierarchy and
 * memcg->memory.max otherwise.  With async_wmark set:
 *   async high = limit * async_wmark / ASYNC_RATIO_DIV
 *   distance   = limit * async_distance_factor / ASYNC_DISTANCE_DIV
 *   async low  = async high - distance, or memcg->memory.low when the
 *                distance is not smaller than the async high watermark.
 * With async_wmark clear both watermarks are set to PAGE_COUNTER_MAX.
 */
static void setup_async_wmark(struct mem_cgroup *memcg)
{
	unsigned long high_throttle, low_throttle, distance;
	unsigned long high = cgroup_subsys_on_dfl(memory_cgrp_subsys) ?
			     memcg->high : memcg->memory.max;

	if (memcg->async_wmark) {
		/*
		 * Use mult_frac() for both ratios: it divides before it
		 * multiplies, so a very large limit cannot overflow the
		 * intermediate product the way "wmark * high" can.
		 */
		high_throttle = mult_frac(high,
				memcg->async_wmark, ASYNC_RATIO_DIV);
		distance = mult_frac(high,
				memcg->async_distance_factor, ASYNC_DISTANCE_DIV);
		if (distance >= high_throttle)
			low_throttle = memcg->memory.low;
		else
			low_throttle = high_throttle - distance;
	} else {
		high_throttle = PAGE_COUNTER_MAX;
		low_throttle = PAGE_COUNTER_MAX;
	}

	page_counter_set_async_high(&memcg->memory, high_throttle);
	page_counter_set_async_low(&memcg->memory, low_throttle);
}
/*
 * Derive @memcg's async watermark ratio and distance factor from
 * @new_prio, refresh the watermarks and kick async reclaim if the memcg
 * is already above the new high watermark.  Does nothing when
 * async_wmark_delta is negative.
 */
static void async_reclaim_reset_factor(struct mem_cgroup *memcg,
				       unsigned int new_prio)
{
	unsigned int new_wmark, new_distance;

	if (memcg->async_wmark_delta < 0)
		return;

	new_wmark = ASYNC_RATIO_DIV -
		(CGROUP_PRIORITY_MAX - new_prio) * memcg->async_wmark_delta;
	xchg(&memcg->async_wmark, new_wmark);

	new_distance = memcg->async_distance_delta * (new_prio + 1);
	xchg(&memcg->async_distance_factor, new_distance);

	setup_async_wmark(memcg);

	if (need_memcg_async_reclaim(memcg)) {
		queue_work(memcg_async_reclaim_wq, &memcg->async_work);
	}
}
/*
* Task pointers for the priority reclaim threads; they are assigned
* outside this part of the file.
*/
static struct task_struct *memcg_priod = NULL;
static struct task_struct *memcg_priod_async = NULL;
/* Wait queue woken by wakeup_memcg_priod(). */
static DECLARE_WAIT_QUEUE_HEAD(memcg_prio_reclaim_wq);
/* Wake sleepers on memcg_prio_reclaim_wq, if there are any. */
static void wakeup_memcg_priod(void)
{
	/* XXX: check whether the waitqueue_active() test is necessary. */
	if (waitqueue_active(&memcg_prio_reclaim_wq))
		wake_up_interruptible(&memcg_prio_reclaim_wq);
}
/*
 * Re-validate memcg_cur_reclaim_prio: cap it at CGROUP_PRIORITY_MAX - 1,
 * then raise it to sysctl_vm_qos_highest_reclaim_prio if it is below
 * that, and wake the priority reclaim waiters.
 */
void memory_qos_update(void)
{
	spin_lock(&memcg_reclaim_prio_lock);

	if (memcg_cur_reclaim_prio > CGROUP_PRIORITY_MAX - 1) {
		memcg_cur_reclaim_prio = CGROUP_PRIORITY_MAX - 1;
	}

	if (memcg_cur_reclaim_prio < sysctl_vm_qos_highest_reclaim_prio) {
		memcg_cur_reclaim_prio = sysctl_vm_qos_highest_reclaim_prio;
	}

	spin_unlock(&memcg_reclaim_prio_lock);

	wakeup_memcg_priod();
}
/*
 * 128 MiB in bytes and in pages.  The expansions are parenthesized so the
 * macros stay a single operand wherever they are used (e.g. as the right
 * hand side of a division or modulo).
 */
#define MEM_128M (128 * 1024 * 1024)
#define MEM_128M_PAGES (MEM_128M / PAGE_SIZE)
/* Reclaim target in bytes; defaults to 1 GiB (8 * 128 MiB). */
unsigned long prio_reclaim_bytes = MEM_128M * 8;
/* Percentage of total RAM used by memory_qos_prio_reclaim_ratio_update(). */
unsigned int sysctl_vm_qos_prio_reclaim_ratio = 0;
/*
 * Recompute prio_reclaim_bytes as sysctl_vm_qos_prio_reclaim_ratio percent
 * of total RAM and wake the priority reclaim waiters.
 *
 * Returns 0 on success, or -EINVAL (leaving prio_reclaim_bytes untouched)
 * when the result would be below 128 MiB.
 */
int memory_qos_prio_reclaim_ratio_update(void)
{
	u64 mem_total = totalram_pages() * PAGE_SIZE;
	unsigned long reserve;

	reserve = (mem_total * sysctl_vm_qos_prio_reclaim_ratio) / 100;

	if (reserve >= MEM_128M) {
		prio_reclaim_bytes = reserve;
		wakeup_memcg_priod();
		return 0;
	}

	printk(KERN_WARNING "mem qos: reserve mem too small\n");
	return -EINVAL;
}
/*
* One bucket per priority, indexed by priority in
* memcg_notify_prio_change(): @head lists the memcgs (via ->prio_list),
* @lock protects the list, @count tracks the number of entries.
*/
static struct memcg_priority {
struct list_head head;
spinlock_t lock;
atomic_long_t count;
} memcg_prios[CGROUP_PRIORITY_MAX];
/*
* List of memcgs (via ->prio_list_async) that have a non-zero priority;
* updated with the RCU list primitives under @mutex.
*/
static struct memcg_global_reclaim {
struct list_head list;
struct mutex mutex;
} memcg_global_reclaim_list;
/*
* memcg_prio_hierarchy_count[i] is incremented for every i in 1..prio when
* a memcg takes priority prio, i.e. it counts memcgs with priority >= i.
* Protected by memcg_prio_hierarchy_lock.
*/
static int memcg_prio_hierarchy_count[CGROUP_PRIORITY_MAX + 1];
static DEFINE_RWLOCK(memcg_prio_hierarchy_lock);
/* Priority of @memcg as reported by cgroup_priority() for its css. */
static int memcg_get_prio(struct mem_cgroup *memcg)
{
return cgroup_priority(&memcg->css);
}
/* Forward declarations; the definitions are outside this part of the file. */
static bool mem_cgroup_prio_need_reclaim(struct mem_cgroup *);
static void mem_cgroup_notify_alloc(struct mem_cgroup *memcg, unsigned int nr_pages);
static bool mem_cgroup_notify_reclaim(struct mem_cgroup *memcg, unsigned int nr_pages);
static int mem_cgroup_resize_max(struct mem_cgroup *memcg,
unsigned long max, bool memsw);
static int memcg_prio_reclaimd_run(void);
/* Read memcg_prio_hierarchy_count[@prio] under the hierarchy read lock. */
static int memcg_get_prio_hierarchy_count(int prio)
{
	int count;

	read_lock(&memcg_prio_hierarchy_lock);
	count = memcg_prio_hierarchy_count[prio];
	read_unlock(&memcg_prio_hierarchy_lock);

	return count;
}
/*
 * True when the hierarchy count for sysctl_vm_qos_highest_reclaim_prio is
 * non-zero.
 */
static bool memcg_reclaim_prio_exist(void)
{
	int prio = sysctl_vm_qos_highest_reclaim_prio;

	return memcg_get_prio_hierarchy_count(prio) != 0;
}
/*
 * Move @memcg from the bookkeeping of @old_prio to that of @new_prio.
 *
 * Priority 0 has no bucket: a memcg is only on a per-priority list, and
 * on the global reclaim list, while its priority is non-zero.  A NULL
 * @memcg is ignored.  Always returns 0.
 */
static int memcg_notify_prio_change(struct mem_cgroup *memcg,
				    unsigned int old_prio, unsigned int new_prio)
{
	struct memcg_priority *bucket;
	int level;

	if (!memcg)
		return 0;

	/* Leave the old bucket and drop the hierarchy counts. */
	if (old_prio) {
		bucket = &memcg_prios[old_prio];

		spin_lock(&bucket->lock);
		list_del(&memcg->prio_list);
		spin_unlock(&bucket->lock);
		atomic_long_dec(&bucket->count);

		write_lock(&memcg_prio_hierarchy_lock);
		for (level = 1; level <= old_prio; level++)
			memcg_prio_hierarchy_count[level]--;
		write_unlock(&memcg_prio_hierarchy_lock);
	}

	/* Join the new bucket, raise the counts and wake the reclaimer. */
	if (new_prio) {
		bucket = &memcg_prios[new_prio];

		spin_lock(&bucket->lock);
		list_add(&memcg->prio_list, &bucket->head);
		spin_unlock(&bucket->lock);
		atomic_long_inc(&bucket->count);

		write_lock(&memcg_prio_hierarchy_lock);
		for (level = 1; level <= new_prio; level++)
			memcg_prio_hierarchy_count[level]++;
		write_unlock(&memcg_prio_hierarchy_lock);

		wakeup_memcg_priod();
	}

	/* Keep the global list in sync on transitions from/to priority 0. */
	if (!old_prio && new_prio) {
		mutex_lock(&memcg_global_reclaim_list.mutex);
		list_add_tail_rcu(&memcg->prio_list_async,
				  &memcg_global_reclaim_list.list);
		mutex_unlock(&memcg_global_reclaim_list.mutex);
	} else if (old_prio && !new_prio) {
		mutex_lock(&memcg_global_reclaim_list.mutex);
		list_del_rcu(&memcg->prio_list_async);
		mutex_unlock(&memcg_global_reclaim_list.mutex);
	}

	return 0;
}
/*
 * Notify the memory controller that the priority of the cgroup behind
 * @css changed from @old_prio to @new_prio: recompute the async reclaim
 * factors and move the memcg between the priority lists.
 *
 * Returns the result of memcg_notify_prio_change(), or 0 when @css does
 * not map to a memcg.
 */
int mem_cgroup_notify_prio_change(struct cgroup_subsys_state *css,
				  u16 old_prio, u16 new_prio)
{
	struct mem_cgroup *memcg = mem_cgroup_from_css(css);

	/*
	 * memcg_notify_prio_change() tolerates a NULL memcg, but
	 * async_reclaim_reset_factor() dereferences it right away.  Apply
	 * the same tolerance up front so both callees agree.
	 */
	if (!memcg)
		return 0;

	async_reclaim_reset_factor(memcg, new_prio);

	return memcg_notify_prio_change(memcg, old_prio, new_prio);
}
#else
/* !CONFIG_MEM_QOS stub: there is never a reclaim priority. */
static inline bool memcg_reclaim_prio_exist(void)
{
return false;
}
/* !CONFIG_MEM_QOS stub: nothing to notify, always false. */
static inline bool mem_cgroup_notify_reclaim(struct mem_cgroup *memcg, unsigned int nr_pages)
{
return false;
}
/* !CONFIG_MEM_QOS stub: nothing to notify. */
static inline void mem_cgroup_notify_alloc(struct mem_cgroup *memcg, unsigned int nr_pages)
{
return;
}
/* !CONFIG_MEM_QOS stub: priority reclaim is never needed. */
static inline bool mem_cgroup_prio_need_reclaim(struct mem_cgroup *memcg)
{
return false;
}
#endif
/*
* Charge @nr_pages to @memcg.
*
* The root memcg is never charged. Otherwise the per-cpu stock is tried
* first, then the page counters are charged with a batch; if a limit is
* hit the function reclaims, drains the per-cpu stocks and finally invokes
* the memcg OOM handling before giving up.
*
* Returns 0 when the charge was made (possibly forced over the limit) and
* -ENOMEM when it was refused.
*/
static int try_charge(struct mem_cgroup *memcg, gfp_t gfp_mask,
unsigned int nr_pages)
{
/* Charge at least a full batch; the surplus goes to the per-cpu stock. */
unsigned int batch = max(MEMCG_CHARGE_BATCH, nr_pages);
int nr_retries = MEM_CGROUP_RECLAIM_RETRIES;
struct mem_cgroup *mem_over_limit;
struct page_counter *counter;
unsigned long nr_reclaimed;
bool may_swap = true;
bool drained = false;
enum oom_status oom_status;
#ifdef CONFIG_CGROUP_SLI
u64 start;
#endif
bool need_reclaim = sysctl_vm_memory_qos && memcg_reclaim_prio_exist();
if (mem_cgroup_is_root(memcg))
return 0;
retry:
if (consume_stock(memcg, nr_pages))
return 0;
/*
* Charge memsw first (when accounted), then memory. If memory fails the
* memsw charge is rolled back. The counter that refused the charge
* identifies the memcg that is over its limit.
*/
if (!do_memsw_account() ||
page_counter_try_charge(&memcg->memsw, batch, &counter)) {
if (page_counter_try_charge(&memcg->memory, batch, &counter))
goto done_restock;
if (do_memsw_account())
page_counter_uncharge(&memcg->memsw, batch);
mem_over_limit = mem_cgroup_from_counter(counter, memory);
} else {
mem_over_limit = mem_cgroup_from_counter(counter, memsw);
may_swap = false;
}
retry_failed_reclaim:
/* Before doing anything expensive, retry without the batch surplus. */
if (batch > nr_pages) {
batch = nr_pages;
goto retry;
}
/*
* Memcg doesn't have a dedicated reserve for atomic
* allocations. But like the global atomic pool, we need to
* put the burden of reclaim on regular allocation requests
* and let these go through as privileged allocations.
*/
if (gfp_mask & __GFP_ATOMIC)
goto force;
/*
* Unlike in global OOM situations, memcg is not in a physical
* memory shortage. Allow dying and OOM-killed tasks to
* bypass the last charges so that they can exit quickly and
* free their memory.
*/
if (unlikely(should_force_charge()))
goto force;
/*
* Prevent unbounded recursion when reclaim operations need to
* allocate memory. This might exceed the limits temporarily,
* but we prefer facilitating memory reclaim and getting back
* under the limit over triggering OOM kills in these cases.
*/
if (unlikely(current->flags & PF_MEMALLOC))
goto force;
if (unlikely(task_in_memcg_oom(current)))
goto nomem;
if (!gfpflags_allow_blocking(gfp_mask))
goto nomem;
memcg_memory_event(mem_over_limit, MEMCG_MAX);
#ifdef CONFIG_CGROUP_SLI
sli_memlat_stat_start(&start);
#endif
nr_reclaimed = try_to_free_mem_cgroup_pages(mem_over_limit, nr_pages,
gfp_mask, may_swap);
need_reclaim = need_reclaim && mem_cgroup_notify_reclaim(mem_over_limit, nr_reclaimed);
#ifdef CONFIG_CGROUP_SLI
sli_memlat_stat_end(MEM_LAT_MEMCG_DIRECT_RECLAIM, start);
#endif
if (mem_cgroup_margin(mem_over_limit) >= nr_pages)
goto retry;
/* Charges cached in per-cpu stocks may be what keeps us over: drain once. */
if (!drained) {
drain_all_stock(mem_over_limit);
drained = true;
goto retry;
}
if (gfp_mask & __GFP_NORETRY)
goto nomem;
/*
* Even though the limit is exceeded at this point, reclaim
* may have been able to free some pages. Retry the charge
* before killing the task.
*
* Only for regular pages, though: huge pages are rather
* unlikely to succeed so close to the limit, and we fall back
* to regular pages anyway in case of failure.
*/
if (nr_reclaimed && nr_pages <= (1 << PAGE_ALLOC_COSTLY_ORDER))
goto retry;
/*
* At task move, charge accounts can be doubly counted. So, it's
* better to wait until the end of task_move if something is going on.
*/
if (mem_cgroup_wait_acct_move(mem_over_limit))
goto retry;
if (nr_retries--)
goto retry;
if (gfp_mask & __GFP_RETRY_MAYFAIL)
goto nomem;
if (gfp_mask & __GFP_NOFAIL)
goto force;
if (fatal_signal_pending(current))
goto force;
/*
* keep retrying as long as the memcg oom killer is able to make
* a forward progress or bypass the charge if the oom killer
* couldn't make any progress.
*/
oom_status = mem_cgroup_oom(mem_over_limit, gfp_mask,
get_order(nr_pages * PAGE_SIZE));
switch (oom_status) {
case OOM_SUCCESS:
nr_retries = MEM_CGROUP_RECLAIM_RETRIES;
goto retry;
case OOM_FAILED:
goto force;
default:
goto nomem;
}
nomem:
/* __GFP_NOFAIL requests fall through to the forced charge. */
if (!(gfp_mask & __GFP_NOFAIL))
return -ENOMEM;
force:
/*
* The allocation either can't fail or will lead to more memory
* being freed very soon. Allow memory usage go over the limit
* temporarily by force charging it.
*/
page_counter_charge(&memcg->memory, nr_pages);
if (do_memsw_account())
page_counter_charge(&memcg->memsw, nr_pages);
if (sysctl_vm_memory_qos && memcg_reclaim_prio_exist())
mem_cgroup_notify_alloc(mem_over_limit, nr_pages);
return 0;
done_restock:
/*
* The charge succeeded. If priority reclaim is still wanted for this
* memcg, undo the charge and go through the reclaim path above with
* @memcg itself as the reclaim target.
*/
if (need_reclaim) {
need_reclaim = mem_cgroup_prio_need_reclaim(memcg);
if (need_reclaim) {
mem_over_limit = memcg;
page_counter_uncharge(&memcg->memory, batch);
if (do_memsw_account())
page_counter_uncharge(&memcg->memsw, batch);
goto retry_failed_reclaim;
}
}
/* Try to throttle allocation speed if needed */
mem_cgroup_mst_spd_throttle(memcg);
/* Park the part of the batch that was not requested in the per-cpu stock. */
if (batch > nr_pages)
refill_stock(memcg, batch - nr_pages);
if (sysctl_vm_memory_qos && memcg_reclaim_prio_exist())
mem_cgroup_notify_alloc(memcg, batch);
/*
* If the hierarchy is above the normal consumption range, schedule
* reclaim on returning to userland. We can perform reclaim here
* if __GFP_RECLAIM but let's always punt for simplicity and so that
* GFP_KERNEL can consistently be used during reclaim. @memcg is
* not recorded as it most likely matches current's and won't
* change in the meantime. As high limit is checked again before
* reclaim, the cost of mismatch is negligible.
*/
do {
#if defined(CONFIG_MEM_QOS)
if (need_memcg_async_reclaim(memcg)) {
/* Kick off per memory cgroup async reclaim */
queue_work(memcg_async_reclaim_wq, &memcg->async_work);
break;
}
#endif
if (page_counter_read(&memcg->memory) > READ_ONCE(memcg->high)) {
/* Don't bother a random interrupted task */
if (in_interrupt()) {
schedule_work(&memcg->high_work);
break;
}
current->memcg_nr_pages_over_high += batch;
set_notify_resume(current);
break;
}
} while ((memcg = parent_mem_cgroup(memcg)));
return 0;
}
#if defined(CONFIG_MEMCG_KMEM) || defined(CONFIG_MMU)
/*
 * cancel_charge - give back @nr_pages previously charged to @memcg.
 *
 * Nothing is done for the root cgroup.  The memsw counter is only
 * uncharged when do_memsw_account() is true.
 */
static void cancel_charge(struct mem_cgroup *memcg, unsigned int nr_pages)
{
	if (!mem_cgroup_is_root(memcg)) {
		page_counter_uncharge(&memcg->memory, nr_pages);
		if (do_memsw_account())
			page_counter_uncharge(&memcg->memsw, nr_pages);
	}
}
#endif
/*
 * commit_charge - record @memcg as the owner of @page.
 *
 * Stores @memcg in page->mem_cgroup.  The page must not have an owner yet;
 * this is asserted with VM_BUG_ON_PAGE().
 */
static void commit_charge(struct page *page, struct mem_cgroup *memcg)
{
VM_BUG_ON_PAGE(page->mem_cgroup, page);
/*
* Nobody should be changing or seriously looking at
* page->mem_cgroup at this point:
*
* - the page is uncharged
*
* - the page is off-LRU
*
* - an anonymous fault has exclusive page access, except for
* a locked page table
*
* - a page cache insertion, a swapin fault, or a migration
* have the page locked
*
* NOTE(review): since the MGLRU series, mem_cgroup_trylock_pages()
* apparently depends on the stability of this memcg binding; keep
* that in mind for further backporting (confirm against the MGLRU
* backport).
*/
page->mem_cgroup = memcg;
}
/*
 * get_mem_cgroup_from_objcg - resolve @objcg to its memcg and pin it.
 *
 * Under rcu_read_lock(), look up the memcg behind @objcg and try to take a
 * css reference on it.  The lookup is repeated until css_tryget() succeeds.
 * The returned memcg carries that reference.
 */
static struct mem_cgroup *get_mem_cgroup_from_objcg(struct obj_cgroup *objcg)
{
	struct mem_cgroup *memcg;

	rcu_read_lock();
	do {
		memcg = obj_cgroup_memcg(objcg);
	} while (unlikely(!css_tryget(&memcg->css)));
	rcu_read_unlock();

	return memcg;
}
#ifdef CONFIG_MEMCG_KMEM
/*
 * The objcg pointer vector itself is not accounted.  It also must not be
 * taken from the DMA zone and is not readily reclaimable, hence these GFP
 * bits are stripped before allocating it.
 */
#define OBJCGS_CLEAR_MASK	(__GFP_DMA | __GFP_RECLAIMABLE | __GFP_ACCOUNT)

/*
 * memcg_alloc_page_obj_cgroups - attach an objcg pointer vector to a slab page.
 * @page: slab page
 * @s: cache the page belongs to
 * @gfp: allocation flags (bits in OBJCGS_CLEAR_MASK are ignored)
 *
 * Allocates one zeroed slot per object of @page and installs the vector,
 * tagged with its lowest bit set, into page->obj_cgroups using cmpxchg().
 * If another vector is already installed, the fresh one is freed.
 *
 * Returns 0 on success (including the lost-race case), -ENOMEM if the
 * vector cannot be allocated.
 */
int memcg_alloc_page_obj_cgroups(struct page *page, struct kmem_cache *s,
				 gfp_t gfp)
{
	unsigned int nr_objs = objs_per_slab_page(s, page);
	struct obj_cgroup **tagged;
	void *vec;

	vec = kcalloc_node(nr_objs, sizeof(struct obj_cgroup *),
			   gfp & ~OBJCGS_CLEAR_MASK, page_to_nid(page));
	if (!vec)
		return -ENOMEM;

	tagged = (struct obj_cgroup **) ((unsigned long)vec | 0x1UL);
	if (cmpxchg(&page->obj_cgroups, NULL, tagged)) {
		/* Somebody else installed a vector first. */
		kfree(vec);
		return 0;
	}

	kmemleak_not_leak(vec);
	return 0;
}
/*
 * mem_cgroup_from_obj - find the memory cgroup a kernel object is charged to.
 * @p: address of the object
 *
 * Returns NULL when the memory controller is disabled, when the backing
 * page carries no memcg data, or when the slab object has no objcg attached.
 *
 * The caller must ensure the memcg lifetime, e.g. by taking rcu_read_lock(),
 * cgroup_mutex, etc.
 */
struct mem_cgroup *mem_cgroup_from_obj(void *p)
{
	struct obj_cgroup *objcg;
	struct page *page;

	if (mem_cgroup_disabled())
		return NULL;

	page = virt_to_head_page(p);

	/*
	 * A non-NULL page->mem_cgroup is either a plain mem_cgroup pointer
	 * or an obj_cgroup vector with the lowest pointer bit set.  It may
	 * change asynchronously from NULL to (obj_cgroup_vec | 0x1UL), but
	 * never from a valid memcg pointer to an objcg vector or back.
	 */
	if (!page->mem_cgroup)
		return NULL;

	/* Pages without an objcg vector are owned as a whole. */
	if (!page_has_obj_cgroups(page))
		return page->mem_cgroup;

	/*
	 * Slab objects are accounted one by one: the owner of each object
	 * lives in the page's objcg vector at the object's index.
	 */
	objcg = page_obj_cgroups(page)[obj_to_index(page->slab_cache, page, p)];

	return objcg ? obj_cgroup_memcg(objcg) : NULL;
}
/*
 * get_obj_cgroup_from_current - get a referenced objcg for the current task.
 *
 * Starts at current->active_memcg if set, otherwise at the task's memcg,
 * and walks towards the root until an objcg can be pinned with
 * obj_cgroup_tryget().  Returns NULL if the task has neither an mm nor an
 * active memcg, or if the walk reaches root_mem_cgroup without success.
 */
__always_inline struct obj_cgroup *get_obj_cgroup_from_current(void)
{
	struct obj_cgroup *found = NULL;
	struct mem_cgroup *memcg;

	if (unlikely(!current->mm && !current->active_memcg))
		return NULL;

	rcu_read_lock();
	if (unlikely(current->active_memcg))
		memcg = rcu_dereference(current->active_memcg);
	else
		memcg = mem_cgroup_from_task(current);

	while (memcg != root_mem_cgroup) {
		struct obj_cgroup *candidate = rcu_dereference(memcg->objcg);

		if (candidate && obj_cgroup_tryget(candidate)) {
			found = candidate;
			break;
		}
		memcg = parent_mem_cgroup(memcg);
	}
	rcu_read_unlock();

	return found;
}
/*
 * memcg_alloc_cache_id - allocate a kmemcg id from memcg_cache_ida.
 *
 * If the new id does not fit below memcg_nr_cache_ids, the list_lru arrays
 * are grown (under memcg_cache_ids_sem) to twice the needed size, bounded
 * by MEMCG_CACHES_MIN_SIZE and MEMCG_CACHES_MAX_SIZE.
 *
 * Returns the id on success or a negative errno; on a failed resize the id
 * is released again.
 */
static int memcg_alloc_cache_id(void)
{
	int id, new_size;
	int err;

	id = ida_simple_get(&memcg_cache_ida,
			    0, MEMCG_CACHES_MAX_SIZE, GFP_KERNEL);
	/* Allocation failure, or the id already fits the current arrays. */
	if (id < 0 || id < memcg_nr_cache_ids)
		return id;

	/* No room for the new id in the memcg_caches arrays: grow them. */
	down_write(&memcg_cache_ids_sem);

	new_size = 2 * (id + 1);
	if (new_size < MEMCG_CACHES_MIN_SIZE)
		new_size = MEMCG_CACHES_MIN_SIZE;
	else if (new_size > MEMCG_CACHES_MAX_SIZE)
		new_size = MEMCG_CACHES_MAX_SIZE;

	err = memcg_update_all_list_lrus(new_size);
	if (!err)
		memcg_nr_cache_ids = new_size;

	up_write(&memcg_cache_ids_sem);

	if (!err)
		return id;

	ida_simple_remove(&memcg_cache_ida, id);
	return err;
}
/* Release @id back to memcg_cache_ida. */
static void memcg_free_cache_id(int id)
{
ida_simple_remove(&memcg_cache_ida, id);
}
/*
 * Uncharge @nr_pages kernel pages from the memcg that @objcg currently
 * points to.  The memcg is pinned for the duration of the call.
 */
static void obj_cgroup_uncharge_pages(struct obj_cgroup *objcg,
				      unsigned int nr_pages)
{
	struct mem_cgroup *memcg = get_mem_cgroup_from_objcg(objcg);

	__memcg_kmem_uncharge(memcg, nr_pages);
	css_put(&memcg->css);
}
/*
 * Charge @nr_pages kernel pages to the memcg that @objcg currently points
 * to.  The memcg is pinned for the duration of the call.
 *
 * Returns the result of __memcg_kmem_charge().
 */
static int obj_cgroup_charge_pages(struct obj_cgroup *objcg, gfp_t gfp,
				   unsigned int nr_pages)
{
	struct mem_cgroup *memcg = get_mem_cgroup_from_objcg(objcg);
	int err = __memcg_kmem_charge(memcg, gfp, nr_pages);

	css_put(&memcg->css);
	return err;
}
/**
 * __memcg_kmem_charge: charge a number of kernel pages to a memcg
 * @memcg: memory cgroup to charge
 * @gfp: reclaim mode
 * @nr_pages: number of pages to charge
 *
 * The pages are first charged through try_charge().  When the memory
 * controller is not on the default hierarchy, the separate kmem counter is
 * charged as well; if that fails, the first charge is cancelled unless
 * @gfp contains __GFP_NOFAIL.
 *
 * Returns 0 on success, an error code on failure.
 */
int __memcg_kmem_charge(struct mem_cgroup *memcg, gfp_t gfp,
			unsigned int nr_pages)
{
	struct page_counter *counter;
	int ret;

	ret = try_charge(memcg, gfp, nr_pages);
	if (ret)
		return ret;

	/* The default hierarchy has no separate kmem counter to charge. */
	if (cgroup_subsys_on_dfl(memory_cgrp_subsys))
		return 0;

	if (page_counter_try_charge(&memcg->kmem, nr_pages, &counter))
		return 0;

	/*
	 * __GFP_NOFAIL callers are not prepared to see failures and most
	 * likely have no failure handling, so force the charge for them.
	 */
	if (gfp & __GFP_NOFAIL) {
		page_counter_charge(&memcg->kmem, nr_pages);
		return 0;
	}

	cancel_charge(memcg, nr_pages);
	return -ENOMEM;
}
/**
* __memcg_kmem_uncharge: uncharge a number of kernel pages from a memcg
* @memcg: memcg to uncharge
* @nr_pages: number of pages to uncharge
*
* The kmem counter is only uncharged when the memory controller is not on
* the default hierarchy.  The pages are then handed to refill_stock().
*/
void __memcg_kmem_uncharge(struct mem_cgroup *memcg, unsigned int nr_pages)
{
if (!cgroup_subsys_on_dfl(memory_cgrp_subsys))
page_counter_uncharge(&memcg->kmem, nr_pages);
refill_stock(memcg, nr_pages);
}
/**
 * __memcg_kmem_charge_page: charge a kmem page to the current memory cgroup
 * @page: page to charge
 * @gfp: reclaim mode
 * @order: allocation order
 *
 * On a successful charge the memcg reference taken here is kept and stored
 * in page->mem_cgroup, and the page is flagged PageKmemcg.  In every other
 * case (root memcg, charge failure) the reference is dropped again.
 *
 * Returns 0 on success, an error code on failure.
 */
int __memcg_kmem_charge_page(struct page *page, gfp_t gfp, int order)
{
	struct mem_cgroup *memcg;
	int ret = 0;

	if (memcg_kmem_bypass())
		return 0;

	memcg = get_mem_cgroup_from_current();
	if (mem_cgroup_is_root(memcg))
		goto out_put;

	ret = __memcg_kmem_charge(memcg, gfp, 1 << order);
	if (ret)
		goto out_put;

	/* The css reference now belongs to the page. */
	page->mem_cgroup = memcg;
	__SetPageKmemcg(page);
	return 0;

out_put:
	css_put(&memcg->css);
	return ret;
}
/**
 * __memcg_kmem_uncharge_page: uncharge a kmem page
 * @page: page to uncharge
 * @order: allocation order
 *
 * Does nothing for pages without an owner.  Otherwise uncharges
 * 1 << @order pages, clears page->mem_cgroup and drops the css reference
 * held through the page.
 */
void __memcg_kmem_uncharge_page(struct page *page, int order)
{
	struct mem_cgroup *owner = page->mem_cgroup;
	unsigned int nr_pages;

	if (!owner)
		return;

	nr_pages = 1 << order;

	VM_BUG_ON_PAGE(mem_cgroup_is_root(owner), page);
	__memcg_kmem_uncharge(owner, nr_pages);
	page->mem_cgroup = NULL;
	css_put(&owner->css);

	/* slab pages do not have PageKmemcg flag set */
	if (PageKmemcg(page))
		__ClearPageKmemcg(page);
}
/*
 * mod_objcg_state - account a slab vmstat delta for @objcg on @pgdat.
 * @objcg: object cgroup the delta belongs to
 * @pgdat: node the delta belongs to
 * @idx: NR_SLAB_RECLAIMABLE_B selects the reclaimable bucket; every other
 *       value is accumulated in the unreclaimable bucket
 * @nr: signed delta to apply
 *
 * Deltas are batched in the obj_stock obtained from get_obj_stock() and
 * only pushed out via mod_objcg_mlstate() when the cached objcg or pgdat
 * changes, or when the accumulated magnitude exceeds PAGE_SIZE.
 */
void mod_objcg_state(struct obj_cgroup *objcg, struct pglist_data *pgdat,
enum node_stat_item idx, int nr)
{
unsigned long flags;
struct obj_stock *stock = get_obj_stock(&flags);
int *bytes;
/*
* Save vmstat data in stock and skip vmstat array update unless
* accumulating over a page of vmstat data or when pgdat or idx
* changes.
*/
if (stock->cached_objcg != objcg) {
/* Different objcg: flush the old one and start caching @objcg. */
drain_obj_stock(stock);
obj_cgroup_get(objcg);
/* Take over any bytes parked in objcg->nr_charged_bytes. */
stock->nr_bytes = atomic_read(&objcg->nr_charged_bytes)
? atomic_xchg(&objcg->nr_charged_bytes, 0) : 0;
stock->cached_objcg = objcg;
stock->cached_pgdat = pgdat;
} else if (stock->cached_pgdat != pgdat) {
/* Flush the existing cached vmstat data */
struct pglist_data *oldpg = stock->cached_pgdat;
if (stock->nr_slab_reclaimable_b) {
mod_objcg_mlstate(objcg, oldpg, NR_SLAB_RECLAIMABLE_B,
stock->nr_slab_reclaimable_b);
stock->nr_slab_reclaimable_b = 0;
}
if (stock->nr_slab_unreclaimable_b) {
mod_objcg_mlstate(objcg, oldpg, NR_SLAB_UNRECLAIMABLE_B,
stock->nr_slab_unreclaimable_b);
stock->nr_slab_unreclaimable_b = 0;
}
stock->cached_pgdat = pgdat;
}
/* Pick the per-stock accumulator matching @idx. */
bytes = (idx == NR_SLAB_RECLAIMABLE_B) ? &stock->nr_slab_reclaimable_b
: &stock->nr_slab_unreclaimable_b;
/*
* Even for large object >= PAGE_SIZE, the vmstat data will still be
* cached locally at least once before pushing it out.
*/
if (!*bytes) {
*bytes = nr;
nr = 0;
} else {
*bytes += nr;
if (abs(*bytes) > PAGE_SIZE) {
/* Accumulated enough: push the whole amount out below. */
nr = *bytes;
*bytes = 0;
} else {
nr = 0;
}
}
if (nr)
mod_objcg_mlstate(objcg, pgdat, idx, nr);
put_obj_stock(flags);
}
/*
 * consume_obj_stock - try to satisfy a charge from the local obj stock.
 *
 * Succeeds only if the stock currently caches @objcg and holds at least
 * @nr_bytes pre-charged bytes, in which case they are deducted.
 *
 * Returns true if the bytes were taken from the stock, false otherwise.
 */
static bool consume_obj_stock(struct obj_cgroup *objcg, unsigned int nr_bytes)
{
	unsigned long flags;
	struct obj_stock *stock = get_obj_stock(&flags);
	bool hit;

	hit = (objcg == stock->cached_objcg && stock->nr_bytes >= nr_bytes);
	if (hit)
		stock->nr_bytes -= nr_bytes;

	put_obj_stock(flags);

	return hit;
}
/*
 * drain_obj_stock - flush everything cached in @stock for its objcg.
 *
 * Whole pages worth of pre-charged bytes are uncharged, the sub-page
 * remainder is added to the objcg's nr_charged_bytes, cached slab vmstat
 * deltas are pushed out, and finally the objcg reference held by the stock
 * is dropped.  Does nothing if the stock caches no objcg.
 */
static void drain_obj_stock(struct obj_stock *stock)
{
struct obj_cgroup *old = stock->cached_objcg;
if (!old)
return;
if (stock->nr_bytes) {
unsigned int nr_pages = stock->nr_bytes >> PAGE_SHIFT;
unsigned int nr_bytes = stock->nr_bytes & (PAGE_SIZE - 1);
if (nr_pages)
obj_cgroup_uncharge_pages(old, nr_pages);
/*
* The leftover is flushed to the centralized per-memcg value.
* On the next attempt to refill obj stock it will be moved
* to a per-cpu stock (probably, on an other CPU), see
* refill_obj_stock().
*
* How often it's flushed is a trade-off between the memory
* limit enforcement accuracy and potential CPU contention,
* so it might be changed in the future.
*/
atomic_add(nr_bytes, &old->nr_charged_bytes);
stock->nr_bytes = 0;
}
/*
* Flush the vmstat data in current stock
*/
if (stock->nr_slab_reclaimable_b || stock->nr_slab_unreclaimable_b) {
if (stock->nr_slab_reclaimable_b) {
mod_objcg_mlstate(old, stock->cached_pgdat,
NR_SLAB_RECLAIMABLE_B,
stock->nr_slab_reclaimable_b);
stock->nr_slab_reclaimable_b = 0;
}
if (stock->nr_slab_unreclaimable_b) {
mod_objcg_mlstate(old, stock->cached_pgdat,
NR_SLAB_UNRECLAIMABLE_B,
stock->nr_slab_unreclaimable_b);
stock->nr_slab_unreclaimable_b = 0;
}
stock->cached_pgdat = NULL;
}
/* Drop the reference the stock held on the objcg. */
obj_cgroup_put(old);
stock->cached_objcg = NULL;
}
static bool obj_stock_flush_required(struct memcg_stock_pcp *stock,
struct mem_cgroup *root_memcg)
{
struct mem_cgroup *memcg;
if (in_task() && stock->task_obj.cached_objcg) {
memcg = obj_cgroup_memcg(stock->task_obj.cached_objcg);
if (memcg && mem_cgroup_is_descendant(memcg, root_memcg))
return true;
}
if (stock->irq_obj.cached_objcg) {
memcg = obj_cgroup_memcg(stock->irq_obj.cached_objcg);
if (memcg && mem_cgroup_is_descendant(memcg, root_memcg))
return true;
}
return false;
}
/*
 * refill_obj_stock - return @nr_bytes of pre-charged memory to the obj stock.
 * @objcg: object cgroup the bytes belong to
 * @nr_bytes: number of bytes to put back
 * @allow_uncharge: whether whole pages may be uncharged right away
 *
 * If the stock caches a different objcg it is drained and re-targeted at
 * @objcg first, which also forces @allow_uncharge on.  Once the stock holds
 * more than PAGE_SIZE bytes and uncharging is allowed, the whole pages are
 * uncharged after the stock has been released.
 */
static void refill_obj_stock(struct obj_cgroup *objcg, unsigned int nr_bytes,
			     bool allow_uncharge)
{
	unsigned long flags;
	struct obj_stock *stock = get_obj_stock(&flags);
	unsigned int surplus_pages = 0;

	if (stock->cached_objcg != objcg) {
		/* Switch the stock over to @objcg. */
		drain_obj_stock(stock);
		obj_cgroup_get(objcg);
		stock->cached_objcg = objcg;
		if (atomic_read(&objcg->nr_charged_bytes))
			stock->nr_bytes = atomic_xchg(&objcg->nr_charged_bytes, 0);
		else
			stock->nr_bytes = 0;
		/* Allow uncharge when objcg changes */
		allow_uncharge = true;
	}

	stock->nr_bytes += nr_bytes;

	if (allow_uncharge && (stock->nr_bytes > PAGE_SIZE)) {
		surplus_pages = stock->nr_bytes >> PAGE_SHIFT;
		stock->nr_bytes &= (PAGE_SIZE - 1);
	}

	put_obj_stock(flags);

	if (surplus_pages)
		obj_cgroup_uncharge_pages(objcg, surplus_pages);
}
/*
 * obj_cgroup_charge - charge @size bytes to @objcg.
 * @objcg: object cgroup to charge
 * @gfp: reclaim mode
 * @size: number of bytes to charge
 *
 * The local obj stock is tried first.  Failing that, @size is rounded up
 * to whole pages, the pages are charged, and the unused tail of the last
 * page is put into the obj stock.
 *
 * Returns 0 on success, an error code on failure.
 */
int obj_cgroup_charge(struct obj_cgroup *objcg, gfp_t gfp, size_t size)
{
	unsigned int nr_pages, nr_bytes;
	int ret;

	if (consume_obj_stock(objcg, size))
		return 0;

	/*
	 * objcg->nr_charged_bytes could in theory already cover this
	 * allocation.  Flushing it costs two atomic operations though, it
	 * can't be big, and being shared it can turn into a bottleneck when
	 * all tasks of a memcg hammer on it.  So it is ignored here and new
	 * pages are charged instead; the stock's nr_bytes gets flushed to
	 * objcg->nr_charged_bytes later, when the cached objcg changes.
	 *
	 * Likewise the stock's nr_bytes might hold enough pre-charged bytes
	 * to save one page, but they may change outside of
	 * consume_obj_stock() or refill_obj_stock(), so they are not relied
	 * upon either.  To avoid a page uncharge right after a page charge,
	 * refill_obj_stock() is called with allow_uncharge == false, which
	 * temporarily lets the pre-charged bytes exceed the page size limit.
	 * The maximum reachable value of the pre-charged bytes is
	 * (sizeof(object) + PAGE_SIZE - 2) if there is no data race.
	 */
	nr_bytes = size & (PAGE_SIZE - 1);
	nr_pages = (size >> PAGE_SHIFT) + (nr_bytes ? 1 : 0);

	ret = obj_cgroup_charge_pages(objcg, gfp, nr_pages);
	if (ret)
		return ret;

	if (nr_bytes)
		refill_obj_stock(objcg, PAGE_SIZE - nr_bytes, false);

	return 0;
}
/*
 * obj_cgroup_uncharge - give @size bytes back to @objcg.
 *
 * The bytes are returned through the obj stock with immediate uncharging
 * of whole pages allowed.
 */
void obj_cgroup_uncharge(struct obj_cgroup *objcg, size_t size)
{
refill_obj_stock(objcg, size, true);
}
#endif /* CONFIG_MEMCG_KMEM */
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
/*
* Because page_memcg(head) is not set on compound tails, set it now.
*/
void mem_cgroup_split_huge_fixup(struct page *head)
{
struct mem_cgroup *memcg = head->mem_cgroup;
int i;
if (mem_cgroup_disabled())
return;
for (i = 1; i < HPAGE_PMD_NR; i++) {
css_get(&memcg->css);
head[i].mem_cgroup = memcg;
}
}
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
#ifdef CONFIG_MEMCG_SWAP
/**
 * mem_cgroup_move_swap_account - move swap charge and swap_cgroup's record.
 * @entry: swap entry to be moved
 * @from: mem_cgroup which the entry is moved from
 * @to: mem_cgroup which the entry is moved to
 *
 * The move only happens if the swap_cgroup record of @entry currently holds
 * the id of @from; in that case the record is switched to the id of @to and
 * the MEMCG_SWAP statistics of both groups are adjusted.
 *
 * Returns 0 on success, -EINVAL on failure.
 *
 * The caller must have charged to @to, IOW, called page_counter_charge() about
 * both res and memsw, and called css_get().
 */
static int mem_cgroup_move_swap_account(swp_entry_t entry,
				struct mem_cgroup *from, struct mem_cgroup *to)
{
	const unsigned short from_id = mem_cgroup_id(from);
	const unsigned short to_id = mem_cgroup_id(to);

	if (swap_cgroup_cmpxchg(entry, from_id, to_id) != from_id)
		return -EINVAL;

	mod_memcg_state(from, MEMCG_SWAP, -1);
	mod_memcg_state(to, MEMCG_SWAP, 1);
	return 0;
}
#else
/* Stub for !CONFIG_MEMCG_SWAP: swap charges cannot be moved, always fails. */
static inline int mem_cgroup_move_swap_account(swp_entry_t entry,
struct mem_cgroup *from, struct mem_cgroup *to)
{
return -EINVAL;
}
#endif
#ifdef CONFIG_MEM_QOS
/* Defined further down; needed by mem_cgroup_resize_max(). */
static void pagecache_set_limit(struct mem_cgroup *memcg);
#endif
/* Held around page_counter_set_max() calls on the memcg limit counters. */
static DEFINE_MUTEX(memcg_max_mutex);
/*
 * mem_cgroup_resize_max - change the memory or memory+swap limit of a memcg.
 * @memcg: cgroup to update
 * @max: new limit in pages
 * @memsw: true to update the memsw counter, false for the memory counter
 *
 * Keeps retrying page_counter_set_max(): after the first failure the
 * per-cpu stocks are drained, afterwards reclaim is attempted before each
 * retry.
 *
 * Returns 0 on success, -EINTR if a signal is pending, -EINVAL if the new
 * value would break memory.max <= memsw.max, -EBUSY if reclaim made no
 * progress.
 */
static int mem_cgroup_resize_max(struct mem_cgroup *memcg,
				 unsigned long max, bool memsw)
{
	struct page_counter *counter = memsw ? &memcg->memsw : &memcg->memory;
	bool enlarge = false;
	bool drained = false;
	bool limits_invariant;
	int ret;

	for (;;) {
		if (signal_pending(current)) {
			ret = -EINTR;
			break;
		}

		mutex_lock(&memcg_max_mutex);
		/*
		 * Make sure that the new limit (memsw or memory limit) doesn't
		 * break our basic invariant rule memory.max <= memsw.max.
		 */
		if (memsw)
			limits_invariant = max >= READ_ONCE(memcg->memory.max);
		else
			limits_invariant = max <= memcg->memsw.max;
		if (!limits_invariant) {
			mutex_unlock(&memcg_max_mutex);
			ret = -EINVAL;
			break;
		}
		if (max > counter->max)
			enlarge = true;
		ret = page_counter_set_max(counter, max);
		mutex_unlock(&memcg_max_mutex);

		if (!ret)
			break;

		if (!drained) {
			/* First failure: flush the per-cpu stocks and retry. */
			drain_all_stock(memcg);
			drained = true;
		} else if (!try_to_free_mem_cgroup_pages(memcg, 1,
							 GFP_KERNEL, !memsw)) {
			ret = -EBUSY;
			break;
		}
	}

#ifdef CONFIG_MEM_QOS
	if (!ret) {
		setup_async_wmark(memcg);
		if (need_memcg_async_reclaim(memcg))
			queue_work(memcg_async_reclaim_wq, &memcg->async_work);
		if (enlarge)
			memcg_oom_recover(memcg);
		/* Re-derive the page cache limit from the new memory limit. */
		pagecache_set_limit(memcg);
	}
#else
	if (!ret && enlarge)
		memcg_oom_recover(memcg);
#endif
	return ret;
}
/*
 * mem_cgroup_soft_limit_reclaim - reclaim from cgroups exceeding their soft limit.
 * @pgdat: node to reclaim from
 * @order: allocation order; anything above 0 makes this a no-op
 * @gfp_mask: allocation context passed on to reclaim
 * @total_scanned: incremented by the number of pages scanned
 *
 * Repeatedly picks a node from @pgdat's soft limit tree and reclaims from
 * its memcg until something was reclaimed, no candidate is left, or more
 * than MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS rounds went by fruitlessly.
 *
 * Returns the number of pages reclaimed.
 */
unsigned long mem_cgroup_soft_limit_reclaim(pg_data_t *pgdat, int order,
gfp_t gfp_mask,
unsigned long *total_scanned)
{
unsigned long nr_reclaimed = 0;
struct mem_cgroup_per_node *mz, *next_mz = NULL;
unsigned long reclaimed;
int loop = 0;
struct mem_cgroup_tree_per_node *mctz;
unsigned long excess;
unsigned long nr_scanned;
if (order > 0)
return 0;
mctz = soft_limit_tree_node(pgdat->node_id);
/*
* Do not even bother to check the largest node if the root
* is empty. Do it lockless to prevent lock bouncing. Races
* are acceptable as soft limit is best effort anyway.
*/
if (!mctz || RB_EMPTY_ROOT(&mctz->rb_root))
return 0;
/*
* This loop can run a while, specially if mem_cgroup's continuously
* keep exceeding their soft limit and putting the system under
* pressure
*/
do {
/* Continue with the candidate chosen in the previous round, if any. */
if (next_mz)
mz = next_mz;
else
mz = mem_cgroup_largest_soft_limit_node(mctz);
if (!mz)
break;
nr_scanned = 0;
reclaimed = mem_cgroup_soft_reclaim(mz->memcg, pgdat,
gfp_mask, &nr_scanned);
nr_reclaimed += reclaimed;
*total_scanned += nr_scanned;
spin_lock_irq(&mctz->lock);
__mem_cgroup_remove_exceeded(mz, mctz);
/*
* If we failed to reclaim anything from this memory cgroup
* it is time to move on to the next cgroup
*/
next_mz = NULL;
if (!reclaimed)
next_mz = __mem_cgroup_largest_soft_limit_node(mctz);
excess = soft_limit_excess(mz->memcg);
/*
* One school of thought says that we should not add
* back the node to the tree if reclaim returns 0.
* But our reclaim could return 0, simply because due
* to priority we are exposing a smaller subset of
* memory to reclaim from. Consider this as a longer
* term TODO.
*/
/* If excess == 0, no tree ops */
__mem_cgroup_insert_exceeded(mz, mctz, excess);
spin_unlock_irq(&mctz->lock);
/* Done with this round's memcg; drop its css reference. */
css_put(&mz->memcg->css);
loop++;
/*
* Could not reclaim anything and there are no more
* mem cgroups to try or we seem to be looping without
* reclaiming anything.
*/
if (!nr_reclaimed &&
(next_mz == NULL ||
loop > MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS))
break;
} while (!nr_reclaimed);
/* A candidate picked for a round that never ran still holds a reference. */
if (next_mz)
css_put(&next_mz->memcg->css);
return nr_reclaimed;
}
/*
 * Reclaims as many pages from the given memcg as possible.
 *
 * Reclaim is repeated while the memory counter is non-zero; every attempt
 * without progress uses up one of MEM_CGROUP_RECLAIM_RETRIES retries.
 * Returns 0, or -EINTR if a signal is pending.
 *
 * Caller is responsible for holding css reference for memcg.
 */
static int mem_cgroup_force_empty(struct mem_cgroup *memcg)
{
	int retries_left = MEM_CGROUP_RECLAIM_RETRIES;

	/* Flush pending LRU additions and per-cpu stocks before reclaiming. */
	lru_add_drain_all();
	drain_all_stock(memcg);

	/* try to free all pages in this cgroup */
	while (retries_left && page_counter_read(&memcg->memory)) {
		int freed;

		if (signal_pending(current))
			return -EINTR;

		freed = try_to_free_mem_cgroup_pages(memcg, 1,
						     GFP_KERNEL, true);
		if (freed)
			continue;

		retries_left--;
		/* maybe some writeback is necessary */
		congestion_wait(BLK_RW_ASYNC, HZ/10);
	}

	return 0;
}
/*
 * Write handler that triggers mem_cgroup_force_empty() on the target
 * cgroup.  Rejected with -EINVAL for the root cgroup.  Returns @nbytes on
 * success or the error from mem_cgroup_force_empty().
 */
static ssize_t mem_cgroup_force_empty_write(struct kernfs_open_file *of,
					    char *buf, size_t nbytes,
					    loff_t off)
{
	struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
	int err;

	if (mem_cgroup_is_root(memcg))
		return -EINVAL;

	err = mem_cgroup_force_empty(memcg);
	if (err)
		return err;

	return nbytes;
}
static u64 mem_cgroup_hierarchy_read(struct cgroup_subsys_state *css,
struct cftype *cft)
{
return mem_cgroup_from_css(css)->use_hierarchy;
}
/*
 * Write handler for the use_hierarchy setting.
 *
 * Writing the current value is a no-op.  Otherwise the change is refused
 * with -EINVAL if the parent has use_hierarchy set or @val is neither 0 nor
 * 1, and with -EBUSY if the cgroup already has children.
 */
static int mem_cgroup_hierarchy_write(struct cgroup_subsys_state *css,
				      struct cftype *cft, u64 val)
{
	struct mem_cgroup *memcg = mem_cgroup_from_css(css);
	struct mem_cgroup *parent_memcg = mem_cgroup_from_css(memcg->css.parent);

	if (memcg->use_hierarchy == val)
		return 0;

	/*
	 * A parent with use_hierarchy set pins the setting for its whole
	 * subtree.  For the root cgroup there is no parent, so only the
	 * "no children" rule below applies.
	 */
	if (parent_memcg && parent_memcg->use_hierarchy)
		return -EINVAL;

	if (val != 0 && val != 1)
		return -EINVAL;

	/* The setting may only change while the cgroup has no children. */
	if (memcg_has_children(memcg))
		return -EBUSY;

	memcg->use_hierarchy = val;
	return 0;
}
#ifdef CONFIG_MEM_QOS
/* Lower bound, in pages, for the page cache reclaim goal of a memcg. */
#define MIN_PAGECACHE_PAGES 16
/*
 * Number of consecutive reclaim attempts without progress that the page
 * cache shrinkers below tolerate before giving up.
 */
unsigned int vm_pagecache_limit_retry_times __read_mostly = MEMCG_PAGECACHE_RETRIES;
/*
 * mem_cgroup_shrink_pagecache - shrink the page cache of @memcg to its goal.
 * @memcg: cgroup to shrink; NULL and the root cgroup are ignored
 * @gfp_mask: allocation context of the caller
 *
 * The goal is (100 - pagecache_reclaim_ratio) percent of pagecache.max, but
 * never less than MIN_PAGECACHE_PAGES.  Reclaim is repeated while usage is
 * above the goal.  If vm_pagecache_limit_retry_times is non-zero, the loop
 * gives up after more than that many consecutive attempts that reclaimed
 * nothing; with a value of zero no such bound is applied.
 *
 * Nothing is done when no page cache limit is in effect for @memcg, when
 * the global page cache limit or a disabled vm.memory_qos takes precedence,
 * or when the calling context is one of those checked below.
 */
void mem_cgroup_shrink_pagecache(struct mem_cgroup *memcg, gfp_t gfp_mask)
{
	long pages_reclaimed;
	unsigned long pages_used, pages_max, goal_pages_used;
	unsigned int retry_times = 0;
	unsigned int limit_retry_times;
	u32 max_ratio;

	if (!memcg || mem_cgroup_is_root(memcg))
		return;

	max_ratio = READ_ONCE(memcg->pagecache_max_ratio);
	if (max_ratio == PAGECACHE_MAX_RATIO_MAX)
		return;

	pages_max = READ_ONCE(memcg->pagecache.max);
	if (pages_max == PAGE_COUNTER_MAX || vm_pagecache_limit_global || !sysctl_vm_memory_qos)
		return;

	/* Contexts in which no page cache reclaim is attempted. */
	if (gfp_mask & __GFP_ATOMIC)
		return;
	if (unlikely(should_force_charge()))
		return;
	if (unlikely(current->flags & PF_MEMALLOC))
		return;
	if (unlikely(task_in_memcg_oom(current)))
		return;
	if (!gfpflags_allow_blocking(gfp_mask))
		return;

	pages_used = page_counter_read(&memcg->pagecache);
	limit_retry_times = READ_ONCE(vm_pagecache_limit_retry_times);
	goal_pages_used = (100 - READ_ONCE(memcg->pagecache_reclaim_ratio)) * pages_max / 100;
	goal_pages_used = max_t(unsigned long, MIN_PAGECACHE_PAGES, goal_pages_used);

	if (pages_used > pages_max)
		memcg_memory_event(memcg, MEMCG_PAGECACHE_MAX);

	while (pages_used > goal_pages_used) {
		if (fatal_signal_pending(current))
			break;

		pages_reclaimed = shrink_page_cache_memcg(gfp_mask, memcg, pages_used - goal_pages_used);
		if (pages_reclaimed == -EINVAL)
			return;

		/* The retry bound only applies when a non-zero limit is set. */
		if (limit_retry_times) {
			if (pages_reclaimed == 0) {
				congestion_wait(BLK_RW_ASYNC, HZ/10);
				retry_times++;
			} else {
				retry_times = 0;
			}

			if (retry_times > limit_retry_times) {
				printk(KERN_WARNING "Attempts to recycle many times have not recovered enough pages.\n");
				break;
			}
		}

		pages_used = page_counter_read(&memcg->pagecache);
		cond_resched();
	}
}
static u64 pagecache_reclaim_ratio_read(struct cgroup_subsys_state *css,
struct cftype *cft)
{
struct mem_cgroup *memcg = mem_cgroup_from_css(css);
return memcg->pagecache_reclaim_ratio;
}
/*
 * Write handler for the page cache reclaim ratio of a cgroup.
 *
 * Accepted values:
 *   1..99 - stored as the new ratio, followed by an immediate call to
 *           mem_cgroup_shrink_pagecache()
 *   100   - not stored; triggers a single attempt to reclaim the cgroup's
 *           whole current page cache
 * Everything else yields -EINVAL.  The write is also refused with -EINVAL
 * while vm.memory_qos is off or vm_pagecache_limit_global is set.
 *
 * Returns @nbytes on success or a negative errno.
 */
static ssize_t pagecache_reclaim_ratio_write(struct kernfs_open_file *of,
					     char *buf, size_t nbytes, loff_t off)
{
	struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
	u64 reclaim_ratio;
	int ret;

	if (!sysctl_vm_memory_qos) {
		printk(KERN_WARNING "you should open vm.memory_qos.\n");
		return -EINVAL;
	}

	if (vm_pagecache_limit_global) {
		printk(KERN_WARNING "you should clear vm_pagecache_limit_global.\n");
		return -EINVAL;
	}

	/*
	 * strstrip() returns a pointer into @buf and never NULL; an empty
	 * string is rejected by kstrtou64() below.
	 */
	buf = strstrip(buf);
	ret = kstrtou64(buf, 0, &reclaim_ratio);
	if (ret)
		return ret;

	if (reclaim_ratio == 0 || reclaim_ratio > 100)
		return -EINVAL;

	if (reclaim_ratio == 100) {
		/* One-shot: try once to reclaim everything currently cached. */
		shrink_page_cache_memcg(GFP_KERNEL, memcg,
					page_counter_read(&memcg->pagecache));
		return nbytes;
	}

	/* Pairs with the lockless READ_ONCE() readers of the ratio. */
	WRITE_ONCE(memcg->pagecache_reclaim_ratio, reclaim_ratio);
	mem_cgroup_shrink_pagecache(memcg, GFP_KERNEL);

	return nbytes;
}
static u64 mem_cgroup_priority_oom_read(struct cgroup_subsys_state *css,
struct cftype *cft)
{
struct mem_cgroup *memcg = mem_cgroup_from_css(css);
return memcg->use_priority_oom;
}
/*
 * Write handler for use_priority_oom.  Only 0 and 1 are accepted; any
 * other value is rejected with -EINVAL.
 */
static int mem_cgroup_priority_oom_write(struct cgroup_subsys_state *css,
					 struct cftype *cft, u64 val)
{
	if (val != 0 && val != 1)
		return -EINVAL;

	mem_cgroup_from_css(css)->use_priority_oom = val;
	return 0;
}
static u64 pagecache_current_read(struct cgroup_subsys_state *css,
struct cftype *cft)
{
struct mem_cgroup *memcg = mem_cgroup_from_css(css);
return (u64)page_counter_read(&memcg->pagecache) * PAGE_SIZE;
}
static u64 memory_pagecache_max_read(struct cgroup_subsys_state *css,
struct cftype *cft)
{
struct mem_cgroup *memcg = mem_cgroup_from_css(css);
return memcg->pagecache_max_ratio;
}
/*
 * mem_cgroup_pagecache_get_reclaim_pages - page cache pages to reclaim.
 * @memcg: cgroup to inspect; NULL and the root cgroup yield 0
 *
 * When the page cache usage of @memcg exceeds pagecache.max, returns how
 * far usage is above the reclaim goal, i.e. above
 * (100 - pagecache_reclaim_ratio) percent of pagecache.max or
 * MIN_PAGECACHE_PAGES, whichever is larger.
 *
 * Returns 0 if no limit is set (PAGE_COUNTER_MAX), if usage is within the
 * limit, or if usage is already at or below the goal.
 */
unsigned long mem_cgroup_pagecache_get_reclaim_pages(struct mem_cgroup *memcg)
{
	unsigned long goal_pages_used, pages_used, pages_max;

	if ((!memcg) || (mem_cgroup_is_root(memcg)))
		return 0;

	pages_max = READ_ONCE(memcg->pagecache.max);
	if (pages_max == PAGE_COUNTER_MAX)
		return 0;

	goal_pages_used = (100 - READ_ONCE(memcg->pagecache_reclaim_ratio)) * pages_max / 100;
	goal_pages_used = max_t(unsigned long, MIN_PAGECACHE_PAGES, goal_pages_used);

	pages_used = page_counter_read(&memcg->pagecache);
	if (pages_used <= pages_max)
		return 0;

	/*
	 * The MIN_PAGECACHE_PAGES floor can lift the goal above a very
	 * small pages_max.  Usage may then sit between the two, and an
	 * unguarded subtraction would wrap around to a huge value.
	 */
	if (pages_used <= goal_pages_used)
		return 0;

	return pages_used - goal_pages_used;
}
/*
 * pagecache_set_limit - recompute pagecache.max of @memcg.
 *
 * The new limit is pagecache_max_ratio percent of the current memory.max.
 */
static void pagecache_set_limit(struct mem_cgroup *memcg)
{
	unsigned long mem_max = READ_ONCE(memcg->memory.max);
	u32 ratio = READ_ONCE(memcg->pagecache_max_ratio);
	unsigned long cache_max = (mem_max * ratio) / 100;

	xchg(&memcg->pagecache.max, cache_max);
}
/*
 * Write handler for the page cache limit ratio of a cgroup.
 *
 * The value must lie within [PAGECACHE_MAX_RATIO_MIN,
 * PAGECACHE_MAX_RATIO_MAX] and the cgroup must have a memory limit set.
 * The new ratio is applied first, then page cache is reclaimed until usage
 * fits below the new limit.  If reclaim keeps making no progress for
 * vm_pagecache_limit_retry_times attempts in a row, the previous ratio is
 * restored and -EINVAL is returned.
 *
 * NOTE(review): when a fatal signal is pending (-EINTR) or
 * shrink_page_cache_memcg() reports -EINVAL, the new ratio is left in
 * place although an error is returned.
 *
 * Returns @nbytes on success or a negative errno.
 */
static ssize_t memory_pagecache_max_write(struct kernfs_open_file *of,
					  char *buf, size_t nbytes, loff_t off)
{
	struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
	unsigned int nr_reclaims = vm_pagecache_limit_retry_times;
	unsigned long max;
	long pages_reclaimed;
	int ret = 0;
	u64 max_ratio, old;

	if (!sysctl_vm_memory_qos) {
		printk(KERN_WARNING "you should open vm.memory_qos.\n");
		return -EINVAL;
	}

	if (vm_pagecache_limit_global) {
		printk(KERN_WARNING "you should clear vm_pagecache_limit_global.\n");
		return -EINVAL;
	}

	if (!buf)
		return -EINVAL;

	ret = kstrtou64(buf, 0, &max_ratio);
	if (ret)
		return ret;

	if (max_ratio > PAGECACHE_MAX_RATIO_MAX ||
	    max_ratio < PAGECACHE_MAX_RATIO_MIN)
		return -EINVAL;

	if (READ_ONCE(memcg->memory.max) == PAGE_COUNTER_MAX) {
		printk(KERN_WARNING "pagecache limit not allowed for cgroup without memory limit set\n");
		return -EPERM;
	}

	/* Apply the new ratio, remembering the old one for rollback. */
	old = READ_ONCE(memcg->pagecache_max_ratio);
	memcg->pagecache_max_ratio = max_ratio;
	pagecache_set_limit(memcg);
	max = READ_ONCE(memcg->pagecache.max);

	while (page_counter_read(&memcg->pagecache) > max) {
		if (fatal_signal_pending(current)) {
			ret = -EINTR;
			break;
		}

		if (!nr_reclaims) {
			/* Out of retries: go back to the previous limit. */
			memcg->pagecache_max_ratio = old;
			pagecache_set_limit(memcg);
			printk(KERN_WARNING "Attempts to recycle many times have not recovered enough pages.\n");
			return -EINVAL;
		}

		pages_reclaimed = shrink_page_cache_memcg(GFP_KERNEL, memcg, mem_cgroup_pagecache_get_reclaim_pages(memcg));
		if (pages_reclaimed == -EINVAL) {
			printk(KERN_WARNING "you should clear vm_pagecache_limit_global.\n");
			return -EINVAL;
		}

		if (pages_reclaimed == 0) {
			congestion_wait(BLK_RW_ASYNC, HZ/10);
			nr_reclaims--;
			cond_resched();
		} else {
			/* Progress was made: start counting retries afresh. */
			nr_reclaims = vm_pagecache_limit_retry_times;
		}
	}

	return ret ? ret : nbytes;
}
#endif
/*
 * mem_cgroup_usage - current usage of @memcg in pages.
 * @memcg: cgroup to query
 * @swap: true for memory+swap usage, false for memory usage
 *
 * Non-root cgroups read the matching page counter.  For the root cgroup
 * the value is assembled from statistics instead: file pages plus mapped
 * anonymous pages, plus either swap (@swap) or, with
 * CONFIG_NEED_MEMCG_ZRAM, the zram bytes converted to pages (!@swap).
 */
static unsigned long mem_cgroup_usage(struct mem_cgroup *memcg, bool swap)
{
	unsigned long val;

	if (!mem_cgroup_is_root(memcg))
		return page_counter_read(swap ? &memcg->memsw : &memcg->memory);

	val = memcg_page_state(memcg, NR_FILE_PAGES) +
		memcg_page_state(memcg, NR_ANON_MAPPED);
	if (swap)
		val += memcg_page_state(memcg, MEMCG_SWAP);
#if defined(CONFIG_NEED_MEMCG_ZRAM)
	else
		val += memcg_page_state(memcg, MEMCG_ZRAM_B) / PAGE_SIZE;
#endif

	return val;
}
/*
 * Attribute selectors for the resource counter control files; decoded from
 * cft->private with MEMFILE_ATTR().
 */
enum {
RES_USAGE, /* current usage */
RES_LIMIT, /* counter->max */
RES_MAX_USAGE, /* counter->watermark */
RES_FAILCNT, /* counter->failcnt */
RES_SOFT_LIMIT, /* memcg->soft_limit */
#ifdef CONFIG_MEM_QOS
ASYNC_HIGH_LIMIT, /* counter->async_high */
ASYNC_LOW_LIMIT, /* counter->async_low */
#endif
};
/*
 * mem_cgroup_read_u64 - read handler shared by the resource counter files.
 *
 * cft->private encodes which counter (MEMFILE_TYPE) and which attribute of
 * it (MEMFILE_ATTR) is requested.  All values except RES_FAILCNT are
 * page counts converted to bytes.  Unknown types or attributes BUG().
 */
static u64 mem_cgroup_read_u64(struct cgroup_subsys_state *css,
			       struct cftype *cft)
{
	struct mem_cgroup *memcg = mem_cgroup_from_css(css);
	struct page_counter *counter;

	/* Select the counter the file refers to. */
	switch (MEMFILE_TYPE(cft->private)) {
	case _MEM:
		counter = &memcg->memory;
		break;
	case _MEMSWAP:
		counter = &memcg->memsw;
		break;
	case _KMEM:
		counter = &memcg->kmem;
		break;
	case _TCP:
		counter = &memcg->tcpmem;
		break;
	default:
		BUG();
	}

	/* Select the attribute of that counter. */
	switch (MEMFILE_ATTR(cft->private)) {
	case RES_USAGE:
		/* memory and memsw usage go through mem_cgroup_usage(). */
		if (counter == &memcg->memory)
			return (u64)mem_cgroup_usage(memcg, false) * PAGE_SIZE;
		else if (counter == &memcg->memsw)
			return (u64)mem_cgroup_usage(memcg, true) * PAGE_SIZE;
		else
			return (u64)page_counter_read(counter) * PAGE_SIZE;
	case RES_LIMIT:
		return (u64)counter->max * PAGE_SIZE;
	case RES_MAX_USAGE:
		return (u64)counter->watermark * PAGE_SIZE;
	case RES_FAILCNT:
		return counter->failcnt;
	case RES_SOFT_LIMIT:
		return (u64)memcg->soft_limit * PAGE_SIZE;
#ifdef CONFIG_MEM_QOS
	case ASYNC_HIGH_LIMIT:
		return (u64)counter->async_high * PAGE_SIZE;
	case ASYNC_LOW_LIMIT:
		return (u64)counter->async_low * PAGE_SIZE;
#endif
	default:
		BUG();
	}
}
/*
 * memcg_flush_percpu_vmstats - fold @memcg's per-cpu stats into the atomics.
 *
 * Sums the per-cpu memcg stat counters over all online CPUs and adds the
 * totals to @memcg and each of its ancestors.  The same is then done for
 * the per-node lruvec stat counters on every node.
 *
 * NOTE(review): the scratch array is sized MEMCG_NR_STAT but also indexed
 * up to NR_VM_NODE_STAT_ITEMS, which assumes MEMCG_NR_STAT is not smaller.
 */
static void memcg_flush_percpu_vmstats(struct mem_cgroup *memcg)
{
	unsigned long stat[MEMCG_NR_STAT] = {0};
	struct mem_cgroup *ancestor;
	int node, cpu, i;

	for_each_online_cpu(cpu) {
		for (i = 0; i < MEMCG_NR_STAT; i++)
			stat[i] += per_cpu(memcg->vmstats_percpu->stat[i], cpu);
	}

	for (ancestor = memcg; ancestor; ancestor = parent_mem_cgroup(ancestor)) {
		for (i = 0; i < MEMCG_NR_STAT; i++)
			atomic_long_add(stat[i], &ancestor->vmstats[i]);
	}

	for_each_node(node) {
		struct mem_cgroup_per_node *pn = memcg->nodeinfo[node];
		struct mem_cgroup_per_node *up;

		/* Reuse the scratch array for this node's lruvec stats. */
		for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++)
			stat[i] = 0;

		for_each_online_cpu(cpu) {
			for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++)
				stat[i] += per_cpu(
					pn->lruvec_stat_cpu->count[i], cpu);
		}

		for (up = pn; up; up = parent_nodeinfo(up, node)) {
			for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++)
				atomic_long_add(stat[i], &up->lruvec_stat[i]);
		}
	}
}
static void memcg_flush_percpu_vmevents(struct mem_cgroup *memcg)
{
unsigned long events[NR_VM_EVENT_ITEMS];
struct mem_cgroup *mi;
int cpu, i;
for (i = 0; i < NR_VM_EVENT_ITEMS; i++)
events[i] = 0;
for_each_online_cpu(cpu)
for (i = 0; i < NR_VM_EVENT_ITEMS; i++)
events[i] += per_cpu(memcg->vmstats_percpu->events[i],
cpu);
for (mi = memcg; mi; mi = parent_mem_cgroup(mi))
for (i = 0; i < NR_VM_EVENT_ITEMS; i++)
atomic_long_add(events[i], &mi->vmevents[i]);
}
#ifdef CONFIG_MEMCG_KMEM
/*
 * memcg_online_kmem - set up kernel memory accounting for @memcg.
 *
 * Allocates a kmemcg id and an objcg, links the objcg to @memcg, enables
 * memcg_kmem_enabled_key and marks the cgroup KMEM_ONLINE.  Does nothing
 * when cgroup_memory_nokmem is set.
 *
 * Returns 0 on success, a negative errno if the id or the objcg cannot be
 * allocated.
 */
static int memcg_online_kmem(struct mem_cgroup *memcg)
{
struct obj_cgroup *objcg;
int memcg_id;
if (cgroup_memory_nokmem)
return 0;
/* The cgroup must not have been set up for kmem before. */
BUG_ON(memcg->kmemcg_id >= 0);
BUG_ON(memcg->kmem_state);
memcg_id = memcg_alloc_cache_id();
if (memcg_id < 0)
return memcg_id;
objcg = obj_cgroup_alloc();
if (!objcg) {
/* Undo the id allocation on failure. */
memcg_free_cache_id(memcg_id);
return -ENOMEM;
}
objcg->memcg = memcg;
rcu_assign_pointer(memcg->objcg, objcg);
static_branch_enable(&memcg_kmem_enabled_key);
/*
* A memory cgroup is considered kmem-online as soon as it gets
* kmemcg_id. Setting the id after enabling static branching will
* guarantee no one starts accounting before all call sites are
* patched.
*/
memcg->kmemcg_id = memcg_id;
memcg->kmem_state = KMEM_ONLINE;
return 0;
}
/*
 * memcg_offline_kmem - tear down kernel memory accounting for @memcg.
 *
 * Only acts on cgroups in state KMEM_ONLINE, which are moved to
 * KMEM_ALLOCATED.  The objcgs are reparented to the parent cgroup (or to
 * root_mem_cgroup if there is none), the kmemcg id of the cgroup and its
 * descendants is switched to the parent's, the list_lrus of the old id are
 * drained into the parent, and the old id is released.
 */
static void memcg_offline_kmem(struct mem_cgroup *memcg)
{
struct cgroup_subsys_state *css;
struct mem_cgroup *parent, *child;
int kmemcg_id;
if (memcg->kmem_state != KMEM_ONLINE)
return;
memcg->kmem_state = KMEM_ALLOCATED;
parent = parent_mem_cgroup(memcg);
if (!parent)
parent = root_mem_cgroup;
memcg_reparent_objcgs(memcg, parent);
kmemcg_id = memcg->kmemcg_id;
BUG_ON(kmemcg_id < 0);
/*
* Change kmemcg_id of this cgroup and all its descendants to the
* parent's id, and then move all entries from this cgroup's list_lrus
* to ones of the parent. After we have finished, all list_lrus
* corresponding to this cgroup are guaranteed to remain empty. The
* ordering is imposed by list_lru_node->lock taken by
* memcg_drain_all_list_lrus().
*/
rcu_read_lock(); /* can be called from css_free w/o cgroup_mutex */
css_for_each_descendant_pre(css, &memcg->css) {
child = mem_cgroup_from_css(css);
BUG_ON(child->kmemcg_id != kmemcg_id);
child->kmemcg_id = parent->kmemcg_id;
/* Without use_hierarchy only @memcg itself is updated. */
if (!memcg->use_hierarchy)
break;
}
rcu_read_unlock();
memcg_drain_all_list_lrus(kmemcg_id, parent);
memcg_free_cache_id(kmemcg_id);
}
/*
 * memcg_free_kmem - release kmem state when @memcg is freed.
 *
 * Runs the offline path if the cgroup is still KMEM_ONLINE.
 */
static void memcg_free_kmem(struct mem_cgroup *memcg)
{
/* css_alloc() failed, offlining didn't happen */
if (unlikely(memcg->kmem_state == KMEM_ONLINE))
memcg_offline_kmem(memcg);
}
#else
/* !CONFIG_MEMCG_KMEM stubs: kmem accounting is compiled out. */

/* Nothing to set up; always succeeds. */
static int memcg_online_kmem(struct mem_cgroup *memcg)
{
return 0;
}
/* Nothing to tear down. */
static void memcg_offline_kmem(struct mem_cgroup *memcg)
{
}
/* Nothing to free. */
static void memcg_free_kmem(struct mem_cgroup *memcg)
{
}
#endif /* CONFIG_MEMCG_KMEM */
/*
 * Set the maximum of @memcg's kmem page counter to @max pages.
 *
 * The update is done under memcg_max_mutex. Returns whatever
 * page_counter_set_max() reports.
 */
static int memcg_update_kmem_max(struct mem_cgroup *memcg,
				 unsigned long max)
{
	int err;

	mutex_lock(&memcg_max_mutex);
	err = page_counter_set_max(&memcg->kmem, max);
	mutex_unlock(&memcg_max_mutex);

	return err;
}
/*
 * Set the maximum of @memcg's tcpmem page counter to @max pages and, on the
 * first successful update, switch socket accounting on for this cgroup.
 *
 * Runs under memcg_max_mutex. Returns the result of page_counter_set_max().
 */
static int memcg_update_tcp_max(struct mem_cgroup *memcg, unsigned long max)
{
	int err;

	mutex_lock(&memcg_max_mutex);

	err = page_counter_set_max(&memcg->tcpmem, max);
	if (!err && !memcg->tcpmem_active) {
		/*
		 * Ordering matters: bump the static key first, raise the
		 * active flag second. That way the socket activation path
		 * is the last thing to run; see mem_cgroup_sk_alloc(), which
		 * does not tag any socket as belonging to this memcg until
		 * the flag is set.
		 *
		 * A static key patches several call sites in an order we
		 * cannot control. Were a socket marked as accounted before
		 * the accounting functions are patched in, its charges
		 * would be lost.
		 *
		 * There is no race with the readers in
		 * mem_cgroup_sk_alloc(): while this value changes, the code
		 * that consumes it has not been patched in yet.
		 */
		static_branch_inc(&memcg_sockets_enabled_key);
		memcg->tcpmem_active = true;
	}

	mutex_unlock(&memcg_max_mutex);
	return err;
}
/*
 * Write handler for the limit-style control files.
 *
 * @buf is parsed as a page count ("-1" is passed to page_counter_memparse()
 * as the keyword for the maximum). The file's ->private value then selects
 * the action: RES_LIMIT updates the memory, memsw, kmem or tcp maximum
 * (refused with -EINVAL on the root cgroup), RES_SOFT_LIMIT stores the soft
 * limit.
 *
 * Returns @nbytes on success or a negative errno.
 */
static ssize_t mem_cgroup_write(struct kernfs_open_file *of,
				char *buf, size_t nbytes, loff_t off)
{
	struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
	unsigned long nr_pages;
	int err;

	err = page_counter_memparse(strstrip(buf), "-1", &nr_pages);
	if (err)
		return err;

	switch (MEMFILE_ATTR(of_cft(of)->private)) {
	case RES_SOFT_LIMIT:
		memcg->soft_limit = nr_pages;
		err = 0;
		break;
	case RES_LIMIT:
		/* Can't set limit on root */
		if (mem_cgroup_is_root(memcg)) {
			err = -EINVAL;
			break;
		}

		switch (MEMFILE_TYPE(of_cft(of)->private)) {
		case _MEM:
			err = mem_cgroup_resize_max(memcg, nr_pages, false);
			break;
		case _MEMSWAP:
			err = mem_cgroup_resize_max(memcg, nr_pages, true);
			break;
		case _KMEM:
			pr_warn_once("kmem.limit_in_bytes is deprecated and will be removed. "
				     "Please report your usecase to linux-mm@kvack.org if you "
				     "depend on this functionality.\n");
			err = memcg_update_kmem_max(memcg, nr_pages);
			break;
		case _TCP:
			err = memcg_update_tcp_max(memcg, nr_pages);
			break;
		}
		break;
	}

	if (err)
		return err;
	return nbytes;
}
/*
 * Reset handler for the max_usage and failcnt control files.
 *
 * The file's ->private value selects both the page counter (memory, memsw,
 * kmem or tcpmem) and the field to clear: the usage watermark or the
 * failure count. Any other type or attribute is a BUG(). Always returns
 * @nbytes.
 */
static ssize_t mem_cgroup_reset(struct kernfs_open_file *of, char *buf,
				size_t nbytes, loff_t off)
{
	struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
	struct cftype *cft = of_cft(of);
	struct page_counter *target;

	/* Step 1: which counter does this file belong to? */
	switch (MEMFILE_TYPE(cft->private)) {
	case _MEM:
		target = &memcg->memory;
		break;
	case _MEMSWAP:
		target = &memcg->memsw;
		break;
	case _KMEM:
		target = &memcg->kmem;
		break;
	case _TCP:
		target = &memcg->tcpmem;
		break;
	default:
		BUG();
	}

	/* Step 2: which field of that counter is being reset? */
	switch (MEMFILE_ATTR(cft->private)) {
	case RES_MAX_USAGE:
		page_counter_reset_watermark(target);
		break;
	case RES_FAILCNT:
		target->failcnt = 0;
		break;
	default:
		BUG();
	}

	return nbytes;
}
static u64 mem_cgroup_move_charge_read(struct cgroup_subsys_state *css,
struct cftype *cft)
{
return mem_cgroup_from_css(css)->move_charge_at_immigrate;
}
#ifdef CONFIG_MMU
/*
 * Write handler for move_charge_at_immigrate.
 *
 * Emits a one-time deprecation warning, rejects any bit outside MOVE_MASK
 * with -EINVAL and otherwise stores @val. Returns 0 on success.
 */
static int mem_cgroup_move_charge_write(struct cgroup_subsys_state *css,
					struct cftype *cft, u64 val)
{
	pr_warn_once("Cgroup memory moving (move_charge_at_immigrate) is deprecated. "
		     "Please report your usecase to linux-mm@kvack.org if you "
		     "depend on this functionality.\n");

	if (val & ~MOVE_MASK)
		return -EINVAL;

	/*
	 * The store is deliberately lockless. ->can_attach() samples this
	 * value once when a migration begins and keeps working with that
	 * (possibly stale) copy, so a new value only affects task
	 * migrations that start after the change.
	 */
	mem_cgroup_from_css(css)->move_charge_at_immigrate = val;

	return 0;
}
#else
/* !CONFIG_MMU variant: moving charges is not available, always -ENOSYS. */
static int mem_cgroup_move_charge_write(struct cgroup_subsys_state *css,
struct cftype *cft, u64 val)
{
return -ENOSYS;
}
#endif
#ifdef CONFIG_NUMA
/*
 * Bitmasks over enum lru_list (one bit per list), used as the lru_mask
 * argument of the *_nr_lru_pages() helpers to select which lists are summed.
 */
#define LRU_ALL_FILE (BIT(LRU_INACTIVE_FILE) | BIT(LRU_ACTIVE_FILE))
#define LRU_ALL_ANON (BIT(LRU_INACTIVE_ANON) | BIT(LRU_ACTIVE_ANON))
#define LRU_ALL ((1 << NR_LRU_LISTS) - 1)
/*
 * Sum the local (non-hierarchical) page counts of @memcg on node @nid over
 * every LRU list whose bit is set in @lru_mask.
 */
static unsigned long mem_cgroup_node_nr_lru_pages(struct mem_cgroup *memcg,
					   int nid, unsigned int lru_mask)
{
	struct lruvec *lruvec = mem_cgroup_lruvec(memcg, NODE_DATA(nid));
	unsigned long total = 0;
	enum lru_list lru;

	VM_BUG_ON((unsigned)nid >= nr_node_ids);

	for_each_lru(lru) {
		if (BIT(lru) & lru_mask)
			total += lruvec_page_state_local(lruvec,
							 NR_LRU_BASE + lru);
	}

	return total;
}
/*
 * Sum the local (non-hierarchical) page counts of @memcg over every LRU
 * list whose bit is set in @lru_mask.
 */
static unsigned long mem_cgroup_nr_lru_pages(struct mem_cgroup *memcg,
					     unsigned int lru_mask)
{
	unsigned long total = 0;
	enum lru_list lru;

	for_each_lru(lru) {
		if (BIT(lru) & lru_mask)
			total += memcg_page_state_local(memcg,
							NR_LRU_BASE + lru);
	}

	return total;
}
/*
 * seq_file show handler printing per-node LRU page counts.
 *
 * For each of "total", "file", "anon" and "unevictable" one line is printed
 * with the cgroup-local sum followed by an " N<nid>=" value for every node
 * in the N_MEMORY state. A second group of "hierarchical_" lines repeats
 * this with the counts accumulated over the whole subtree of the cgroup.
 * Always returns 0.
 */
static int memcg_numa_stat_show(struct seq_file *m, void *v)
{
struct numa_stat {
const char *name;
unsigned int lru_mask;
};
/* Output rows: label and the LRU lists that make up the row. */
static const struct numa_stat stats[] = {
{ "total", LRU_ALL },
{ "file", LRU_ALL_FILE },
{ "anon", LRU_ALL_ANON },
{ "unevictable", BIT(LRU_UNEVICTABLE) },
};
const struct numa_stat *stat;
int nid;
unsigned long nr;
struct mem_cgroup *memcg = mem_cgroup_from_seq(m);
/* Pass 1: counts local to this cgroup. */
for (stat = stats; stat < stats + ARRAY_SIZE(stats); stat++) {
nr = mem_cgroup_nr_lru_pages(memcg, stat->lru_mask);
seq_printf(m, "%s=%lu", stat->name, nr);
cond_resched();
for_each_node_state(nid, N_MEMORY) {
nr = mem_cgroup_node_nr_lru_pages(memcg, nid,
stat->lru_mask);
seq_printf(m, " N%d=%lu", nid, nr);
cond_resched();
}
seq_putc(m, '\n');
}
/* Pass 2: counts summed over every cgroup in the subtree. */
for (stat = stats; stat < stats + ARRAY_SIZE(stats); stat++) {
struct mem_cgroup *iter;
nr = 0;
for_each_mem_cgroup_tree(iter, memcg) {
nr += mem_cgroup_nr_lru_pages(iter, stat->lru_mask);
cond_resched();
}
seq_printf(m, "hierarchical_%s=%lu", stat->name, nr);
for_each_node_state(nid, N_MEMORY) {
nr = 0;
for_each_mem_cgroup_tree(iter, memcg) {
nr += mem_cgroup_node_nr_lru_pages(
iter, nid, stat->lru_mask);
cond_resched();
}
seq_printf(m, " N%d=%lu", nid, nr);
}
seq_putc(m, '\n');
}
return 0;
}
#endif /* CONFIG_NUMA */
/*
 * Stat items printed by memcg_stat_show(). Entry i is labelled with
 * memcg1_stat_names[i], so the two tables must list the same items in the
 * same order (including the CONFIG_TRANSPARENT_HUGEPAGE entry).
 */
static const unsigned int memcg1_stats[] = {
NR_FILE_PAGES,
NR_ANON_MAPPED,
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
NR_ANON_THPS,
#endif
NR_SHMEM,
NR_FILE_MAPPED,
NR_FILE_DIRTY,
NR_WRITEBACK,
MEMCG_SWAP,
};
/*
 * Output labels for memcg1_stats[], index for index. memcg_stat_show()
 * checks at build time that both arrays have the same number of entries.
 */
static const char *const memcg1_stat_names[] = {
"cache",
"rss",
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
"rss_huge",
#endif
"shmem",
"mapped_file",
"dirty",
"writeback",
"swap",
};
/*
 * Universal VM events cgroup1 shows, original sort order.
 * memcg_stat_show() prints each one under its vm_event_name().
 */
static const unsigned int memcg1_events[] = {
PGPGIN,
PGPGOUT,
PGFAULT,
PGMAJFAULT,
};
/*
 * seq_file show handler printing the statistics of one memory cgroup.
 *
 * Output order: cgroup-local stats, events and LRU sizes; the hierarchical
 * memory (and, with memsw accounting, memsw) limit; the same stats, events
 * and LRU sizes again with a "total_" prefix using the hierarchical
 * counters; then optional debug, QoS and workingset sections depending on
 * the kernel configuration. Page counts are converted to bytes with
 * PAGE_SIZE where the code multiplies by it. Always returns 0.
 */
static int memcg_stat_show(struct seq_file *m, void *v)
{
struct mem_cgroup *memcg = mem_cgroup_from_seq(m);
unsigned long memory, memsw;
struct mem_cgroup *mi;
unsigned int i;
BUILD_BUG_ON(ARRAY_SIZE(memcg1_stat_names) != ARRAY_SIZE(memcg1_stats));
/* Local stats; the swap line is only shown with memsw accounting. */
for (i = 0; i < ARRAY_SIZE(memcg1_stats); i++) {
unsigned long nr;
if (memcg1_stats[i] == MEMCG_SWAP && !do_memsw_account())
continue;
nr = memcg_page_state_local(memcg, memcg1_stats[i]);
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
/* NR_ANON_THPS is scaled by HPAGE_PMD_NR before the byte conversion. */
if (memcg1_stats[i] == NR_ANON_THPS)
nr *= HPAGE_PMD_NR;
#endif
seq_printf(m, "%s %lu\n", memcg1_stat_names[i], nr * PAGE_SIZE);
}
for (i = 0; i < ARRAY_SIZE(memcg1_events); i++)
seq_printf(m, "%s %lu\n", vm_event_name(memcg1_events[i]),
memcg_events_local(memcg, memcg1_events[i]));
for (i = 0; i < NR_LRU_LISTS; i++)
seq_printf(m, "%s %lu\n", lru_list_name(i),
memcg_page_state_local(memcg, NR_LRU_BASE + i) *
PAGE_SIZE);
/* Hierarchical information */
/* The effective limit is the smallest max found on the way to the root. */
memory = memsw = PAGE_COUNTER_MAX;
for (mi = memcg; mi; mi = parent_mem_cgroup(mi)) {
memory = min(memory, READ_ONCE(mi->memory.max));
memsw = min(memsw, READ_ONCE(mi->memsw.max));
}
seq_printf(m, "hierarchical_memory_limit %llu\n",
(u64)memory * PAGE_SIZE);
if (do_memsw_account())
seq_printf(m, "hierarchical_memsw_limit %llu\n",
(u64)memsw * PAGE_SIZE);
/* Same stats again, now from the hierarchical counters. */
for (i = 0; i < ARRAY_SIZE(memcg1_stats); i++) {
unsigned long nr;
if (memcg1_stats[i] == MEMCG_SWAP && !do_memsw_account())
continue;
nr = memcg_page_state(memcg, memcg1_stats[i]);
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
if (memcg1_stats[i] == NR_ANON_THPS)
nr *= HPAGE_PMD_NR;
#endif
seq_printf(m, "total_%s %llu\n", memcg1_stat_names[i],
(u64)nr * PAGE_SIZE);
}
for (i = 0; i < ARRAY_SIZE(memcg1_events); i++)
seq_printf(m, "total_%s %llu\n",
vm_event_name(memcg1_events[i]),
(u64)memcg_events(memcg, memcg1_events[i]));
for (i = 0; i < NR_LRU_LISTS; i++)
seq_printf(m, "total_%s %llu\n", lru_list_name(i),
(u64)memcg_page_state(memcg, NR_LRU_BASE + i) *
PAGE_SIZE);
#ifdef CONFIG_DEBUG_VM
{
/* Debug only: per-node lruvec costs summed over all online nodes. */
pg_data_t *pgdat;
struct mem_cgroup_per_node *mz;
unsigned long anon_cost = 0;
unsigned long file_cost = 0;
for_each_online_pgdat(pgdat) {
mz = mem_cgroup_nodeinfo(memcg, pgdat->node_id);
anon_cost += mz->lruvec.anon_cost;
file_cost += mz->lruvec.file_cost;
}
seq_printf(m, "anon_cost %lu\n", anon_cost);
seq_printf(m, "file_cost %lu\n", file_cost);
}
#endif
#ifdef CONFIG_MEM_QOS
/* The "in_background" lines report the kswapd scan/steal event counts. */
seq_printf(m, "pgscan_in_background %lu\n",
memcg_events(memcg, PGSCAN_KSWAPD));
seq_printf(m, "pgsteal_in_background %lu\n",
memcg_events(memcg, PGSTEAL_KSWAPD));
#endif
seq_printf(m, "workingset_refault_anon %lu\n",
memcg_page_state(memcg, WORKINGSET_REFAULT_ANON));
seq_printf(m, "workingset_refault_file %lu\n",
memcg_page_state(memcg, WORKINGSET_REFAULT_FILE));
seq_printf(m, "workingset_activate_anon %lu\n",
memcg_page_state(memcg, WORKINGSET_ACTIVATE_ANON));
seq_printf(m, "workingset_activate_file %lu\n",
memcg_page_state(memcg, WORKINGSET_ACTIVATE_FILE));
seq_printf(m, "workingset_restore_anon %lu\n",
memcg_page_state(memcg, WORKINGSET_RESTORE_ANON));
seq_printf(m, "workingset_restore_file %lu\n",
memcg_page_state(memcg, WORKINGSET_RESTORE_FILE));
seq_printf(m, "workingset_nodereclaim %lu\n",
memcg_page_state(memcg, WORKINGSET_NODERECLAIM));
#ifdef CONFIG_WORKINGSET_EVICT_EVAL
{
/*
 * Per-lruvec refault distance and watermark values, summed over all
 * N_MEMORY nodes. Index 0/1/2 of the avg arrays is printed as the
 * 1m/10m/30m average.
 * NOTE(review): the sums are declared long but printed with %lu;
 * confirm they can never be negative.
 */
int nid;
long workingset_distance_last = 0;
long workingset_distance_avg[3] = {0};
long workingset_watermark_last = 0;
long workingset_watermark_avg[3] = {0};
for_each_node_state(nid, N_MEMORY) {
struct lruvec *lruvec = &memcg->nodeinfo[nid]->lruvec;
workingset_distance_last += READ_ONCE(lruvec->workingset_refault_distance_last);
workingset_distance_avg[0] += READ_ONCE(lruvec->workingset_refault_distance_avg[0]);
workingset_distance_avg[1] += READ_ONCE(lruvec->workingset_refault_distance_avg[1]);
workingset_distance_avg[2] += READ_ONCE(lruvec->workingset_refault_distance_avg[2]);
workingset_watermark_last += READ_ONCE(lruvec->workingset_watermark_last);
workingset_watermark_avg[0] += READ_ONCE(lruvec->workingset_watermark_avg[0]);
workingset_watermark_avg[1] += READ_ONCE(lruvec->workingset_watermark_avg[1]);
workingset_watermark_avg[2] += READ_ONCE(lruvec->workingset_watermark_avg[2]);
}
seq_printf(m, "workingset_refault_distance_last %lu\n",
workingset_distance_last);
seq_printf(m, "workingset_refault_distance_avg_1m %lu\n",
workingset_distance_avg[0]);
seq_printf(m, "workingset_refault_distance_avg_10m %lu\n",
workingset_distance_avg[1]);
seq_printf(m, "workingset_refault_distance_avg_30m %lu\n",
workingset_distance_avg[2]);
seq_printf(m, "workingset_watermark_last %lu\n",
workingset_watermark_last);
seq_printf(m, "workingset_watermark_avg_1m %lu\n",
workingset_watermark_avg[0]);
seq_printf(m, "workingset_watermark_avg_10m %lu\n",
workingset_watermark_avg[1]);
seq_printf(m, "workingset_watermark_avg_30m %lu\n",
workingset_watermark_avg[2]);
}
/* The valid-eviction values are kept per memcg rather than per lruvec. */
seq_printf(m, "workingset_valid_eviction_last %lu\n",
READ_ONCE(memcg->workingset_valid_eviction_last));
seq_printf(m, "workingset_valid_eviction_avg_1m %lu\n",
READ_ONCE(memcg->workingset_valid_eviction_avg[0]));
seq_printf(m, "workingset_valid_eviction_avg_10m %lu\n",
READ_ONCE(memcg->workingset_valid_eviction_avg[1]));
seq_printf(m, "workingset_valid_eviction_avg_30m %lu\n",
READ_ONCE(memcg->workingset_valid_eviction_avg[2]));
#endif
mem_cgroup_mst_show_mem_spd_max(memcg, m);
mem_cgroup_mst_show_nr_throttled(memcg, m);
return 0;
}
/* Read handler: report the swappiness mem_cgroup_swappiness() yields for @css. */
static u64 mem_cgroup_swappiness_read(struct cgroup_subsys_state *css,
				      struct cftype *cft)
{
	return mem_cgroup_swappiness(mem_cgroup_from_css(css));
}
/*
 * Write handler for swappiness.
 *
 * Values above 200 are rejected with -EINVAL. A css without a parent
 * updates the global vm_swappiness; any other css updates the swappiness of
 * its own memcg. Returns 0 on success.
 */
static int mem_cgroup_swappiness_write(struct cgroup_subsys_state *css,
				       struct cftype *cft, u64 val)
{
	struct mem_cgroup *memcg = mem_cgroup_from_css(css);

	if (val > 200)
		return -EINVAL;

	if (!css->parent) {
		vm_swappiness = val;
		return 0;
	}

	memcg->swappiness = val;
	return 0;
}
/*
 * Write handler that applies a swappiness value to a whole subtree.
 *
 * Values above 200 are rejected with -EINVAL. Otherwise every memcg visited
 * by the subtree walk rooted at @css receives @val; when the walk visits
 * the root memcg the global vm_swappiness is updated instead. Returns 0 on
 * success.
 */
static int mem_cgroup_swappiness_traverse_write(struct cgroup_subsys_state *css,
						struct cftype *cft, u64 val)
{
	struct mem_cgroup *target_memcg = mem_cgroup_from_css(css);
	struct mem_cgroup *memcg;

	if (val > 200)
		return -EINVAL;

	/*
	 * Use the regular subtree iterator rather than an open-coded
	 * do/while: it checks the iterator result before the loop body
	 * runs, so a NULL first result can never be dereferenced.
	 */
	for_each_mem_cgroup_tree(memcg, target_memcg) {
		if (mem_cgroup_is_root(memcg))
			vm_swappiness = val;
		else
			memcg->swappiness = val;
	}

	return 0;
}
#ifdef CONFIG_MEM_QOS
static u64 mem_cgroup_latency_read(struct cgroup_subsys_state *css,
struct cftype *cft)
{
struct mem_cgroup *memcg = mem_cgroup_from_css(css);
return memcg->latency;
}
/*
 * Write handler: store @val as the cgroup's latency setting.
 * A value of zero means the latency QoS is not enabled. Always returns 0.
 */
static int mem_cgroup_latency_write(struct cgroup_subsys_state *css,
				    struct cftype *cft, u64 val)
{
	mem_cgroup_from_css(css)->latency = val;

	return 0;
}
#endif
#ifdef CONFIG_MEM_QOS
/*
 * Print the task-exit latency histogram of @memcg to @m.
 *
 * Output is a header row with one power-of-two millisecond label per
 * bucket, a row with the bucket counts, and a final line with the maximum
 * latency. Note that printing is destructive: every bucket and the maximum
 * are reset to zero once they have been emitted.
 */
static void memory_latency_hist_print(struct seq_file *m, struct mem_cgroup *memcg)
{
	unsigned int bucket;

	/* Header row. */
	for (bucket = 0; bucket < MEM_LATENCY_HIST_MAX; bucket++)
		seq_printf(m, " 2^%2d(ms)", bucket);
	seq_printf(m, "\n");

	/* Counts row; each bucket is cleared right after it is printed. */
	for (bucket = 0; bucket < MEM_LATENCY_HIST_MAX; bucket++) {
		seq_printf(m, "%10u", memcg->exit_latency_hist[bucket]);
		memcg->exit_latency_hist[bucket] = 0;
	}
	seq_printf(m, "\n");

	seq_printf(m, "max latency(ns): %lu\n", memcg->max_exit_latency);
	memcg->max_exit_latency = 0;
}
/*
 * seq_file show handler for the exit-latency histogram of the cgroup behind
 * @m. Reading it also clears the histogram (see memory_latency_hist_print()).
 * Always returns 0.
 */
static int memory_exit_latency_stat_show(struct seq_file *m, void *v)
{
	memory_latency_hist_print(m, mem_cgroup_from_seq(m));

	return 0;
}
/*
 * Throttle time, in milliseconds.
 * NOTE(review): not referenced in this part of the file; presumably the
 * budget from which per-task mem_throttle_ms is derived - confirm at the
 * users.
 */
unsigned int total_throttle_time = 100;
/*
 * Put the current task to sleep for its pending throttle time, if any.
 *
 * current->mem_throttle_ms holds the delay in milliseconds. The sleep is
 * interruptible, is bracketed by psi_memstall_enter()/psi_memstall_leave(),
 * and the pending delay is cleared afterwards.
 */
void memcg_throttle(void)
{
	unsigned long pflags;
	unsigned int delay_ms = current->mem_throttle_ms;

	if (likely(delay_ms == 0))
		return;

	psi_memstall_enter(&pflags);
	msleep_interruptible(delay_ms);
	psi_memstall_leave(&pflags);

	current->mem_throttle_ms = 0;
}
extern struct mutex cgroup_mutex;
/*
 * Store @val as the wmark_ratio of @memcg and then walk its subtree: any
 * cgroup whose ratio is lower than the non-zero ratio of its parent is
 * raised to the parent's value. The walk includes @memcg itself, so @memcg
 * is raised as well when its own parent demands a higher ratio.
 *
 * The whole update runs under cgroup_mutex.
 */
static void memcg_update_tree_ratio(struct mem_cgroup *memcg, int val)
{
	struct mem_cgroup *pos;

	mutex_lock(&cgroup_mutex);

	memcg->wmark_ratio = val;

	for_each_mem_cgroup_tree(pos, memcg) {
		struct mem_cgroup *up = parent_mem_cgroup(pos);

		/* No parent, or a parent with ratio 0: nothing to enforce. */
		if (!up || !up->wmark_ratio)
			continue;

		if (pos->wmark_ratio < up->wmark_ratio)
			pos->wmark_ratio = up->wmark_ratio;
	}

	mutex_unlock(&cgroup_mutex);
}
/*
 * Write handler for the watermark ratio.
 *
 * @buf is parsed as an integer (base auto-detected by kstrtoint()) and must
 * lie within [WMARK_RATIO_MIN, WMARK_RATIO_MAX]. Returns @nbytes on
 * success, the kstrtoint() error on a parse failure, or -EINVAL when the
 * value is out of range.
 */
static ssize_t memcg_wmark_ratio_write(struct kernfs_open_file *of,
				       char *buf, size_t nbytes, loff_t off)
{
	struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
	int ratio;
	int err;

	buf = strstrip(buf);
	if (!buf)
		return -EINVAL;

	err = kstrtoint(buf, 0, &ratio);
	if (err)
		return err;

	if (!(ratio >= WMARK_RATIO_MIN && ratio <= WMARK_RATIO_MAX))
		return -EINVAL;

	/*
	 * Apply the value and propagate it through the subtree so that no
	 * cgroup ends up with a ratio below its parent's non-zero ratio.
	 */
	memcg_update_tree_ratio(memcg, ratio);

	return nbytes;
}
static int memcg_wmark_ratio_show(struct seq_file *m, void *v)
{
struct mem_cgroup *memcg = mem_cgroup_from_seq(m);
seq_printf(m, "%d\n", memcg->wmark_ratio);
return 0;
}
#endif
#ifdef CONFIG_ASYNC_FORK
/* Serializes writers in mem_cgroup_async_fork_write(). */
static DEFINE_MUTEX(async_fork_write_lock);
static u64 mem_cgroup_async_fork_read(struct cgroup_subsys_state *css,
struct cftype *cft)
{
struct mem_cgroup *memcg = mem_cgroup_from_css(css);
return memcg->async_fork;
}
/*
 * Write handler that enables (non-zero @val) or disables (zero) async fork
 * for the cgroup.
 *
 * A write that does not change the current setting is a no-op. Otherwise
 * the async_fork_enabled_key static branch is incremented or decremented
 * to match and the new setting is stored. The whole sequence runs under
 * async_fork_write_lock.
 *
 * Returns 0 on success, or -EINVAL with CONFIG_XFORK when
 * memcg->kabi_reserved1 is set.
 */
static int mem_cgroup_async_fork_write(struct cgroup_subsys_state *css,
				       struct cftype *cft, u64 val)
{
	struct mem_cgroup *memcg = mem_cgroup_from_css(css);
	u64 enable = !!val;
	int ret = 0;

	mutex_lock(&async_fork_write_lock);

	if (memcg->async_fork == enable)
		goto unlock;

#ifdef CONFIG_XFORK
	/*
	 * NOTE(review): kabi_reserved1 presumably carries the xfork state of
	 * this memcg - confirm. The refusal must go through the unlock
	 * label: returning directly would leave async_fork_write_lock held
	 * and block every later writer.
	 */
	if (memcg->kabi_reserved1) {
		ret = -EINVAL;
		goto unlock;
	}
#endif

	if (enable)
		static_branch_inc(&async_fork_enabled_key);
	else
		static_branch_dec(&async_fork_enabled_key);
	memcg->async_fork = enable;

unlock:
	mutex_unlock(&async_fork_write_lock);
	return ret;
}
#endif
/*
 * Signal the eventfds of all thresholds that were crossed since the last
 * call and update current_threshold.
 *
 * @swap selects the memsw threshold array and memsw usage instead of the
 * plain memory ones. The primary array is read under rcu_read_lock();
 * nothing happens when no array is installed.
 */
static void __mem_cgroup_threshold(struct mem_cgroup *memcg, bool swap)
{
struct mem_cgroup_threshold_ary *t;
unsigned long usage;
int i;
rcu_read_lock();
if (!swap)
t = rcu_dereference(memcg->thresholds.primary);
else
t = rcu_dereference(memcg->memsw_thresholds.primary);
if (!t)
goto unlock;
usage = mem_cgroup_usage(memcg, swap);
/*
* current_threshold points to threshold just below or equal to usage.
* If it's not true, a threshold was crossed after last
* call of __mem_cgroup_threshold().
*/
i = t->current_threshold;
/*
* Iterate backward over array of thresholds starting from
* current_threshold and check if a threshold is crossed.
* If none of thresholds below usage is crossed, we read
* only one element of the array here.
*/
for (; i >= 0 && unlikely(t->entries[i].threshold > usage); i--)
eventfd_signal(t->entries[i].eventfd, 1);
/* i = current_threshold + 1 */
i++;
/*
* Iterate forward over array of thresholds starting from
* current_threshold+1 and check if a threshold is crossed.
* If none of thresholds above usage is crossed, we read
* only one element of the array here.
*/
for (; i < t->size && unlikely(t->entries[i].threshold <= usage); i++)
eventfd_signal(t->entries[i].eventfd, 1);
/* Update current_threshold */
t->current_threshold = i - 1;
unlock:
rcu_read_unlock();
}
/*
 * Run the threshold check for @memcg and each of its ancestors: always for
 * memory usage, and additionally for memory+swap usage when memsw
 * accounting is active.
 */
static void mem_cgroup_threshold(struct mem_cgroup *memcg)
{
	for (; memcg; memcg = parent_mem_cgroup(memcg)) {
		__mem_cgroup_threshold(memcg, false);
		if (do_memsw_account())
			__mem_cgroup_threshold(memcg, true);
	}
}
/*
 * sort() comparator ordering struct mem_cgroup_threshold entries by
 * ascending ->threshold. Returns 1, -1 or 0.
 */
static int compare_thresholds(const void *a, const void *b)
{
	const struct mem_cgroup_threshold *lhs = a;
	const struct mem_cgroup_threshold *rhs = b;

	/* Comparison results are 0 or 1, so this yields exactly -1, 0 or 1. */
	return (lhs->threshold > rhs->threshold) -
	       (lhs->threshold < rhs->threshold);
}
static int mem_cgroup_oom_notify_cb(struct mem_cgroup *memcg)
{
struct mem_cgroup_eventfd_list *ev;
spin_lock(&memcg_oom_lock);
list_for_each_entry(ev, &memcg->oom_notify, list)
eventfd_signal(ev->eventfd, 1);
spin_unlock(&memcg_oom_lock);
return 0;
}
static void mem_cgroup_oom_notify(struct mem_cgroup *memcg)
{
struct mem_cgroup *iter;
for_each_mem_cgroup_tree(iter, memcg)
mem_cgroup_oom_notify_cb(iter);
}
/*
 * Register @eventfd to be signalled when usage crosses the threshold
 * described by @args.
 *
 * @args is parsed with page_counter_memparse(); @type must be _MEM or
 * _MEMSWAP (anything else is a BUG()) and selects the threshold set and the
 * usage that is compared. A new, sorted array containing the old entries
 * plus the new one is built and published with rcu_assign_pointer(); the
 * previous primary array is kept as the spare. Runs under
 * memcg->thresholds_lock.
 *
 * Returns 0 on success, the parse error, or -ENOMEM.
 */
static int __mem_cgroup_usage_register_event(struct mem_cgroup *memcg,
struct eventfd_ctx *eventfd, const char *args, enum res_type type)
{
struct mem_cgroup_thresholds *thresholds;
struct mem_cgroup_threshold_ary *new;
unsigned long threshold;
unsigned long usage;
int i, size, ret;
ret = page_counter_memparse(args, "-1", &threshold);
if (ret)
return ret;
mutex_lock(&memcg->thresholds_lock);
if (type == _MEM) {
thresholds = &memcg->thresholds;
usage = mem_cgroup_usage(memcg, false);
} else if (type == _MEMSWAP) {
thresholds = &memcg->memsw_thresholds;
usage = mem_cgroup_usage(memcg, true);
} else
BUG();
/* Check if a threshold crossed before adding a new one */
if (thresholds->primary)
__mem_cgroup_threshold(memcg, type == _MEMSWAP);
size = thresholds->primary ? thresholds->primary->size + 1 : 1;
/* Allocate memory for new array of thresholds */
new = kmalloc(struct_size(new, entries, size), GFP_KERNEL);
if (!new) {
ret = -ENOMEM;
goto unlock;
}
new->size = size;
/* Copy thresholds (if any) to new array */
if (thresholds->primary) {
memcpy(new->entries, thresholds->primary->entries, (size - 1) *
sizeof(struct mem_cgroup_threshold));
}
/* Add new threshold */
new->entries[size - 1].eventfd = eventfd;
new->entries[size - 1].threshold = threshold;
/* Sort thresholds. Registering of new threshold isn't time-critical */
sort(new->entries, size, sizeof(struct mem_cgroup_threshold),
compare_thresholds, NULL);
/* Find current threshold */
/* -1 means that no threshold is at or below the current usage. */
new->current_threshold = -1;
for (i = 0; i < size; i++) {
if (new->entries[i].threshold <= usage) {
/*
* new->current_threshold will not be used until
* rcu_assign_pointer(), so it's safe to increment
* it here.
*/
++new->current_threshold;
} else
break;
}
/* Free old spare buffer and save old primary buffer as spare */
kfree(thresholds->spare);
thresholds->spare = thresholds->primary;
rcu_assign_pointer(thresholds->primary, new);
/* To be sure that nobody uses thresholds */
synchronize_rcu();
unlock:
mutex_unlock(&memcg->thresholds_lock);
return ret;
}
/* Register a usage threshold event on the memory (_MEM) counter. */
static int mem_cgroup_usage_register_event(struct mem_cgroup *memcg,
struct eventfd_ctx *eventfd, const char *args)
{
return __mem_cgroup_usage_register_event(memcg, eventfd, args, _MEM);
}
/* Register a usage threshold event on the memory+swap (_MEMSWAP) counter. */
static int memsw_cgroup_usage_register_event(struct mem_cgroup *memcg,
struct eventfd_ctx *eventfd, const char *args)
{
return __mem_cgroup_usage_register_event(memcg, eventfd, args, _MEMSWAP);
}
/*
 * Remove every threshold registered for @eventfd from the set selected by
 * @type (_MEM or _MEMSWAP; anything else is a BUG()).
 *
 * The remaining entries are copied into the spare array, which is then
 * published as the new primary while the old primary becomes the spare.
 * When no entries remain, the primary is set to NULL and both arrays are
 * freed. Nothing changes if @eventfd has no entries. Runs under
 * memcg->thresholds_lock.
 */
static void __mem_cgroup_usage_unregister_event(struct mem_cgroup *memcg,
struct eventfd_ctx *eventfd, enum res_type type)
{
struct mem_cgroup_thresholds *thresholds;
struct mem_cgroup_threshold_ary *new;
unsigned long usage;
int i, j, size, entries;
mutex_lock(&memcg->thresholds_lock);
if (type == _MEM) {
thresholds = &memcg->thresholds;
usage = mem_cgroup_usage(memcg, false);
} else if (type == _MEMSWAP) {
thresholds = &memcg->memsw_thresholds;
usage = mem_cgroup_usage(memcg, true);
} else
BUG();
if (!thresholds->primary)
goto unlock;
/* Check if a threshold crossed before removing */
__mem_cgroup_threshold(memcg, type == _MEMSWAP);
/* Calculate new number of threshold */
/* size counts the entries that stay, entries the ones being removed. */
size = entries = 0;
for (i = 0; i < thresholds->primary->size; i++) {
if (thresholds->primary->entries[i].eventfd != eventfd)
size++;
else
entries++;
}
/* The spare array is reused as storage for the shrunken array. */
new = thresholds->spare;
/* If no items related to eventfd have been cleared, nothing to do */
if (!entries)
goto unlock;
/* Set thresholds array to NULL if we don't have thresholds */
if (!size) {
kfree(new);
new = NULL;
goto swap_buffers;
}
new->size = size;
/* Copy thresholds and find current threshold */
new->current_threshold = -1;
for (i = 0, j = 0; i < thresholds->primary->size; i++) {
if (thresholds->primary->entries[i].eventfd == eventfd)
continue;
new->entries[j] = thresholds->primary->entries[i];
if (new->entries[j].threshold <= usage) {
/*
* new->current_threshold will not be used
* until rcu_assign_pointer(), so it's safe to increment
* it here.
*/
++new->current_threshold;
}
j++;
}
swap_buffers:
/* Swap primary and spare array */
thresholds->spare = thresholds->primary;
rcu_assign_pointer(thresholds->primary, new);
/* To be sure that nobody uses thresholds */
synchronize_rcu();
/* If all events are unregistered, free the spare array */
if (!new) {
kfree(thresholds->spare);
thresholds->spare = NULL;
}
unlock:
mutex_unlock(&memcg->thresholds_lock);
}
/* Drop @eventfd's usage thresholds from the memory (_MEM) counter. */
static void mem_cgroup_usage_unregister_event(struct mem_cgroup *memcg,
					      struct eventfd_ctx *eventfd)
{
	__mem_cgroup_usage_unregister_event(memcg, eventfd, _MEM);
}
/* Drop @eventfd's usage thresholds from the memory+swap (_MEMSWAP) counter. */
static void memsw_cgroup_usage_unregister_event(struct mem_cgroup *memcg,
						struct eventfd_ctx *eventfd)
{
	__mem_cgroup_usage_unregister_event(memcg, eventfd, _MEMSWAP);
}
static int mem_cgroup_oom_register_event(struct mem_cgroup *memcg,
struct eventfd_ctx *eventfd, const char *args)
{
struct mem_cgroup_eventfd_list *event;
event = kmalloc(sizeof(*event), GFP_KERNEL);
if (!event)
return -ENOMEM;
spin_lock(&memcg_oom_lock);
event->eventfd = eventfd;
list_add(&event->list, &memcg->oom_notify);
/* already in OOM ? */
if (memcg->under_oom)
eventfd_signal(eventfd, 1);
spin_unlock(&memcg_oom_lock);
return 0;
}
/*
 * Remove and free every OOM notification entry of @memcg that refers to
 * @eventfd. The list is walked under memcg_oom_lock.
 */
static void mem_cgroup_oom_unregister_event(struct mem_cgroup *memcg,
	struct eventfd_ctx *eventfd)
{
	struct mem_cgroup_eventfd_list *entry, *next;

	spin_lock(&memcg_oom_lock);

	list_for_each_entry_safe(entry, next, &memcg->oom_notify, list) {
		if (entry->eventfd != eventfd)
			continue;

		list_del(&entry->list);
		kfree(entry);
	}

	spin_unlock(&memcg_oom_lock);
}
static int mem_cgroup_oom_control_read(struct seq_file *sf, void *v)
{
struct mem_cgroup *memcg = mem_cgroup_from_seq(sf);
seq_printf(sf, "oom_kill_disable %d\n", memcg->oom_kill_disable);
seq_printf(sf, "under_oom %d\n", (bool)memcg->under_oom);
seq_printf(sf, "oom_kill %lu\n",
atomic_long_read(&memcg->memory_events[MEMCG_OOM_KILL]));
return 0;
}
/*
 * Write handler for the OOM control file.
 *
 * Only 0 and 1 are accepted, and never on a css without a parent (the root
 * cgroup); everything else gets -EINVAL. Writing 0 re-enables the OOM
 * killer and triggers memcg_oom_recover(). Returns 0 on success.
 */
static int mem_cgroup_oom_control_write(struct cgroup_subsys_state *css,
	struct cftype *cft, u64 val)
{
	struct mem_cgroup *memcg = mem_cgroup_from_css(css);

	if (!css->parent)
		return -EINVAL;
	if (val > 1)
		return -EINVAL;

	memcg->oom_kill_disable = val;
	if (val == 0)
		memcg_oom_recover(memcg);

	return 0;
}
#ifdef CONFIG_CGROUP_WRITEBACK
#include <trace/events/writeback.h>
/* Initialize @memcg's writeback domain; returns wb_domain_init()'s result. */
static int memcg_wb_domain_init(struct mem_cgroup *memcg, gfp_t gfp)
{
return wb_domain_init(&memcg->cgwb_domain, gfp);
}
/* Tear down @memcg's writeback domain. */
static void memcg_wb_domain_exit(struct mem_cgroup *memcg)
{
wb_domain_exit(&memcg->cgwb_domain);
}
/* Forward a size change to @memcg's writeback domain. */
static void memcg_wb_domain_size_changed(struct mem_cgroup *memcg)
{
wb_domain_size_changed(&memcg->cgwb_domain);
}
/*
 * Return the writeback domain of the memcg that @wb belongs to, or NULL
 * when that memcg has no parent css (i.e. it is the root).
 */
struct wb_domain *mem_cgroup_wb_domain(struct bdi_writeback *wb)
{
	struct mem_cgroup *memcg = mem_cgroup_from_css(wb->memcg_css);

	return memcg->css.parent ? &memcg->cgwb_domain : NULL;
}
/*
* idx can be of type enum memcg_stat_item or node_stat_item.
* Keep in sync with memcg_exact_page().
*/
static unsigned long memcg_exact_page_state(struct mem_cgroup *memcg, int idx)
{
long x = atomic_long_read(&memcg->vmstats[idx]);
int cpu;
for_each_online_cpu(cpu)
x += per_cpu_ptr(memcg->vmstats_percpu, cpu)->stat[idx];
if (x < 0)
x = 0;
return x;
}
/**
* mem_cgroup_wb_stats - retrieve writeback related stats from its memcg
* @wb: bdi_writeback in question
* @pfilepages: out parameter for number of file pages
* @pheadroom: out parameter for number of allocatable pages according to memcg
* @pdirty: out parameter for number of dirty pages
* @pwriteback: out parameter for number of pages under writeback
*
* Determine the numbers of file, headroom, dirty, and writeback pages in
* @wb's memcg. File, dirty and writeback are self-explanatory. Headroom
* is a bit more involved.
*
* A memcg's headroom is "min(max, high) - used". In the hierarchy, the
* headroom is calculated as the lowest headroom of itself and the
* ancestors. Note that this doesn't consider the actual amount of
* available memory in the system. The caller should further cap
* *@pheadroom accordingly.
*/
void mem_cgroup_wb_stats(struct bdi_writeback *wb, unsigned long *pfilepages,
unsigned long *pheadroom, unsigned long *pdirty,
unsigned long *pwriteback)
{
struct mem_cgroup *memcg = mem_cgroup_from_css(wb->memcg_css);
struct mem_cgroup *parent;
*pdirty = memcg_exact_page_state(memcg, NR_FILE_DIRTY);
/* this should eventually include NR_UNSTABLE_NFS */
*pwriteback = memcg_exact_page_state(memcg, NR_WRITEBACK);
/* File pages are taken as the inactive plus active file LRU sizes. */
*pfilepages = memcg_exact_page_state(memcg, NR_INACTIVE_FILE) +
memcg_exact_page_state(memcg, NR_ACTIVE_FILE);
*pheadroom = PAGE_COUNTER_MAX;
/* Walk towards the root; a memcg without a parent is not evaluated. */
while ((parent = parent_mem_cgroup(memcg))) {
unsigned long ceiling = min(READ_ONCE(memcg->memory.max),
READ_ONCE(memcg->high));
unsigned long used = page_counter_read(&memcg->memory);
/*
 * For ancestors of @wb's memcg (not for that memcg itself), clean file
 * pages - file minus dirty minus writeback - are not counted as used.
 */
if (memcg != mem_cgroup_from_css(wb->memcg_css)) {
unsigned long file, dirty, writeback;
file = memcg_exact_page_state(memcg, NR_FILE_PAGES);
dirty = memcg_exact_page_state(memcg, NR_FILE_DIRTY);
writeback = memcg_exact_page_state(memcg, NR_WRITEBACK);
used -= file - dirty - writeback;
}
/* min(ceiling, used) keeps the subtraction from going below zero. */
*pheadroom = min(*pheadroom, ceiling - min(ceiling, used));
memcg = parent;
}
}
/*
* Foreign dirty flushing
*
* There's an inherent mismatch between memcg and writeback. The former
* tracks ownership per-page while the latter per-inode. This was a
* deliberate design decision because honoring per-page ownership in the
* writeback path is complicated, may lead to higher CPU and IO overheads
* and deemed unnecessary given that write-sharing an inode across
* different cgroups isn't a common use-case.
*
* Combined with inode majority-writer ownership switching, this works well
* enough in most cases but there are some pathological cases. For
* example, let's say there are two cgroups A and B which keep writing to
* different but confined parts of the same inode. B owns the inode and
* A's memory is limited far below B's. A's dirty ratio can rise enough to
* trigger balance_dirty_pages() sleeps but B's can be low enough to avoid
* triggering background writeback. A will be slowed down without a way to
* make writeback of the dirty pages happen.
*
* Conditions like the above can lead to a cgroup getting repeatedly and
* severely throttled after making some progress after each
* dirty_expire_interval while the underlying IO device is almost
* completely idle.
*
* Solving this problem completely requires matching the ownership tracking
* granularities between memcg and writeback in either direction. However,
* the more egregious behaviors can be avoided by simply remembering the
* most recent foreign dirtying events and initiating remote flushes on
* them when local writeback isn't enough to keep the memory clean enough.
*
* The following two functions implement such mechanism. When a foreign
* page - a page whose memcg and writeback ownerships don't match - is
* dirtied, mem_cgroup_track_foreign_dirty() records the inode owning
* bdi_writeback on the page owning memcg. When balance_dirty_pages()
* decides that the memcg needs to sleep due to high dirty ratio, it calls
* mem_cgroup_flush_foreign() which queues writeback on the recorded
* foreign bdi_writebacks which haven't expired. Both the numbers of
* recorded bdi_writebacks and concurrent in-flight foreign writebacks are
* limited to MEMCG_CGWB_FRN_CNT.
*
* The mechanism only remembers IDs and doesn't hold any object references.
* As being wrong occasionally doesn't matter, updates and accesses to the
* records are lockless and racy.
*/
/*
 * Record on @page's memcg that a foreign bdi_writeback (@wb) owns dirty
 * pages of it.
 *
 * The memcg keeps MEMCG_CGWB_FRN_CNT record slots. An existing slot for the
 * same bdi/memcg id pair is reused and only has its timestamp refreshed;
 * otherwise the oldest slot that has no flush in flight is overwritten. If
 * neither exists, nothing is recorded.
 */
void mem_cgroup_track_foreign_dirty_slowpath(struct page *page,
struct bdi_writeback *wb)
{
struct mem_cgroup *memcg = page->mem_cgroup;
struct memcg_cgwb_frn *frn;
u64 now = get_jiffies_64();
u64 oldest_at = now;
int oldest = -1;
int i;
trace_track_foreign_dirty(page, wb);
/*
* Pick the slot to use. If there is already a slot for @wb, keep
* using it. If not replace the oldest one which isn't being
* written out.
*/
for (i = 0; i < MEMCG_CGWB_FRN_CNT; i++) {
frn = &memcg->cgwb_frn[i];
if (frn->bdi_id == wb->bdi->id &&
frn->memcg_id == wb->memcg_css->id)
break;
/* A done.cnt of 1 is treated as "no writeback in flight" for the slot. */
if (time_before64(frn->at, oldest_at) &&
atomic_read(&frn->done.cnt) == 1) {
oldest = i;
oldest_at = frn->at;
}
}
/* The loop broke early, so frn points at the matching slot. */
if (i < MEMCG_CGWB_FRN_CNT) {
/*
* Re-using an existing one. Update timestamp lazily to
* avoid making the cacheline hot. We want them to be
* reasonably up-to-date and significantly shorter than
* dirty_expire_interval as that's what expires the record.
* Use the shorter of 1s and dirty_expire_interval / 8.
*/
unsigned long update_intv =
min_t(unsigned long, HZ,
msecs_to_jiffies(dirty_expire_interval * 10) / 8);
if (time_before64(frn->at, now - update_intv))
frn->at = now;
} else if (oldest >= 0) {
/* replace the oldest free one */
frn = &memcg->cgwb_frn[oldest];
frn->bdi_id = wb->bdi->id;
frn->memcg_id = wb->memcg_css->id;
frn->at = now;
}
}
/*
 * Issue foreign writeback flushes for recorded foreign dirtying events.
 *
 * Every record slot of @wb's memcg that is younger than
 * dirty_expire_interval and has no flush in flight gets its timestamp
 * cleared and a writeback request queued for the recorded bdi/memcg id
 * pair.
 */
void mem_cgroup_flush_foreign(struct bdi_writeback *wb)
{
	struct mem_cgroup *memcg = mem_cgroup_from_css(wb->memcg_css);
	unsigned long intv = msecs_to_jiffies(dirty_expire_interval * 10);
	/*
	 * Sample the clock with get_jiffies_64(), exactly like the recording
	 * side does: a plain read of jiffies_64 may be torn on 32-bit
	 * machines, which would make the age comparison below meaningless.
	 */
	u64 now = get_jiffies_64();
	int i;

	for (i = 0; i < MEMCG_CGWB_FRN_CNT; i++) {
		struct memcg_cgwb_frn *frn = &memcg->cgwb_frn[i];

		/*
		 * If the record is older than dirty_expire_interval,
		 * writeback on it has already started.  No need to kick it
		 * off again.  Also, don't start a new one if there's
		 * already one in flight.
		 */
		if (time_after64(frn->at, now - intv) &&
		    atomic_read(&frn->done.cnt) == 1) {
			/* Clearing the timestamp expires the record. */
			frn->at = 0;
			trace_flush_foreign(wb, frn->bdi_id, frn->memcg_id);
			cgroup_writeback_by_id(frn->bdi_id, frn->memcg_id, 0,
					       WB_REASON_FOREIGN_FLUSH,
					       &frn->done);
		}
	}
}
#else /* CONFIG_CGROUP_WRITEBACK */
/*
 * No-op variants used when CONFIG_CGROUP_WRITEBACK is disabled: there is
 * no per-memcg writeback domain to set up, tear down or resize.
 */

/* Nothing to allocate; always reports success. */
static int memcg_wb_domain_init(struct mem_cgroup *memcg, gfp_t gfp)
{
	return 0;
}

/* Nothing to release. */
static void memcg_wb_domain_exit(struct mem_cgroup *memcg)
{
}

/* Nothing to recalculate. */
static void memcg_wb_domain_size_changed(struct mem_cgroup *memcg)
{
}
#endif /* CONFIG_CGROUP_WRITEBACK */
/*
* DO NOT USE IN NEW FILES.
*
* "cgroup.event_control" implementation.
*
* This is way over-engineered. It tries to support fully configurable
* events for each user. Such level of flexibility is completely
* unnecessary especially in the light of the planned unified hierarchy.
*
* Please deprecate this and replace with something simpler if at all
* possible.
*/
/*
 * Tear down one registered event and release what it holds.
 *
 * Runs as the work item stored in event->remove, i.e. from a workqueue.
 */
static void memcg_event_remove(struct work_struct *work)
{
	struct mem_cgroup_event *ev;
	struct mem_cgroup *owner;

	ev = container_of(work, struct mem_cgroup_event, remove);
	/* Remember the memcg now: @ev is freed before the css is put. */
	owner = ev->memcg;

	remove_wait_queue(ev->wqh, &ev->wait);

	ev->unregister_event(owner, ev->eventfd);

	/* Tell userspace that the event is going away. */
	eventfd_signal(ev->eventfd, 1);

	eventfd_ctx_put(ev->eventfd);
	kfree(ev);
	css_put(&owner->css);
}
/*
 * Wait-queue callback installed on the eventfd; acts on EPOLLHUP, which is
 * raised when the user closes the eventfd.
 *
 * Called with wqh->lock held and interrupts disabled.
 *
 * Always returns 0.
 */
static int memcg_event_wake(wait_queue_entry_t *wait, unsigned mode,
			    int sync, void *key)
{
	struct mem_cgroup_event *ev =
		container_of(wait, struct mem_cgroup_event, wait);
	struct mem_cgroup *owner = ev->memcg;

	/* Only a hang-up is of interest here. */
	if (!(key_to_poll(key) & EPOLLHUP))
		return 0;

	/*
	 * An event that was already unlinked at cgroup removal is cleaned
	 * up by the other side, so there is nothing to do for it.
	 *
	 * Freeing cannot race with us: the other side needs wqh->lock for
	 * remove_wait_queue(), and we are holding it.
	 */
	spin_lock(&owner->event_list_lock);
	if (!list_empty(&ev->list)) {
		list_del_init(&ev->list);
		/*
		 * This is atomic context while the removal may sleep, so
		 * hand it over to a workqueue.
		 */
		schedule_work(&ev->remove);
	}
	spin_unlock(&owner->event_list_lock);

	return 0;
}
/*
 * poll_table queueing callback: remember the wait-queue head handed in by
 * the polled file and hook the event's wait entry onto it.
 */
static void memcg_event_ptable_queue_proc(struct file *file,
		wait_queue_head_t *wqh, poll_table *pt)
{
	struct mem_cgroup_event *ev;

	ev = container_of(pt, struct mem_cgroup_event, pt);
	ev->wqh = wqh;
	add_wait_queue(ev->wqh, &ev->wait);
}
/*
 * DO NOT USE IN NEW FILES.
 *
 * Parse input and register new cgroup event handler.
 *
 * Input must be in format '<event_fd> <control_fd> <args>'.
 * Interpretation of args is defined by control file implementation.
 * <args> may be omitted, in which case the handler receives an empty
 * string.
 *
 * Returns @nbytes on success or a negative errno:
 *   -EINVAL  malformed input, unsupported control file, or a control file
 *            that does not belong to this cgroup
 *   -ENOMEM  the event could not be allocated
 *   -EBADF   one of the two descriptors is not open
 * Errors from eventfd_ctx_fileget(), inode_permission() and the
 * register_event() callback are passed through unchanged.
 */
static ssize_t memcg_write_event_control(struct kernfs_open_file *of,
					 char *buf, size_t nbytes, loff_t off)
{
	struct cgroup_subsys_state *css = of_css(of);
	struct mem_cgroup *memcg = mem_cgroup_from_css(css);
	struct mem_cgroup_event *event;
	struct cgroup_subsys_state *cfile_css;
	unsigned int efd, cfd;
	struct fd efile;
	struct fd cfile;
	struct dentry *cdentry;
	const char *name;
	char *endp;
	int ret;

	buf = strstrip(buf);

	efd = simple_strtoul(buf, &endp, 10);
	if (*endp != ' ')
		return -EINVAL;
	buf = endp + 1;

	cfd = simple_strtoul(buf, &endp, 10);
	/*
	 * The argument string is optional.  When the input stops right
	 * after <control_fd>, leave @buf on the terminating NUL: nothing
	 * beyond it is guaranteed to be mapped or initialised, so stepping
	 * over it would hand register_event() a pointer past the buffer.
	 */
	if (*endp == '\0')
		buf = endp;
	else if (*endp == ' ')
		buf = endp + 1;
	else
		return -EINVAL;

	event = kzalloc(sizeof(*event), GFP_KERNEL);
	if (!event)
		return -ENOMEM;

	event->memcg = memcg;
	INIT_LIST_HEAD(&event->list);
	init_poll_funcptr(&event->pt, memcg_event_ptable_queue_proc);
	init_waitqueue_func_entry(&event->wait, memcg_event_wake);
	INIT_WORK(&event->remove, memcg_event_remove);

	efile = fdget(efd);
	if (!efile.file) {
		ret = -EBADF;
		goto out_kfree;
	}

	event->eventfd = eventfd_ctx_fileget(efile.file);
	if (IS_ERR(event->eventfd)) {
		ret = PTR_ERR(event->eventfd);
		goto out_put_efile;
	}

	cfile = fdget(cfd);
	if (!cfile.file) {
		ret = -EBADF;
		goto out_put_eventfd;
	}

	/* the process need read permission on control file */
	/* AV: shouldn't we check that it's been opened for read instead? */
	ret = inode_permission(file_inode(cfile.file), MAY_READ);
	if (ret < 0)
		goto out_put_cfile;

	/*
	 * The control file must be a regular cgroup1 file. As a regular cgroup
	 * file can't be renamed, it's safe to access its name afterwards.
	 */
	cdentry = cfile.file->f_path.dentry;
	if (cdentry->d_sb->s_type != &cgroup_fs_type || !d_is_reg(cdentry)) {
		ret = -EINVAL;
		goto out_put_cfile;
	}

	/*
	 * Determine the event callbacks and set them in @event.  This used
	 * to be done via struct cftype but cgroup core no longer knows
	 * about these events.  The following is crude but the whole thing
	 * is for compatibility anyway.
	 *
	 * DO NOT ADD NEW FILES.
	 */
	name = cdentry->d_name.name;

	if (!strcmp(name, "memory.usage_in_bytes")) {
		event->register_event = mem_cgroup_usage_register_event;
		event->unregister_event = mem_cgroup_usage_unregister_event;
	} else if (!strcmp(name, "memory.oom_control")) {
		event->register_event = mem_cgroup_oom_register_event;
		event->unregister_event = mem_cgroup_oom_unregister_event;
	} else if (!strcmp(name, "memory.pressure_level")) {
		event->register_event = vmpressure_register_event;
		event->unregister_event = vmpressure_unregister_event;
	} else if (!strcmp(name, "memory.memsw.usage_in_bytes")) {
		event->register_event = memsw_cgroup_usage_register_event;
		event->unregister_event = memsw_cgroup_usage_unregister_event;
	} else {
		ret = -EINVAL;
		goto out_put_cfile;
	}

	/*
	 * Verify @cfile should belong to @css.  Also, remaining events are
	 * automatically removed on cgroup destruction but the removal is
	 * asynchronous, so take an extra ref on @css.
	 */
	cfile_css = css_tryget_online_from_dir(cdentry->d_parent,
					       &memory_cgrp_subsys);
	ret = -EINVAL;
	if (IS_ERR(cfile_css))
		goto out_put_cfile;
	if (cfile_css != css) {
		css_put(cfile_css);
		goto out_put_cfile;
	}

	ret = event->register_event(memcg, event->eventfd, buf);
	if (ret)
		goto out_put_css;

	/* Hooks event->wait onto the eventfd's wait queue via event->pt. */
	vfs_poll(efile.file, &event->pt);

	spin_lock(&memcg->event_list_lock);
	list_add(&event->list, &memcg->event_list);
	spin_unlock(&memcg->event_list_lock);

	fdput(cfile);
	fdput(efile);

	return nbytes;

out_put_css:
	css_put(css);
out_put_cfile:
	fdput(cfile);
out_put_eventfd:
	eventfd_ctx_put(event->eventfd);
out_put_efile:
	fdput(efile);
out_kfree:
	kfree(event);

	return ret;
}
static u64 memcg_meminfo_recursive_read(struct cgroup_subsys_state *css,
struct cftype *cft)
{
return mem_cgroup_from_css(css)->meminfo_recursive;
}
/*
 * Set the memcg's meminfo_recursive setting.
 *
 * Writing the value that is already stored is a no-op.  Otherwise only 0
 * and 1 are accepted; anything else yields -EINVAL.
 */
static int memcg_meminfo_recursive_write(struct cgroup_subsys_state *css,
					 struct cftype *cft, u64 val)
{
	struct mem_cgroup *memcg = mem_cgroup_from_css(css);

	if (val == memcg->meminfo_recursive)
		return 0;

	if (val > 1)
		return -EINVAL;

	memcg->meminfo_recursive = val;
	return 0;
}
static inline unsigned long memcg_read_stat(struct mem_cgroup *memcg,
int idx)
{
unsigned long val = 0;
struct mem_cgroup *iter;
for_each_mem_cgroup_tree(iter, memcg)
val += memcg_page_state(iter, idx);
return val;
}
static unsigned long memcg_nr_lru_pages(struct mem_cgroup *memcg,
unsigned int lru_mask)
{
unsigned long nr = 0;
struct mem_cgroup *iter;
for_each_mem_cgroup_tree(iter, memcg)
nr += mem_cgroup_nr_lru_pages(iter, lru_mask);
return nr;
}
static int mem_cgroup_meminfo_read_comm(struct seq_file *m, void *v, struct mem_cgroup *memcg)
{
u64 mem_limit, mem_usage;
u64 mem_swap, mem_swap_usage;
unsigned long mem_cache, mem_swap_cache;
unsigned long mem_active, mem_inactive;
unsigned long mem_active_anon, mem_inactive_anon;
unsigned long mem_active_file, mem_inactive_file;
unsigned long mem_unevictable;
unsigned long mem_rss, mem_rss_huge;
unsigned long mem_file_map, mem_shmem;
mem_limit = memcg->memory.max;
/* if limit not set, use host ram total size*/
if (mem_limit == PAGE_COUNTER_MAX)
mem_limit = totalram_pages() * PAGE_SIZE;
else
mem_limit = mem_limit * PAGE_SIZE;
mem_usage = (u64)mem_cgroup_usage(memcg, false) * PAGE_SIZE;
mem_swap = memcg->memsw.max;
if (mem_swap == PAGE_COUNTER_MAX)
mem_swap = total_swap_pages * PAGE_SIZE;
else
mem_swap = mem_swap * PAGE_SIZE;
mem_swap_usage = (u64)mem_cgroup_usage(memcg, true) * PAGE_SIZE - mem_usage;
if (!memcg->meminfo_recursive) {
mem_cache = memcg_page_state(memcg, NR_FILE_PAGES);
mem_swap_cache = memcg_page_state(memcg, MEMCG_SWAP);
mem_active = mem_cgroup_nr_lru_pages(memcg, BIT(LRU_ACTIVE_ANON)) +
mem_cgroup_nr_lru_pages(memcg, BIT(LRU_ACTIVE_FILE));
mem_inactive = mem_cgroup_nr_lru_pages(memcg, BIT(LRU_INACTIVE_ANON)) +
mem_cgroup_nr_lru_pages(memcg, BIT(LRU_INACTIVE_FILE));
mem_active_anon = mem_cgroup_nr_lru_pages(memcg, BIT(LRU_ACTIVE_ANON));
mem_inactive_anon = mem_cgroup_nr_lru_pages(memcg, BIT(LRU_INACTIVE_ANON));
mem_active_file = mem_cgroup_nr_lru_pages(memcg, BIT(LRU_ACTIVE_FILE));
mem_inactive_file = mem_cgroup_nr_lru_pages(memcg, BIT(LRU_INACTIVE_FILE));
mem_unevictable = mem_cgroup_nr_lru_pages(memcg, BIT(LRU_UNEVICTABLE));
mem_rss = memcg_page_state(memcg, NR_ANON_MAPPED);
mem_rss_huge = memcg_page_state(memcg, NR_ANON_THPS);
mem_file_map = memcg_page_state(memcg, NR_FILE_MAPPED);
mem_shmem = 0;
} else {
mem_cache = memcg_read_stat(memcg, NR_FILE_PAGES);
mem_swap_cache = memcg_read_stat(memcg, MEMCG_SWAP);
mem_active = memcg_nr_lru_pages(memcg, BIT(LRU_ACTIVE_ANON)) +
memcg_nr_lru_pages(memcg, BIT(LRU_ACTIVE_FILE));
mem_inactive = memcg_nr_lru_pages(memcg, BIT(LRU_INACTIVE_ANON)) +
memcg_nr_lru_pages(memcg, BIT(LRU_INACTIVE_FILE));
mem_active_anon = memcg_nr_lru_pages(memcg, BIT(LRU_ACTIVE_ANON));
mem_inactive_anon = memcg_nr_lru_pages(memcg, BIT(LRU_INACTIVE_ANON));
mem_active_file = memcg_nr_lru_pages(memcg, BIT(LRU_ACTIVE_FILE));
mem_inactive_file = memcg_nr_lru_pages(memcg, BIT(LRU_INACTIVE_FILE));
mem_unevictable = memcg_nr_lru_pages(memcg, BIT(LRU_UNEVICTABLE));
mem_rss = memcg_read_stat(memcg, NR_ANON_MAPPED);
mem_rss_huge = memcg_read_stat(memcg, NR_ANON_THPS);
mem_file_map = memcg_read_stat(memcg, NR_FILE_MAPPED);
mem_shmem = memcg_read_stat(memcg, NR_SHMEM);
}
/*
* Tagged format, for easy grepping and expansion.
*/
seq_printf(m,
"MemTotal: %8lu kB\n"
"MemFree: %8lu kB\n"
"Buffers: %8lu kB\n"
"Cached: %8lu kB\n"
"SwapCached: %8lu kB\n"
"Active: %8lu kB\n"
"Inactive: %8lu kB\n"
"Active(anon): %8lu kB\n"
"Inactive(anon): %8lu kB\n"
"Active(file): %8lu kB\n"
"Inactive(file): %8lu kB\n"
"Unevictable: %8lu kB\n"
"Mlocked: %8lu kB\n"
#ifdef CONFIG_HIGHMEM
"HighTotal: %8lu kB\n"
"HighFree: %8lu kB\n"
"LowTotal: %8lu kB\n"
"LowFree: %8lu kB\n"
#endif
#ifndef CONFIG_MMU
"MmapCopy: %8lu kB\n"
#endif
"SwapTotal: %8lu kB\n"
"SwapFree: %8lu kB\n"
"Dirty: %8lu kB\n"
"Writeback: %8lu kB\n"
"AnonPages: %8lu kB\n"
"Mapped: %8lu kB\n"
"Shmem: %8lu kB\n"
"Slab: %8lu kB\n"
"SReclaimable: %8lu kB\n"
"SUnreclaim: %8lu kB\n"
"KernelStack: %8lu kB\n"
"PageTables: %8lu kB\n"
#ifdef CONFIG_QUICKLIST
"Quicklists: %8lu kB\n"
#endif
"NFS_Unstable: %8lu kB\n"
"Bounce: %8lu kB\n"
"WritebackTmp: %8lu kB\n"
"CommitLimit: %8lu kB\n"
"Committed_AS: %8lu kB\n"
"VmallocTotal: %8lu kB\n"
"VmallocUsed: %8lu kB\n"
"VmallocChunk: %8lu kB\n"
#ifdef CONFIG_MEMORY_FAILURE
"HardwareCorrupted: %5lu kB\n"
#endif
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
"AnonHugePages: %8lu kB\n"
#endif
,
(unsigned long)(mem_limit / 1024),
(unsigned long)((mem_limit - mem_usage) / 1024),
(unsigned long)0,
K(mem_cache),
K(mem_swap_cache),
K(mem_active),//K(pages[LRU_ACTIVE_ANON] + pages[LRU_ACTIVE_FILE]),
K(mem_inactive),//K(pages[LRU_INACTIVE_ANON] + pages[LRU_INACTIVE_FILE]),
K(mem_active_anon),//K(pages[LRU_ACTIVE_ANON]),
K(mem_inactive_anon),//K(pages[LRU_INACTIVE_ANON]),
K(mem_active_file),//K(pages[LRU_ACTIVE_FILE]),
K(mem_inactive_file),//K(pages[LRU_INACTIVE_FILE]),
K(mem_unevictable),//K(pages[LRU_UNEVICTABLE]),
(unsigned long)0,//K(global_page_state(NR_MLOCK)),
#ifdef CONFIG_HIGHMEM
(unsigned long)0,//K(i.totalhigh),
(unsigned long)0,//K(i.freehigh),
(unsigned long)0,//K(i.totalram-i.totalhigh),
(unsigned long)0,//K(i.freeram-i.freehigh),
#endif
#ifndef CONFIG_MMU
(unsigned long)0,//K((unsigned long) atomic_long_read(&mmap_pages_allocated)),
#endif
(unsigned long)(mem_swap / 1024),
(unsigned long)((mem_swap - mem_swap_usage) / 1024),
(unsigned long)0,//K(global_page_state(NR_FILE_DIRTY)),
(unsigned long)0,//K(global_page_state(NR_WRITEBACK)),
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
K(mem_rss + mem_rss_huge),
//K(global_page_state(NR_ANON_PAGES)
// + global_page_state(NR_ANON_TRANSPARENT_HUGEPAGES) *
//HPAGE_PMD_NR),
#else
K(mem_rss), //K(global_page_state(NR_ANON_PAGES)),
#endif
K(mem_file_map),//K(global_page_state(NR_FILE_MAPPED)),
K(mem_shmem), //K(global_page_state(NR_SHMEM)),
(unsigned long)0, //K(global_page_state(NR_SLAB_RECLAIMABLE) +
//global_page_state(NR_SLAB_UNRECLAIMABLE)),
(unsigned long)0, //K(global_page_state(NR_SLAB_RECLAIMABLE)),
(unsigned long)0, //K(global_page_state(NR_SLAB_UNRECLAIMABLE)),
(unsigned long)0, //global_page_state(NR_KERNEL_STACK) * THREAD_SIZE / 1024,
(unsigned long)0, //K(global_page_state(NR_PAGETABLE)),
#ifdef CONFIG_QUICKLIST
(unsigned long)0, //K(quicklist_total_size()),
#endif
(unsigned long)0, //K(global_page_state(NR_UNSTABLE_NFS)),
(unsigned long)0, //K(global_page_state(NR_BOUNCE)),
(unsigned long)0, //K(global_page_state(NR_WRITEBACK_TEMP)),
(unsigned long)0, //K(vm_commit_limit()),
(unsigned long)0, //K(committed),
(unsigned long)0, //(unsigned long)VMALLOC_TOTAL >> 10,
(unsigned long)0, //vmi.used >> 10,
(unsigned long)0, //vmi.largest_chunk >> 10
#ifdef CONFIG_MEMORY_FAILURE
(unsigned long)0, //,atomic_long_read(&num_poisoned_pages) << (PAGE_SHIFT - 10)
#endif
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
K(mem_rss_huge)
#endif
);
//hugetlb_report_meminfo(m);
//arch_report_meminfo(m);
return 0;
}
/*
 * meminfo report for the calling task: look up the memory css returned by
 * cgroupfs_get_parent_role_cgroup() for the POD_GROUPS role and print its
 * memcg through mem_cgroup_meminfo_read_comm().
 *
 * The css is put once the report has been written (presumably the lookup
 * returns it with a reference held - confirm against its definition).
 */
int mem_cgroupfs_meminfo_show(struct seq_file *m, void *v)
{
	struct cgroup_subsys_state *pod_css;
	int err;

	pod_css = cgroupfs_get_parent_role_cgroup(current,
			CGROUPFS_CGROUP_ROLE_POD_GROUPS, memory_cgrp_id);

	err = mem_cgroup_meminfo_read_comm(m, v, mem_cgroup_from_css(pod_css));

	css_put(pod_css);

	return err;
}
/* seq_show handler: print the meminfo report of the memcg owning @m. */
static int mem_cgroup_meminfo_read(struct seq_file *m, void *v)
{
	struct cgroup_subsys_state *css = seq_css(m);

	return mem_cgroup_meminfo_read_comm(m, v, mem_cgroup_from_css(css));
}
#define NR_VM_WRITEBACK_STAT_ITEMS 2
extern const char * const vmstat_text[];
extern unsigned int vmstat_text_size;
static int mem_cgroup_vmstat_read_comm(struct seq_file *m, void *vv, struct mem_cgroup *memcg)
{
unsigned long *v1, *v;
int i, stat_items_size;
u64 mem_limit, mem_usage;
mem_limit = memcg->memory.max;
if (mem_limit == PAGE_COUNTER_MAX)
mem_limit = totalram_pages() *PAGE_SIZE;
else
mem_limit = mem_limit * PAGE_SIZE;
mem_usage = (u64)mem_cgroup_usage(memcg, false) * PAGE_SIZE;
stat_items_size = vmstat_text_size * sizeof(unsigned long);
#ifdef CONFIG_VM_EVENT_COUNTERS
stat_items_size += sizeof(struct vm_event_state);
#endif
v1 = v = kzalloc(stat_items_size, GFP_KERNEL);
if (!v)
return -ENOMEM;
v[NR_FREE_PAGES] = (mem_limit - mem_usage) >> PAGE_SHIFT;
v[NR_ZONE_INACTIVE_ANON] = mem_cgroup_nr_lru_pages(memcg, BIT(LRU_INACTIVE_ANON));
v[NR_ZONE_ACTIVE_ANON] = mem_cgroup_nr_lru_pages(memcg, BIT(LRU_ACTIVE_ANON));
v[NR_ZONE_INACTIVE_FILE] = mem_cgroup_nr_lru_pages(memcg, BIT(LRU_INACTIVE_FILE));
v[NR_ZONE_ACTIVE_FILE] = mem_cgroup_nr_lru_pages(memcg, BIT(LRU_ACTIVE_FILE));
v[NR_ZONE_UNEVICTABLE] = mem_cgroup_nr_lru_pages(memcg, BIT(LRU_UNEVICTABLE));
v[NR_MLOCK] = 0;
#if 0
v[NR_ANON_PAGES] = v[NR_INACTIVE_ANON] + v[NR_ACTIVE_ANON];
v[NR_FILE_MAPPED] = memcg_page_state(memcg, memcg1_stats[4]);
v[NR_FILE_PAGES] = memcg_page_state(memcg, memcg1_stats[0]);
v[NR_FILE_DIRTY] = 0;
v[NR_WRITEBACK] = 0;
v[NR_SLAB_RECLAIMABLE] = 0;
v[NR_SLAB_UNRECLAIMABLE] = 0;
v[NR_PAGETABLE] = 0;
v[NR_KERNEL_STACK] = 0;
v[NR_UNSTABLE_NFS] = 0;
#endif
v += NR_VM_ZONE_STAT_ITEMS;
#if IS_ENABLED(CONFIG_ZSMALLOC)
v += 1;
#endif
v += NR_VM_NUMA_STAT_ITEMS;
v += NR_VM_NODE_STAT_ITEMS;
v += NR_VM_WRITEBACK_STAT_ITEMS;
#ifdef CONFIG_VM_EVENT_COUNTERS
//all_vm_events(v);
v[PGPGIN] = memcg_events_local(memcg, memcg1_events[0]) * (PAGE_SIZE / 1024); /* sectors -> kbytes */
v[PGPGOUT] = memcg_events_local(memcg, memcg1_events[1]) * (PAGE_SIZE / 1024);
#endif
for ( i = 0; i < vmstat_text_size; i++ )
{
seq_printf(m, "%s %lu\n", vmstat_text[i], v1[i]);
}
kfree(v1);
return 0;
}
#ifdef CONFIG_CGROUP_WRITEBACK
/*
 * Bind the memcg to the io cgroup found at the path written by the user.
 *
 * Any existing binding is dropped first.  Writing an empty string only
 * drops the binding.  Returns @nbytes on success, -EPERM when
 * buff_wb_enabled() is false, -ENOMEM when the path copy cannot be
 * allocated, or the error from the path / css lookup.
 */
static ssize_t mem_cgroup_bind_blkio_write(struct kernfs_open_file *of,
					   char *buf, size_t nbytes, loff_t off)
{
	struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
	struct cgroup_subsys_state *io_css;
	struct path target;
	char *path_copy;
	ssize_t ret;

	if (!buff_wb_enabled())
		return -EPERM;

	buf = strstrip(buf);

	/* Allocate outside the mutex; zero-filled, so the copy stays terminated. */
	path_copy = kzalloc(PATH_MAX, GFP_KERNEL);
	if (!path_copy)
		return -ENOMEM;
	strncpy(path_copy, buf, PATH_MAX-1);

	mutex_lock(&memcg_max_mutex);

	/* Drop the previous binding, if there is one. */
	if (memcg->bind_blkio) {
		WARN_ON(!memcg->bind_blkio_path);
		kfree(memcg->bind_blkio_path);
		memcg->bind_blkio_path = NULL;
		css_put(memcg->bind_blkio);
		memcg->bind_blkio = NULL;

		wb_memcg_offline(memcg);
		INIT_LIST_HEAD(&memcg->cgwb_list);
	}

	/* An empty write means "unbind only". */
	if (!strnlen(buf, PATH_MAX)) {
		ret = nbytes;
		goto out_free;
	}

	ret = kern_path(path_copy, LOOKUP_FOLLOW, &target);
	if (ret)
		goto out_free;

	io_css = css_tryget_online_from_dir(target.dentry, &io_cgrp_subsys);
	path_put(&target);
	if (IS_ERR(io_css)) {
		ret = PTR_ERR(io_css);
		goto out_free;
	}

	/* The memcg now owns both the path copy and the css reference. */
	memcg->bind_blkio_path = path_copy;
	memcg->bind_blkio = io_css;
	mutex_unlock(&memcg_max_mutex);

	return nbytes;

out_free:
	kfree(path_copy);
	mutex_unlock(&memcg_max_mutex);

	return ret;
}
/* Print the bound io cgroup path, or nothing when the memcg is unbound. */
static int mem_cgroup_bind_blkio_show(struct seq_file *m, void *v)
{
	struct mem_cgroup *memcg = mem_cgroup_from_seq(m);

	if (!memcg->bind_blkio_path)
		return 0;

	seq_printf(m, "%s\n", memcg->bind_blkio_path);
	return 0;
}
#endif
/*
 * Any write triggers cgroup_sync() on the memcg; the written data itself
 * is ignored.  The root memcg is rejected with -EINVAL.
 */
static ssize_t mem_cgroup_sync_write(struct kernfs_open_file *of,
				     char *buf, size_t nbytes,
				     loff_t off)
{
	struct cgroup_subsys_state *css = of_css(of);
	struct mem_cgroup *target = mem_cgroup_from_css(css);

	if (mem_cgroup_is_root(target))
		return -EINVAL;

	cgroup_sync(target);

	return nbytes;
}
#ifdef CONFIG_MEM_QOS
/* Print the memcg's async_wmark value followed by a newline. */
static int memory_async_reclaim_wmark_show(struct seq_file *m, void *v)
{
	struct mem_cgroup *owner;

	owner = mem_cgroup_from_seq(m);
	seq_printf(m, "%d\n", READ_ONCE(owner->async_wmark));

	return 0;
}
/*
 * Set the memcg's async_wmark value.
 *
 * The value is parsed as an integer and must lie in [0, 100].  After it is
 * stored, the derived settings are recomputed by setup_async_wmark() and
 * the async reclaim work is queued if need_memcg_async_reclaim() says so.
 *
 * Returns @nbytes on success, -EINVAL for an out-of-range value, or the
 * error from kstrtoint() for unparsable input.
 */
static ssize_t memory_async_reclaim_wmark_write(struct kernfs_open_file *of,
						char *buf, size_t nbytes, loff_t off)
{
	struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
	int ret, wmark;

	/* strstrip() returns a pointer into @buf, so no NULL check is needed. */
	buf = strstrip(buf);

	ret = kstrtoint(buf, 0, &wmark);
	if (ret)
		return ret;

	/* kstrtoint() accepts a leading '-', so both bounds must be checked. */
	if (wmark < 0 || wmark > 100)
		return -EINVAL;

	xchg(&memcg->async_wmark, wmark);

	setup_async_wmark(memcg);

	if (need_memcg_async_reclaim(memcg))
		queue_work(memcg_async_reclaim_wq, &memcg->async_work);

	return nbytes;
}
/* Print the memcg's async_distance_factor value followed by a newline. */
static int memory_async_distance_factor_show(struct seq_file *m, void *v)
{
	struct mem_cgroup *owner;

	owner = mem_cgroup_from_seq(m);
	seq_printf(m, "%d\n", READ_ONCE(owner->async_distance_factor));

	return 0;
}
/*
 * Set the memcg's async_distance_factor.
 *
 * The value is parsed as an integer and must lie in [1, 150000]; it is
 * then stored and setup_async_wmark() is run to pick it up.
 *
 * Returns @nbytes on success, -EINVAL for an out-of-range value, or the
 * error from kstrtoint() for unparsable input.
 */
static ssize_t memory_async_distance_factor_write(struct kernfs_open_file *of,
						  char *buf, size_t nbytes, loff_t off)
{
	struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
	int factor;
	int err;

	buf = strstrip(buf);
	if (!buf)
		return -EINVAL;

	err = kstrtoint(buf, 0, &factor);
	if (err)
		return err;

	if (factor < 1 || factor > 150000)
		return -EINVAL;

	xchg(&memcg->async_distance_factor, factor);
	setup_async_wmark(memcg);

	return nbytes;
}
#endif
#ifdef CONFIG_CGROUP_SLI
/* Hand the memcg's cgroup to sli_memlat_max_show() and return its result. */
static int mem_cgroup_sli_max_show(struct seq_file *m, void *v)
{
	struct mem_cgroup *memcg = mem_cgroup_from_seq(m);

	return sli_memlat_max_show(m, memcg->css.cgroup);
}
/* Hand the memcg's cgroup to sli_memlat_stat_show() and return its result. */
static int mem_cgroup_sli_show(struct seq_file *m, void *v)
{
	struct mem_cgroup *memcg = mem_cgroup_from_seq(m);

	return sli_memlat_stat_show(m, memcg->css.cgroup);
}
#endif
extern unsigned int vm_memcg_latency_histogram;
extern unsigned int vm_memcg_page_cache_hit;
#ifdef CONFIG_MEM_QOS
/*
 * Print the memcg's latency histogram, one line per slot.
 *
 * Each slot is the sum of its per-CPU counters over all possible CPUs.
 * Slot 0 is labelled [0, 1] ns and slot i > 0 is labelled
 * [2^(i-1), 2^i] ns.  Reading is destructive: every per-CPU counter is
 * reset to 0 after it has been added to the sum.
 *
 * When vm.memory_qos or vm.memcg_latency_histogram is off, a one-line
 * notice is printed instead.  Always returns 0.
 *
 * NOTE(review): the per-CPU pointers are dereferenced without a NULL
 * check; presumably they are always allocated - confirm at the allocation
 * site.
 */
static int mem_cgroup_lat_seq_show(struct seq_file *m, void *v)
{
	struct mem_cgroup *memcg = mem_cgroup_from_seq(m);
	u64 sum_lat;
	int i, cpu;

	if (!sysctl_vm_memory_qos) {
		seq_puts(m, "vm.memory_qos is not enabled.\n");
		return 0;
	}

	if (!vm_memcg_latency_histogram) {
		seq_puts(m, "vm.memcg_latency_histogram is not enabled.\n");
		return 0;
	}

	/* vm.memory_qos was checked above; no need to test it again. */
	for (i = 0; i < MEM_LATENCY_MAX_SLOTS; i++) {
		sum_lat = 0;
		for_each_possible_cpu(cpu) {
			sum_lat += *per_cpu_ptr(memcg->latency_histogram[i], cpu);
			*per_cpu_ptr(memcg->latency_histogram[i], cpu) = 0;
		}

		if (i == 0)
			seq_printf(m, "[%-20llu, %-20llu]ns : %llu.\n",
				   (u64)0, (u64)1, sum_lat);
		else
			seq_printf(m, "[%-20llu, %-20llu]ns : %llu.\n",
				   (u64)1 << (i - 1), (u64)1 << i, sum_lat);
	}

	return 0;
}
/*
 * Print page cache hit statistics for the memcg.
 *
 * The four per-CPU counters (mpa, mbd, apcl, apd) are summed over all
 * possible CPUs and reset to 0 while doing so.  From the sums:
 *   total  = mpa - mbd, not below 0
 *   misses = apcl - apd, not below 0 and not above total
 *   hits   = total - misses
 *
 * When vm.memory_qos or vm.memcg_page_cache_hit is off, a one-line notice
 * is printed instead; when any counter is not allocated nothing is
 * printed.  Always returns 0.
 */
static int mem_cgroup_page_cache_hit_show(struct seq_file *m, void *v)
{
	struct mem_cgroup *memcg = mem_cgroup_from_seq(m);
	long long sum_mpa = 0, sum_mbd = 0, sum_apcl = 0, sum_apd = 0;
	long long total, hits, misses;
	int cpu;

	if (!sysctl_vm_memory_qos) {
		seq_printf(m, "vm.memory_qos is not enabled.\n");
		return 0;
	}

	if (!vm_memcg_page_cache_hit) {
		seq_printf(m, "vm.memcg_page_cache_hit is not enabled.\n");
		return 0;
	}

	if (!(memcg->mpa && memcg->mbd && memcg->apcl && memcg->apd))
		return 0;

	/* Collect and clear every CPU's contribution. */
	for_each_possible_cpu(cpu) {
		sum_mpa += (long long)*per_cpu_ptr(memcg->mpa, cpu);
		*per_cpu_ptr(memcg->mpa, cpu) = 0;

		sum_mbd += (long long)*per_cpu_ptr(memcg->mbd, cpu);
		*per_cpu_ptr(memcg->mbd, cpu) = 0;

		sum_apcl += (long long)*per_cpu_ptr(memcg->apcl, cpu);
		*per_cpu_ptr(memcg->apcl, cpu) = 0;

		sum_apd += (long long)*per_cpu_ptr(memcg->apd, cpu);
		*per_cpu_ptr(memcg->apd, cpu) = 0;
	}

	total = sum_mpa - sum_mbd;
	if (total < 0)
		total = 0;

	misses = sum_apcl - sum_apd;
	if (misses < 0)
		misses = 0;

	/* There cannot be more misses than accesses. */
	if (misses > total)
		misses = total;
	hits = total - misses;

	seq_printf(m, "total: %llu, hits: %llu, misses: %llu.\n", total, hits, misses);

	return 0;
}
#endif
static int memory_oom_group_show(struct seq_file *m, void *v);
static ssize_t memory_oom_group_write(struct kernfs_open_file *of,
char *buf, size_t nbytes, loff_t off);
/*
 * vmstat report for the calling task: look up the memory css returned by
 * cgroupfs_get_parent_role_cgroup() for the POD_GROUPS role and print its
 * memcg through mem_cgroup_vmstat_read_comm().
 *
 * The css is put once the report has been written (presumably the lookup
 * returns it with a reference held - confirm against its definition).
 */
int mem_cgroupfs_vmstat_show(struct seq_file *m, void *v)
{
	struct cgroup_subsys_state *pod_css;
	int err;

	pod_css = cgroupfs_get_parent_role_cgroup(current,
			CGROUPFS_CGROUP_ROLE_POD_GROUPS, memory_cgrp_id);

	err = mem_cgroup_vmstat_read_comm(m, v, mem_cgroup_from_css(pod_css));

	css_put(pod_css);

	return err;
}
/* seq_show handler: print the vmstat report of the memcg owning @m. */
static int mem_cgroup_vmstat_read(struct seq_file *m, void *vv)
{
	struct cgroup_subsys_state *css = seq_css(m);

	return mem_cgroup_vmstat_read_comm(m, vv, mem_cgroup_from_css(css));
}
#ifdef CONFIG_EMM_MEMORY_RECLAIM
#ifdef CONFIG_LRU_GEN
extern int memcg_lru_gen_emm_run(struct mem_cgroup *memcg, int mode,
unsigned long nr_to_reclaim, unsigned long swappiness);
#else
/*
 * Stand-in used when CONFIG_LRU_GEN is disabled: there is no multi-gen
 * LRU to drive, so every request is refused.
 *
 * Returns -EOPNOTSUPP.  The caller forwards this value to userspace as the
 * result of write(2), so the kernel-internal ENOTSUPP must not be used
 * here.
 */
static inline int memcg_lru_gen_emm_run(struct mem_cgroup *memcg, int mode,
					unsigned long nr_to_reclaim, unsigned long swappiness)
{
	return -EOPNOTSUPP;
}
#endif
extern int memcg_lru_emm_run(struct mem_cgroup *memcg, int mode,
unsigned long nr_to_reclaim, unsigned long swappiness);
/*
 * Write handler shared by the emm.age and emm.reclaim files.
 *
 * Input format: "<size> <swappiness>".  <size> is parsed by
 * page_counter_memparse() (which is told that "max" is a valid keyword)
 * and <swappiness> must be a number no larger than 201.  The operation,
 * EMM_AGE or EMM_RECLAIM, comes from the file's cftype private value, and
 * the request is dispatched to the multi-gen or the classic LRU
 * implementation depending on lru_gen_enabled().
 *
 * Returns @nbytes on success, -EINVAL when vm_emm is off or the input is
 * malformed, or the error reported by the parser / the reclaim run.
 */
static ssize_t mem_cgroup_emm_run(struct kernfs_open_file *of, char *buf,
				  size_t nbytes, loff_t off)
{
	struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
	unsigned long nr_pages;
	unsigned long swappiness;
	char *nr_reclaim_buf, *end;
	int mode;
	int err;

	if (!vm_emm) {
		printk(KERN_WARNING "you should open vm.wujing_enable.\n");
		return -EINVAL;
	}

	buf = strstrip(buf);
	nr_reclaim_buf = strsep(&buf, " ");
	/* strsep() leaves @buf NULL when there was no separator. */
	if (!nr_reclaim_buf || !buf)
		return -EINVAL;

	/*
	 * Keep the parsed value in an unsigned long: narrowing it to int
	 * would let large inputs wrap to a negative number, slip past the
	 * range check and reach the callee as a huge swappiness.
	 */
	swappiness = simple_strtoul(buf, &end, 0);
	if (end == buf || *end != '\0' || swappiness > 201)
		return -EINVAL;

	err = page_counter_memparse(nr_reclaim_buf, "max", &nr_pages);
	if (err)
		return err;

	mode = of_cft(of)->private;
	if (mode != EMM_AGE && mode != EMM_RECLAIM)
		return -EINVAL;

	if (lru_gen_enabled())
		err = memcg_lru_gen_emm_run(memcg, mode, nr_pages, swappiness);
	else
		err = memcg_lru_emm_run(memcg, mode, nr_pages, swappiness);

	if (err < 0)
		return err;

	return nbytes;
}
#ifdef CONFIG_LRU_GEN
extern int memcg_lru_gen_emm_show(struct seq_file *m, void *v);
#endif
#endif
static ssize_t memory_reclaim(struct kernfs_open_file *of, char *buf,
size_t nbytes, loff_t off);
static u64 memory_current_read(struct cgroup_subsys_state *css,
struct cftype *cft);
static int memory_low_show(struct seq_file *m, void *v);
static ssize_t memory_low_write(struct kernfs_open_file *of,
char *buf, size_t nbytes, loff_t off);
static int memory_high_show(struct seq_file *m, void *v);
static ssize_t memory_high_write(struct kernfs_open_file *of,
char *buf, size_t nbytes, loff_t off);
static ssize_t memory_high_write_cgv1(struct kernfs_open_file *of,
char *buf, size_t nbytes, loff_t off);
static int memory_max_show(struct seq_file *m, void *v);
static ssize_t memory_max_write(struct kernfs_open_file *of,
char *buf, size_t nbytes, loff_t off);
static int memory_events_show(struct seq_file *m, void *v);
#ifdef CONFIG_XFORK
static u64 mem_cgroup_xfork_read(struct cgroup_subsys_state *css,
struct cftype *cft)
{
struct mem_cgroup *memcg = mem_cgroup_from_css(css);
return memcg->kabi_reserved1;
}
/*
 * Update the allow_xfork setting, which is kept in memcg->kabi_reserved1.
 *
 * Only 0 and 1 are accepted.  With CONFIG_ASYNC_FORK the write is refused
 * while async_fork is set on the memcg.  Returns 0 on success and -EINVAL
 * otherwise.
 */
static int mem_cgroup_xfork_write(struct cgroup_subsys_state *css,
				  struct cftype *cft, u64 val)
{
	struct mem_cgroup *memcg = mem_cgroup_from_css(css);

	if (val > 1)
		return -EINVAL;

#ifdef CONFIG_ASYNC_FORK
	if (memcg->async_fork)
		return -EINVAL;
#endif

	/* Skip the store when nothing changes. */
	if (memcg->kabi_reserved1 != val)
		memcg->kabi_reserved1 = val;

	return 0;
}
#endif
/*
 * Control files exposed by the memory controller through this table
 * (going by its name, the legacy hierarchy - the registration call is not
 * in this part of the file).
 *
 * Each entry gives the file name and the handlers that serve it; .private
 * carries a handler-specific selector such as MEMFILE_PRIVATE(type, attr)
 * or an EMM_* mode.  Groups of entries are compiled in only with their
 * config option.  The table ends with an empty entry.
 */
static struct cftype mem_cgroup_legacy_files[] = {
#ifdef CONFIG_MEM_QOS
{
.name = "latency_histogram",
.seq_show = mem_cgroup_lat_seq_show,
},
{
.name = "page_cache_hit",
.seq_show = mem_cgroup_page_cache_hit_show,
},
#endif
{
.name = "usage_in_bytes",
.private = MEMFILE_PRIVATE(_MEM, RES_USAGE),
.read_u64 = mem_cgroup_read_u64,
},
{
.name = "max_usage_in_bytes",
.private = MEMFILE_PRIVATE(_MEM, RES_MAX_USAGE),
.write = mem_cgroup_reset,
.read_u64 = mem_cgroup_read_u64,
},
{
.name = "limit_in_bytes",
.private = MEMFILE_PRIVATE(_MEM, RES_LIMIT),
.write = mem_cgroup_write,
.read_u64 = mem_cgroup_read_u64,
},
{
.name = "soft_limit_in_bytes",
.private = MEMFILE_PRIVATE(_MEM, RES_SOFT_LIMIT),
.write = mem_cgroup_write,
.read_u64 = mem_cgroup_read_u64,
},
{
.name = "failcnt",
.private = MEMFILE_PRIVATE(_MEM, RES_FAILCNT),
.write = mem_cgroup_reset,
.read_u64 = mem_cgroup_read_u64,
},
{
.name = "stat",
.seq_show = memcg_stat_show,
},
{
.name = "force_empty",
.write = mem_cgroup_force_empty_write,
},
{
.name = "use_hierarchy",
.write_u64 = mem_cgroup_hierarchy_write,
.read_u64 = mem_cgroup_hierarchy_read,
},
#ifdef CONFIG_MEM_QOS
{
.name = "pagecache.reclaim_ratio",
.flags = CFTYPE_NOT_ON_ROOT,
.read_u64 = pagecache_reclaim_ratio_read,
.write = pagecache_reclaim_ratio_write,
},
{
.name = "pagecache.max_ratio",
.flags = CFTYPE_NOT_ON_ROOT,
.read_u64 = memory_pagecache_max_read,
.write = memory_pagecache_max_write,
},
{
.name = "pagecache.current",
.flags = CFTYPE_NOT_ON_ROOT,
.read_u64 = pagecache_current_read,
},
{
.name = "use_priority_oom",
.write_u64 = mem_cgroup_priority_oom_write,
.read_u64 = mem_cgroup_priority_oom_read,
},
{
.name = "oom.group",
.flags = CFTYPE_NOT_ON_ROOT | CFTYPE_NS_DELEGATABLE,
.seq_show = memory_oom_group_show,
.write = memory_oom_group_write,
},
{
.name = "priority_wmark_ratio",
.flags = CFTYPE_NOT_ON_ROOT,
.write = memcg_wmark_ratio_write,
.seq_show = memcg_wmark_ratio_show,
},
{
.name = "async_ratio",
.flags = CFTYPE_NOT_ON_ROOT,
.seq_show = memory_async_reclaim_wmark_show,
.write = memory_async_reclaim_wmark_write,
},
{
.name = "async_high",
.flags = CFTYPE_NOT_ON_ROOT,
.private = MEMFILE_PRIVATE(_MEM, ASYNC_HIGH_LIMIT),
.read_u64 = mem_cgroup_read_u64,
},
{
.name = "async_low",
.flags = CFTYPE_NOT_ON_ROOT,
.private = MEMFILE_PRIVATE(_MEM, ASYNC_LOW_LIMIT),
.read_u64 = mem_cgroup_read_u64,
},
{
.name = "async_distance_factor",
.flags = CFTYPE_NOT_ON_ROOT,
.seq_show = memory_async_distance_factor_show,
.write = memory_async_distance_factor_write,
},
#endif
#ifdef CONFIG_MEM_SPEED_THROTTLE
{
.name = "alloc_bps",
.flags = CFTYPE_NOT_ON_ROOT,
.read_u64 = mem_cgroup_mem_spd_lmt_read,
.write_u64 = mem_cgroup_mem_spd_lmt_write,
},
#endif
{
.name = "cgroup.event_control", /* XXX: for compat */
.write = memcg_write_event_control,
.flags = CFTYPE_NO_PREFIX | CFTYPE_WORLD_WRITABLE,
},
{
.name = "swappiness",
.read_u64 = mem_cgroup_swappiness_read,
.write_u64 = mem_cgroup_swappiness_write,
},
{
.name = "swappiness_traverse",
.read_u64 = mem_cgroup_swappiness_read,
.write_u64 = mem_cgroup_swappiness_traverse_write,
},
{
.name = "move_charge_at_immigrate",
.read_u64 = mem_cgroup_move_charge_read,
.write_u64 = mem_cgroup_move_charge_write,
},
{
.name = "oom_control",
.seq_show = mem_cgroup_oom_control_read,
.write_u64 = mem_cgroup_oom_control_write,
.private = MEMFILE_PRIVATE(_OOM_TYPE, OOM_CONTROL),
},
/*
 * No handlers of its own: memcg_write_event_control() recognises this file
 * by name ("memory.pressure_level") when an event is registered on it.
 */
{
.name = "pressure_level",
},
#ifdef CONFIG_NUMA
{
.name = "numa_stat",
.seq_show = memcg_numa_stat_show,
},
#endif
{
.name = "meminfo",
.seq_show = mem_cgroup_meminfo_read,
},
{
.name = "meminfo_recursive",
.write_u64 = memcg_meminfo_recursive_write,
.read_u64 = memcg_meminfo_recursive_read,
},
{
.name = "vmstat",
.seq_show = mem_cgroup_vmstat_read,
},
{
.name = "kmem.limit_in_bytes",
.private = MEMFILE_PRIVATE(_KMEM, RES_LIMIT),
.write = mem_cgroup_write,
.read_u64 = mem_cgroup_read_u64,
},
{
.name = "kmem.usage_in_bytes",
.private = MEMFILE_PRIVATE(_KMEM, RES_USAGE),
.read_u64 = mem_cgroup_read_u64,
},
{
.name = "kmem.failcnt",
.private = MEMFILE_PRIVATE(_KMEM, RES_FAILCNT),
.write = mem_cgroup_reset,
.read_u64 = mem_cgroup_read_u64,
},
{
.name = "kmem.max_usage_in_bytes",
.private = MEMFILE_PRIVATE(_KMEM, RES_MAX_USAGE),
.write = mem_cgroup_reset,
.read_u64 = mem_cgroup_read_u64,
},
#if defined(CONFIG_MEMCG_KMEM) && \
(defined(CONFIG_SLAB) || defined(CONFIG_SLUB_DEBUG))
{
.name = "kmem.slabinfo",
.seq_show = memcg_slab_show,
},
#endif
{
.name = "kmem.tcp.limit_in_bytes",
.private = MEMFILE_PRIVATE(_TCP, RES_LIMIT),
.write = mem_cgroup_write,
.read_u64 = mem_cgroup_read_u64,
},
{
.name = "kmem.tcp.usage_in_bytes",
.private = MEMFILE_PRIVATE(_TCP, RES_USAGE),
.read_u64 = mem_cgroup_read_u64,
},
{
.name = "kmem.tcp.failcnt",
.private = MEMFILE_PRIVATE(_TCP, RES_FAILCNT),
.write = mem_cgroup_reset,
.read_u64 = mem_cgroup_read_u64,
},
{
.name = "kmem.tcp.max_usage_in_bytes",
.private = MEMFILE_PRIVATE(_TCP, RES_MAX_USAGE),
.write = mem_cgroup_reset,
.read_u64 = mem_cgroup_read_u64,
},
#ifdef CONFIG_CGROUP_WRITEBACK
{
.name = "bind_blkio",
.flags = CFTYPE_NOT_ON_ROOT,
.write = mem_cgroup_bind_blkio_write,
.seq_show = mem_cgroup_bind_blkio_show,
},
#endif
{
.name = "sync",
.flags = CFTYPE_NOT_ON_ROOT,
.write = mem_cgroup_sync_write,
},
#ifdef CONFIG_PSI
{
.name = "memory.pressure",
.flags = CFTYPE_NO_PREFIX,
.seq_show = cgroup_memory_pressure_show,
.write = cgroup_memory_pressure_write,
.poll = cgroup_pressure_poll,
.release = cgroup_pressure_release,
},
#endif
#ifdef CONFIG_CGROUP_SLI
{
.name = "sli",
.flags = CFTYPE_NOT_ON_ROOT,
.seq_show = mem_cgroup_sli_show,
},
{
.name = "sli_max",
.flags = CFTYPE_NOT_ON_ROOT,
.seq_show = mem_cgroup_sli_max_show,
},
#endif
#ifdef CONFIG_EMM_MEMORY_RECLAIM
/* Both EMM files share one write handler; .private selects the mode. */
{
.name = "emm.age",
.private = EMM_AGE,
.flags = CFTYPE_NS_DELEGATABLE,
.write = mem_cgroup_emm_run,
},
{
.name = "emm.reclaim",
.private = EMM_RECLAIM,
.flags = CFTYPE_NS_DELEGATABLE,
.write = mem_cgroup_emm_run,
},
#ifdef CONFIG_LRU_GEN
{
.name = "emm.lru_gen",
.seq_show = memcg_lru_gen_emm_show,
},
#endif
#endif
{
.name = "reclaim",
.flags = CFTYPE_NS_DELEGATABLE,
.write = memory_reclaim,
},
{
.name = "current",
.read_u64 = memory_current_read,
},
{
.name = "low",
.flags = CFTYPE_NOT_ON_ROOT,
.seq_show = memory_low_show,
.write = memory_low_write,
},
{
.name = "high",
.flags = CFTYPE_NOT_ON_ROOT,
.seq_show = memory_high_show,
.write = memory_high_write_cgv1,
},
{
.name = "max",
.flags = CFTYPE_NOT_ON_ROOT,
.seq_show = memory_max_show,
.write = memory_max_write,
},
{
.name = "events",
.flags = CFTYPE_NOT_ON_ROOT,
.seq_show = memory_events_show,
},
#ifdef CONFIG_ASYNC_FORK
{
.name = "async_fork",
.read_u64 = mem_cgroup_async_fork_read,
.write_u64 = mem_cgroup_async_fork_write,
},
#endif
#ifdef CONFIG_XFORK
{
.name = "allow_xfork",
.read_u64 = mem_cgroup_xfork_read,
.write_u64 = mem_cgroup_xfork_write,
},
#endif
{ }, /* terminate */
};
/*
* Private memory cgroup IDR
*
* Swap-out records and page cache shadow entries need to store memcg
* references in constrained space, so we maintain an ID space that is
* limited to 16 bit (MEM_CGROUP_ID_MAX), limiting the total number of
* memory-controlled cgroups to 64k.
*
* However, there usually are many references to the offline CSS after
* the cgroup has been destroyed, such as page cache or reclaimable
* slab objects, that don't need to hang on to the ID. We want to keep
* those dead CSS from occupying IDs, or we might quickly exhaust the
* relatively small ID space and prevent the creation of new cgroups
* even when there are much fewer than 64k cgroups - possibly none.
*
* Maintain a private 16-bit ID space for memcg, and allow the ID to
* be freed and recycled when it's no longer needed, which is usually
* when the CSS is offlined.
*
* The only exception to that are records of swapped out tmpfs/shmem
* pages that need to be attributed to live ancestors on swapin. But
* those references are manageable from userspace.
*/
/* IDR backing the private memcg ID space; looked up by mem_cgroup_from_id(). */
static DEFINE_IDR(mem_cgroup_idr);
/*
 * Give @memcg's private ID back to the IDR, if it still holds one.
 * The ID field is zeroed afterwards, so calling this again does nothing.
 */
static void mem_cgroup_id_remove(struct mem_cgroup *memcg)
{
	if (memcg->id.id <= 0)
		return;

	idr_remove(&mem_cgroup_idr, memcg->id.id);
	memcg->id.id = 0;
}
/* Take @n additional references on @memcg's private ID. */
static void mem_cgroup_id_get_many(struct mem_cgroup *memcg, unsigned int n)
{
	refcount_add(n, &memcg->id.ref);
}
/*
 * Drop @n references on @memcg's private ID.  When the count reaches
 * zero the ID is released and the CSS reference held on behalf of the
 * ID is dropped too.
 */
static void mem_cgroup_id_put_many(struct mem_cgroup *memcg, unsigned int n)
{
	if (!refcount_sub_and_test(n, &memcg->id.ref))
		return;

	mem_cgroup_id_remove(memcg);

	/* Memcg ID pins CSS */
	css_put(&memcg->css);
}
/* Drop a single reference on @memcg's private ID. */
static inline void mem_cgroup_id_put(struct mem_cgroup *memcg)
{
	mem_cgroup_id_put_many(memcg, 1);
}
/**
 * mem_cgroup_from_id - look up a memcg from a memcg id
 * @id: the memcg id to look up
 *
 * Caller must hold rcu_read_lock(); a one-time warning is emitted if not.
 *
 * Return: the result of idr_find() on the private memcg IDR for @id.
 */
struct mem_cgroup *mem_cgroup_from_id(unsigned short id)
{
	struct mem_cgroup *memcg;

	WARN_ON_ONCE(!rcu_read_lock_held());

	memcg = idr_find(&mem_cgroup_idr, id);
	return memcg;
}
/*
 * Allocate and initialise the per-node state of @memcg for @node and
 * store it in memcg->nodeinfo[node].
 *
 * Returns 0 on success and 1 if any allocation fails; on failure nothing
 * allocated here is left behind.
 */
static int alloc_mem_cgroup_per_node_info(struct mem_cgroup *memcg, int node)
{
	struct mem_cgroup_per_node *pn;
	int nid;

	/*
	 * This routine is called against possible nodes.
	 * But it's BUG to call kmalloc() against offline node.
	 *
	 * TODO: this routine can waste much memory for nodes which will
	 *       never be onlined. It's better to use memory hotplug callback
	 *       function.
	 */
	nid = node_state(node, N_NORMAL_MEMORY) ? node : -1;

	pn = kzalloc_node(sizeof(*pn), GFP_KERNEL, nid);
	if (!pn)
		return 1;

	pn->lruvec_stat_local = alloc_percpu(struct lruvec_stat);
	if (!pn->lruvec_stat_local)
		goto free_pn;

	pn->lruvec_stat_cpu = alloc_percpu(struct lruvec_stat);
	if (!pn->lruvec_stat_cpu)
		goto free_local;

	lruvec_init(&pn->lruvec);
	pn->usage_in_excess = 0;
	pn->on_tree = false;
	pn->memcg = memcg;

	memcg->nodeinfo[node] = pn;
	return 0;

free_local:
	free_percpu(pn->lruvec_stat_local);
free_pn:
	kfree(pn);
	return 1;
}
/*
 * Free the per-node state of @memcg for @node, including both per-cpu
 * lruvec statistics areas.  A node without per-node state is skipped.
 */
static void free_mem_cgroup_per_node_info(struct mem_cgroup *memcg, int node)
{
	struct mem_cgroup_per_node *pn = memcg->nodeinfo[node];

	if (pn) {
		free_percpu(pn->lruvec_stat_cpu);
		free_percpu(pn->lruvec_stat_local);
		kfree(pn);
	}
}
static void __mem_cgroup_free(struct mem_cgroup *memcg)
{
int node;
#ifdef CONFIG_MEM_QOS
int i;
for (i = 0; i < MEM_LATENCY_MAX_SLOTS; i++)
free_percpu(memcg->latency_histogram[i]);
free_percpu(memcg->mpa);
free_percpu(memcg->mbd);
free_percpu(memcg->apcl);
free_percpu(memcg->apd);
#endif
for_each_node(node)
free_mem_cgroup_per_node_info(memcg, node);
free_percpu(memcg->vmstats_percpu);
free_percpu(memcg->vmstats_local);
kfree(memcg);
}
/*
 * Tear down a fully set-up memcg: leave the lru_gen and writeback-domain
 * state, flush the per-cpu statistics, then free the memory.
 */
static void mem_cgroup_free(struct mem_cgroup *memcg)
{
	lru_gen_exit_memcg(memcg);
	memcg_wb_domain_exit(memcg);

	/*
	 * The per-cpu vmstats and vmevents are flushed before the memory
	 * goes away so that the values seen on the parent and on every
	 * ancestor level stay correct.
	 */
	memcg_flush_percpu_vmstats(memcg);
	memcg_flush_percpu_vmevents(memcg);

	__mem_cgroup_free(memcg);
}
static struct mem_cgroup *mem_cgroup_alloc(void)
{
struct mem_cgroup *memcg;
unsigned int size;
int node;
int __maybe_unused i;
long error = -ENOMEM;
size = sizeof(struct mem_cgroup);
size += nr_node_ids * sizeof(struct mem_cgroup_per_node *);
memcg = kzalloc(size, GFP_KERNEL);
if (!memcg)
return ERR_PTR(error);
memcg->id.id = idr_alloc(&mem_cgroup_idr, NULL,
1, MEM_CGROUP_ID_MAX,
GFP_KERNEL);
if (memcg->id.id < 0) {
error = memcg->id.id;
goto fail;
}
memcg->vmstats_local = alloc_percpu(struct memcg_vmstats_percpu);
if (!memcg->vmstats_local)
goto fail;
memcg->vmstats_percpu = alloc_percpu(struct memcg_vmstats_percpu);
if (!memcg->vmstats_percpu)
goto fail;
for_each_node(node)
if (alloc_mem_cgroup_per_node_info(memcg, node))
goto fail;
if (memcg_wb_domain_init(memcg, GFP_KERNEL))
goto fail;
INIT_WORK(&memcg->high_work, high_work_func);
#ifdef CONFIG_MEM_QOS
INIT_WORK(&memcg->async_work, async_reclaim_func);
#endif
memcg->last_scanned_node = MAX_NUMNODES;
INIT_LIST_HEAD(&memcg->oom_notify);
mutex_init(&memcg->thresholds_lock);
spin_lock_init(&memcg->move_lock);
vmpressure_init(&memcg->vmpressure);
INIT_LIST_HEAD(&memcg->event_list);
spin_lock_init(&memcg->event_list_lock);
memcg->socket_pressure = jiffies;
#ifdef CONFIG_MEMCG_KMEM
memcg->kmemcg_id = -1;
INIT_LIST_HEAD(&memcg->objcg_list);
#endif
#ifdef CONFIG_CGROUP_WRITEBACK
INIT_LIST_HEAD(&memcg->cgwb_list);
for (i = 0; i < MEMCG_CGWB_FRN_CNT; i++)
memcg->cgwb_frn[i].done =
__WB_COMPLETION_INIT(&memcg_cgwb_frn_waitq);
#endif
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
spin_lock_init(&memcg->deferred_split_queue.split_queue_lock);
INIT_LIST_HEAD(&memcg->deferred_split_queue.split_queue);
memcg->deferred_split_queue.split_queue_len = 0;
#endif
idr_replace(&mem_cgroup_idr, memcg, memcg->id.id);
lru_gen_init_memcg(memcg);
return memcg;
fail:
mem_cgroup_id_remove(memcg);
__mem_cgroup_free(memcg);
return ERR_PTR(error);
}
/*
 * Allocate the memcg for a new cgroup whose parent CSS is @parent_css
 * (no parent memcg means this is the root memcg).
 *
 * Tunables are inherited from the parent, the page counters are linked
 * to the parent's (or to the root's when the parent does not use
 * hierarchy), and for non-root groups kmem accounting is brought online.
 *
 * Returns the new CSS, or an ERR_PTR() on failure.
 */
static struct cgroup_subsys_state * __ref
mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
{
	struct mem_cgroup *parent = mem_cgroup_from_css(parent_css);
	struct mem_cgroup *memcg;
	long error = -ENOMEM;
#ifdef CONFIG_MEM_QOS
	int index;
#endif

	memcg = mem_cgroup_alloc();
	if (IS_ERR(memcg))
		return ERR_CAST(memcg);

	WRITE_ONCE(memcg->high, PAGE_COUNTER_MAX);
	memcg->soft_limit = PAGE_COUNTER_MAX;
#ifdef CONFIG_MEM_QOS
	/*
	 * On any allocation failure below simply take the common error
	 * path: it ends in __mem_cgroup_free(), which releases every one
	 * of these per-cpu areas (free_percpu() ignores NULL).  Freeing
	 * them here as well would release the same area twice.
	 */
	memcg->mpa = alloc_percpu(u64);
	if (!memcg->mpa)
		goto fail;

	memcg->mbd = alloc_percpu(u64);
	if (!memcg->mbd)
		goto fail;

	memcg->apcl = alloc_percpu(u64);
	if (!memcg->apcl)
		goto fail;

	memcg->apd = alloc_percpu(u64);
	if (!memcg->apd)
		goto fail;

	for (index = 0; index < MEM_LATENCY_MAX_SLOTS; index++) {
		memcg->latency_histogram[index] = alloc_percpu(u64);
		if (!memcg->latency_histogram[index])
			goto fail;
	}

	memcg->max_exit_latency = 0;
	memcg->pagecache_reclaim_ratio = DEFAULT_PAGE_RECLAIM_RATIO;
	memcg->pagecache_max_ratio = PAGECACHE_MAX_RATIO_MAX;
#endif
#if defined(CONFIG_NEED_MEMCG_ZRAM)
	memcg->zram_max = PAGE_COUNTER_MAX;
#endif
	/* Inherit the tunables from the parent. */
	if (parent) {
		memcg->swappiness = mem_cgroup_swappiness(parent);
		memcg->oom_kill_disable = parent->oom_kill_disable;
#ifdef CONFIG_MEM_QOS
		memcg->async_wmark = parent->async_wmark;
		memcg->async_distance_factor = parent->async_distance_factor ?
			: ASYNC_DISTANCE_DEF;
		memcg->async_wmark_delta = parent->async_wmark_delta;
		memcg->async_distance_delta = parent->async_distance_delta ?
			: ASYNC_DISTANCE_DEF;
		memcg->wmark_ratio = parent->wmark_ratio;
#endif
#ifdef CONFIG_NEED_MEMCG_ZRAM
		memcg->zram_prio = parent->zram_prio;
#endif
#ifdef CONFIG_XFORK
		memcg->kabi_reserved1 = parent->kabi_reserved1;
#endif
#ifdef CONFIG_ASYNC_FORK
		memcg->async_fork = parent->async_fork;
#endif
	}

	if (!parent) {
		/*
		 * NOTE(review): the root's pagecache counter is not passed
		 * through page_counter_init() here, unlike in the two
		 * branches below -- confirm this is intended.
		 */
		page_counter_init(&memcg->memory, NULL);
		page_counter_init(&memcg->swap, NULL);
		page_counter_init(&memcg->kmem, NULL);
		page_counter_init(&memcg->tcpmem, NULL);
	} else if (parent->use_hierarchy) {
		memcg->use_hierarchy = true;
		page_counter_init(&memcg->memory, &parent->memory);
		page_counter_init(&memcg->swap, &parent->swap);
		page_counter_init(&memcg->kmem, &parent->kmem);
		page_counter_init(&memcg->tcpmem, &parent->tcpmem);
#ifdef CONFIG_MEM_QOS
		page_counter_init(&memcg->pagecache, &parent->pagecache);
#endif
	} else {
		page_counter_init(&memcg->memory, &root_mem_cgroup->memory);
		page_counter_init(&memcg->swap, &root_mem_cgroup->swap);
		page_counter_init(&memcg->kmem, &root_mem_cgroup->kmem);
		page_counter_init(&memcg->tcpmem, &root_mem_cgroup->tcpmem);
#ifdef CONFIG_MEM_QOS
		page_counter_init(&memcg->pagecache, &root_mem_cgroup->pagecache);
#endif
		/*
		 * Deeper hierarchy with use_hierarchy == false doesn't make
		 * much sense so let cgroup subsystem know about this
		 * unfortunate state in our controller.
		 */
		if (parent != root_mem_cgroup)
			memory_cgrp_subsys.broken_hierarchy = true;
	}
#ifdef CONFIG_MEM_QOS
	setup_async_wmark(memcg);
#endif

	/* The following stuff does not apply to the root */
	if (!parent) {
#ifdef CONFIG_MEM_QOS
		memcg->async_wmark_delta = -1;
		memcg->wmark_ratio = 0;
#endif
		root_mem_cgroup = memcg;
		return &memcg->css;
	}

	error = memcg_online_kmem(memcg);
	if (error)
		goto fail;

	if (cgroup_subsys_on_dfl(memory_cgrp_subsys) && !cgroup_memory_nosocket)
		static_branch_inc(&memcg_sockets_enabled_key);
#ifdef CONFIG_MEM_QOS
	INIT_LIST_HEAD(&memcg->prio_list);
	INIT_LIST_HEAD(&memcg->prio_list_async);
#endif
	return &memcg->css;

fail:
	mem_cgroup_id_remove(memcg);
	mem_cgroup_free(memcg);
	return ERR_PTR(error);
}
/*
 * Bring the memcg behind @css online.
 *
 * Returns 0 on success, or -ENOMEM when the shrinker maps cannot be
 * allocated; the memcg ID is released in that case.
 */
static int mem_cgroup_css_online(struct cgroup_subsys_state *css)
{
	struct mem_cgroup *memcg = mem_cgroup_from_css(css);

	/*
	 * A memcg must be visible for memcg_expand_shrinker_maps()
	 * by the time the maps are allocated. So, we allocate maps
	 * here, when for_each_mem_cgroup() can't skip it.
	 */
	if (memcg_alloc_shrinker_maps(memcg)) {
		mem_cgroup_id_remove(memcg);
		return -ENOMEM;
	}

	/* Online state pins memcg ID, memcg ID pins CSS */
	refcount_set(&memcg->id.ref, 1);
	css_get(css);

	mem_cgroup_mst_has_lmt_init(memcg);
#ifdef CONFIG_MEM_QOS
	async_reclaim_reset_factor(memcg, memcg_get_prio(memcg));
	memcg_notify_prio_change(memcg, 0, memcg_get_prio(memcg));
#endif

	return 0;
}
#ifdef CONFIG_MEM_QOS
/*
* Number of memcgs counted by charge_dying_memcgs() since it last reset
* the counter and woke the dying-memcg cleaner.
*/
atomic_long_t dying_memcgs_count;
/* Wake the sleepers on kclean_dying_memcg_wq, if there are any. */
void wakeup_kclean_dying_memcg(void)
{
	if (waitqueue_active(&kclean_dying_memcg_wq))
		wake_up_interruptible(&kclean_dying_memcg_wq);
}
/*
 * Account @memcg as a dying memcg.
 *
 * Does nothing unless memory QoS, asynchronous cleaning and a non-zero
 * threshold are all configured.  Once the running count has reached the
 * threshold it is reset and the cleaner is woken; the memcg's offline
 * time is then stamped and the count incremented.
 */
void charge_dying_memcgs(struct mem_cgroup *memcg)
{
	if (sysctl_vm_memory_qos == 0 ||
	    sysctl_clean_dying_memcg_async == 0 ||
	    sysctl_clean_dying_memcg_threshold == 0)
		return;

	if (atomic_long_read(&dying_memcgs_count) >=
	    sysctl_clean_dying_memcg_threshold) {
		atomic_long_set(&dying_memcgs_count, 0);
		wakeup_kclean_dying_memcg();
	}

	memcg->offline_times = jiffies;
	atomic_long_add(1, &dying_memcgs_count);
}
#endif
/*
 * Take the memcg behind @css offline: unregister its events, clear the
 * memory protection settings, take kmem and writeback offline, drain the
 * charge stock and drop the ID reference held by the online state.
 */
static void mem_cgroup_css_offline(struct cgroup_subsys_state *css)
{
	struct mem_cgroup *memcg = mem_cgroup_from_css(css);
	struct mem_cgroup_event *event, *next;

#ifdef CONFIG_MEM_QOS
	charge_dying_memcgs(memcg);
	/* XXX no direct number */
	memcg_notify_prio_change(memcg, memcg_get_prio(memcg), 0);
#endif

	/*
	 * Unregister events and notify userspace.
	 * Notify userspace about cgroup removing only after rmdir of cgroup
	 * directory to avoid race between userspace and kernelspace.
	 */
	spin_lock(&memcg->event_list_lock);
	list_for_each_entry_safe(event, next, &memcg->event_list, list) {
		list_del_init(&event->list);
		schedule_work(&event->remove);
	}
	spin_unlock(&memcg->event_list_lock);

	page_counter_set_min(&memcg->memory, 0);
	page_counter_set_low(&memcg->memory, 0);
#ifdef CONFIG_MEM_QOS
	page_counter_set_async_high(&memcg->memory, PAGE_COUNTER_MAX);
	page_counter_set_async_low(&memcg->memory, PAGE_COUNTER_MAX);
#endif

	memcg_offline_kmem(memcg);
	wb_memcg_offline(memcg);

	drain_all_stock(memcg);

	mem_cgroup_id_put(memcg);
}
/* CSS release hook: calls invalidate_reclaim_iterators() for the memcg. */
static void mem_cgroup_css_released(struct cgroup_subsys_state *css)
{
	invalidate_reclaim_iterators(mem_cgroup_from_css(css));
}
/*
 * Final teardown of the memcg behind @css.
 *
 * Waits for outstanding foreign-writeback completions, drops the socket
 * accounting static-branch references and the bound blkio CSS, makes
 * sure no deferred work can still run against the memcg, and then frees
 * it.
 */
static void mem_cgroup_css_free(struct cgroup_subsys_state *css)
{
	struct mem_cgroup *memcg = mem_cgroup_from_css(css);
	int __maybe_unused i;

#ifdef CONFIG_CGROUP_WRITEBACK
	for (i = 0; i < MEMCG_CGWB_FRN_CNT; i++)
		wb_wait_for_completion(&memcg->cgwb_frn[i].done);
#endif
	if (cgroup_subsys_on_dfl(memory_cgrp_subsys) && !cgroup_memory_nosocket)
		static_branch_dec(&memcg_sockets_enabled_key);

	if (!cgroup_subsys_on_dfl(memory_cgrp_subsys) && memcg->tcpmem_active)
		static_branch_dec(&memcg_sockets_enabled_key);

	if (memcg->bind_blkio) {
		WARN_ON(!memcg->bind_blkio_path);
		kfree(memcg->bind_blkio_path);
		css_put(memcg->bind_blkio);
	}

	vmpressure_cleanup(&memcg->vmpressure);
	cancel_work_sync(&memcg->high_work);
	/*
	 * The guard must be spelled CONFIG_MEM_QOS: with a lowercase name
	 * the symbol is never defined, the cancel is compiled out, and the
	 * async work initialised in mem_cgroup_alloc() could still be
	 * pending when the memcg is freed below.
	 */
#ifdef CONFIG_MEM_QOS
	cancel_work_sync(&memcg->async_work);
#endif
	mem_cgroup_remove_from_trees(memcg);
	memcg_free_shrinker_maps(memcg);
	memcg_free_kmem(memcg);
	mem_cgroup_free(memcg);
}
/**
 * mem_cgroup_css_reset - reset the states of a mem_cgroup
 * @css: the target css
 *
 * Reset the states of the mem_cgroup associated with @css.  This is
 * invoked when the userland requests disabling on the default hierarchy
 * but the memcg is pinned through dependency.  The memcg should stop
 * applying policies and should revert to the vanilla state as it may be
 * made visible again.
 *
 * The current implementation only resets the essential configurations.
 * This needs to be expanded to cover all the visible parts.
 */
static void mem_cgroup_css_reset(struct cgroup_subsys_state *css)
{
	struct mem_cgroup *memcg = mem_cgroup_from_css(css);

	/* Lift every hard limit. */
	page_counter_set_max(&memcg->memory, PAGE_COUNTER_MAX);
	page_counter_set_max(&memcg->swap, PAGE_COUNTER_MAX);
	page_counter_set_max(&memcg->kmem, PAGE_COUNTER_MAX);
	page_counter_set_max(&memcg->tcpmem, PAGE_COUNTER_MAX);
#ifdef CONFIG_MEM_QOS
	page_counter_set_max(&memcg->pagecache, PAGE_COUNTER_MAX);
#endif

	/* Drop the memory protection settings. */
	page_counter_set_min(&memcg->memory, 0);
	page_counter_set_low(&memcg->memory, 0);
#ifdef CONFIG_MEM_QOS
	page_counter_set_async_high(&memcg->memory, PAGE_COUNTER_MAX);
	page_counter_set_async_low(&memcg->memory, PAGE_COUNTER_MAX);
#endif

	WRITE_ONCE(memcg->high, PAGE_COUNTER_MAX);
	memcg->soft_limit = PAGE_COUNTER_MAX;

	memcg_wb_domain_size_changed(memcg);
	mem_cgroup_mst_msc_reset(memcg);
}
#ifdef CONFIG_MMU
/* Handlers for move charge at task migration. */
/*
 * Charge @count pages to mc.to ahead of the actual move and record them
 * in mc.precharge.
 *
 * Returns 0 when all @count pages were charged, otherwise the error of
 * the first single-page charge that failed; pages charged before that
 * failure remain accounted in mc.precharge.
 */
static int mem_cgroup_do_precharge(unsigned long count)
{
	int ret;

	/* Try a single bulk charge without reclaim first, kswapd may wake */
	ret = try_charge(mc.to, GFP_KERNEL & ~__GFP_DIRECT_RECLAIM, count);
	if (ret == 0) {
		mc.precharge += count;
		return 0;
	}

	/* Try charges one by one with reclaim, but do not retry */
	for (; count; count--) {
		ret = try_charge(mc.to, GFP_KERNEL | __GFP_NORETRY, 1);
		if (ret)
			return ret;
		mc.precharge++;
		cond_resched();
	}

	return 0;
}
/*
* Result slot filled in by get_mctgt_type() / get_mctgt_type_thp():
* @page for MC_TARGET_PAGE and MC_TARGET_DEVICE, @ent for MC_TARGET_SWAP.
*/
union mc_target {
struct page *page;
swp_entry_t ent;
};
/* Classification of a page-table entry examined while moving charges. */
enum mc_target_type {
MC_TARGET_NONE = 0, /* not a target for move charge */
MC_TARGET_PAGE, /* a page charged to mc.from */
MC_TARGET_SWAP, /* a swap entry recorded against mc.from */
MC_TARGET_DEVICE, /* like MC_TARGET_PAGE, but a device-private page */
};
/*
 * Resolve a present pte to a page that is eligible for charge moving.
 *
 * Returns the page with an extra reference held, or NULL when there is
 * no normal mapped page, when the move flags do not cover its type
 * (MOVE_ANON for anonymous pages, MOVE_FILE otherwise), or when no
 * reference could be taken.
 */
static struct page *mc_handle_present_pte(struct vm_area_struct *vma,
					  unsigned long addr, pte_t ptent)
{
	struct page *page = vm_normal_page(vma, addr, ptent);

	if (!page || !page_mapped(page))
		return NULL;

	if (!(mc.flags & (PageAnon(page) ? MOVE_ANON : MOVE_FILE)))
		return NULL;

	return get_page_unless_zero(page) ? page : NULL;
}
#if defined(CONFIG_SWAP) || defined(CONFIG_DEVICE_PRIVATE)
/*
 * Resolve a swap pte for charge moving (only when MOVE_ANON is set).
 *
 * A device-private entry yields its page with an extra reference, or
 * NULL if the reference cannot be taken.  Other non-swap entries yield
 * NULL.  For a real swap entry, *@entry is set and the swap-cache page,
 * if any, is returned with a reference held.
 */
static struct page *mc_handle_swap_pte(struct vm_area_struct *vma,
				       pte_t ptent, swp_entry_t *entry)
{
	swp_entry_t ent = pte_to_swp_entry(ptent);
	struct page *page;

	if (!(mc.flags & MOVE_ANON))
		return NULL;

	/*
	 * Handle MEMORY_DEVICE_PRIVATE which are ZONE_DEVICE page belonging to
	 * a device and because they are not accessible by CPU they are store
	 * as special swap entry in the CPU page table.
	 */
	if (is_device_private_entry(ent)) {
		page = device_private_entry_to_page(ent);
		/*
		 * MEMORY_DEVICE_PRIVATE means ZONE_DEVICE page and which have
		 * a refcount of 1 when free (unlike normal page)
		 */
		return page_ref_add_unless(page, 1, 1) ? page : NULL;
	}

	if (non_swap_entry(ent))
		return NULL;

	/*
	 * Because lookup_swap_cache() updates some statistics counter,
	 * we call find_get_page() with swapper_space directly.
	 */
	page = find_get_page(swap_address_space(ent), swp_offset(ent));
	entry->val = ent.val;

	return page;
}
#else
/* Without CONFIG_SWAP and CONFIG_DEVICE_PRIVATE a swap pte is never a target. */
static struct page *mc_handle_swap_pte(struct vm_area_struct *vma,
				       pte_t ptent, swp_entry_t *entry)
{
	return NULL;
}
#endif
/*
 * Look up the page-cache page backing @addr in a file-backed @vma (only
 * when MOVE_FILE is set).
 *
 * For shmem mappings with CONFIG_SWAP, a value entry in the mapping is
 * converted to a swap entry, stored in *@entry, and the swap cache is
 * consulted instead.
 *
 * Returns the result of the lookup, or NULL for an anonymous vma or
 * when MOVE_FILE is not set.
 */
static struct page *mc_handle_file_pte(struct vm_area_struct *vma,
			unsigned long addr, pte_t ptent, swp_entry_t *entry)
{
	struct address_space *mapping;
	pgoff_t pgoff;

	if (!vma->vm_file) /* anonymous vma */
		return NULL;
	if (!(mc.flags & MOVE_FILE))
		return NULL;

	mapping = vma->vm_file->f_mapping;
	pgoff = linear_page_index(vma, addr);

	/* page is moved even if it's not RSS of this task(page-faulted). */
#ifdef CONFIG_SWAP
	/* shmem/tmpfs may report page out on swap: account for that too. */
	if (shmem_mapping(mapping)) {
		struct page *page = find_get_entry(mapping, pgoff);

		if (xa_is_value(page)) {
			swp_entry_t swp = radix_to_swp_entry(page);

			*entry = swp;
			page = find_get_page(swap_address_space(swp),
					     swp_offset(swp));
		}
		return page;
	}
#endif
	return find_get_page(mapping, pgoff);
}
/**
* mem_cgroup_move_account - move account of the page
* @page: the page
* @compound: charge the page as compound or small page
* @from: mem_cgroup which the page is moved from.
* @to: mem_cgroup which the page is moved to. @from != @to.
*
* The caller must make sure the page is not on LRU (isolate_page() is useful.)
*
* This function doesn't do "charge" to new cgroup and doesn't do "uncharge"
* from old cgroup.
*
* Return: 0 on success, -EBUSY if the page lock could not be taken without
* blocking, or -EINVAL if the page is not charged to @from.
*/
static int mem_cgroup_move_account(struct page *page,
bool compound,
struct mem_cgroup *from,
struct mem_cgroup *to)
{
struct lruvec *from_vec, *to_vec;
struct pglist_data *pgdat;
unsigned int nr_pages = compound ? thp_nr_pages(page) : 1;
int ret;
VM_BUG_ON(from == to);
VM_BUG_ON_PAGE(PageLRU(page), page);
VM_BUG_ON(compound && !PageTransHuge(page));
/*
* Prevent mem_cgroup_migrate() from looking at
* page->mem_cgroup of its source page while we change it.
*/
ret = -EBUSY;
if (!trylock_page(page))
goto out;
ret = -EINVAL;
if (page->mem_cgroup != from)
goto out_unlock;
pgdat = page_pgdat(page);
from_vec = mem_cgroup_lruvec(from, pgdat);
to_vec = mem_cgroup_lruvec(to, pgdat);
lock_page_memcg(page);
/*
* Transfer the per-lruvec counters that describe the page's current
* state from @from's lruvec to @to's.
*/
if (PageAnon(page)) {
if (page_mapped(page)) {
__mod_lruvec_state(from_vec, NR_ANON_MAPPED, -nr_pages);
__mod_lruvec_state(to_vec, NR_ANON_MAPPED, nr_pages);
/* NR_ANON_THPS is moved by one, not by nr_pages. */
if (PageTransHuge(page)) {
__dec_lruvec_state(from_vec, NR_ANON_THPS);
__inc_lruvec_state(to_vec, NR_ANON_THPS);
}
}
} else {
__mod_lruvec_state(from_vec, NR_FILE_PAGES, -nr_pages);
__mod_lruvec_state(to_vec, NR_FILE_PAGES, nr_pages);
if (PageSwapBacked(page)) {
__mod_lruvec_state(from_vec, NR_SHMEM, -nr_pages);
__mod_lruvec_state(to_vec, NR_SHMEM, nr_pages);
}
if (page_mapped(page)) {
__mod_lruvec_state(from_vec, NR_FILE_MAPPED, -nr_pages);
__mod_lruvec_state(to_vec, NR_FILE_MAPPED, nr_pages);
}
if (PageDirty(page)) {
struct address_space *mapping = page_mapping(page);
if (mapping_cap_account_dirty(mapping)) {
__mod_lruvec_state(from_vec, NR_FILE_DIRTY,
-nr_pages);
__mod_lruvec_state(to_vec, NR_FILE_DIRTY,
nr_pages);
}
}
}
if (PageWriteback(page)) {
__mod_lruvec_state(from_vec, NR_WRITEBACK, -nr_pages);
__mod_lruvec_state(to_vec, NR_WRITEBACK, nr_pages);
}
/*
* All state has been migrated, let's switch to the new memcg.
*
* It is safe to change page->mem_cgroup here because the page
* is referenced, charged, isolated, and locked: we can't race
* with (un)charging, migration, LRU putback, or anything else
* that would rely on a stable page->mem_cgroup.
*
* Note that lock_page_memcg is a memcg lock, not a page lock,
* to save space. As soon as we switch page->mem_cgroup to a
* new memcg that isn't locked, the above state can change
* concurrently again. Make sure we're truly done with it.
*/
smp_mb();
/* The page's CSS reference moves from @from to @to with the page. */
css_get(&to->css);
css_put(&from->css);
page->mem_cgroup = to;
/* The lock was taken through the page while it still pointed at @from. */
__unlock_page_memcg(from);
ret = 0;
/* Statistics and event checks run with local interrupts disabled. */
local_irq_disable();
mem_cgroup_charge_statistics(to, nr_pages);
memcg_check_events(to, page);
mem_cgroup_charge_statistics(from, -nr_pages);
memcg_check_events(from, page);
local_irq_enable();
out_unlock:
unlock_page(page);
out:
return ret;
}
/**
* get_mctgt_type - get target type of moving charge
* @vma: the vma the pte to be checked belongs
* @addr: the address corresponding to the pte to be checked
* @ptent: the pte to be checked
* @target: the pointer the target page or swap ent will be stored(can be NULL)
*
* Returns
* 0(MC_TARGET_NONE): if the pte is not a target for move charge.
* 1(MC_TARGET_PAGE): if the page corresponding to this pte is a target for
* move charge. if @target is not NULL, the page is stored in target->page
* with extra refcnt got(Callers should handle it).
* 2(MC_TARGET_SWAP): if the swap entry corresponding to this pte is a
* target for charge migration. if @target is not NULL, the entry is stored
* in target->ent.
* 3(MC_TARGET_DEVICE): like MC_TARGET_PAGE but page is MEMORY_DEVICE_PRIVATE
* (so ZONE_DEVICE page and thus not on the lru).
* For now such a page is charged like a regular page would be, as for all
* intents and purposes it is just special memory taking the place of a
* regular page.
*
* See Documentations/vm/hmm.txt and include/linux/hmm.h
*
* Called with pte lock held.
*/
static enum mc_target_type get_mctgt_type(struct vm_area_struct *vma,
unsigned long addr, pte_t ptent, union mc_target *target)
{
struct page *page = NULL;
enum mc_target_type ret = MC_TARGET_NONE;
swp_entry_t ent = { .val = 0 };
/* Pick the lookup helper that matches the kind of pte. */
if (pte_present(ptent))
page = mc_handle_present_pte(vma, addr, ptent);
else if (is_swap_pte(ptent))
page = mc_handle_swap_pte(vma, ptent, &ent);
else if (pte_none(ptent))
page = mc_handle_file_pte(vma, addr, ptent, &ent);
if (!page && !ent.val)
return ret;
if (page) {
/*
* Do only loose check w/o serialization.
* mem_cgroup_move_account() checks the page is valid or
* not under LRU exclusion.
*/
if (page->mem_cgroup == mc.from) {
ret = MC_TARGET_PAGE;
if (is_device_private_page(page))
ret = MC_TARGET_DEVICE;
if (target)
target->page = page;
}
/*
* Keep the page reference only when the page is a target and is
* being handed back through @target.
*/
if (!ret || !target)
put_page(page);
}
/*
* There is a swap entry and a page doesn't exist or isn't charged.
* But we cannot move a tail-page in a THP.
*/
if (ent.val && !ret && (!page || !PageTransCompound(page)) &&
mem_cgroup_id(mc.from) == lookup_swap_cgroup_id(ent)) {
ret = MC_TARGET_SWAP;
if (target)
target->ent = ent;
}
return ret;
}
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
/*
 * We don't consider PMD mapped swapping or file mapped pages because THP does
 * not support them for now.
 * Caller should make sure that pmd_trans_huge(pmd) is true.
 *
 * Returns MC_TARGET_PAGE when MOVE_ANON is set and the huge page is
 * charged to mc.from (taking a page reference and storing the page in
 * @target when @target is not NULL), MC_TARGET_NONE otherwise.
 */
static enum mc_target_type get_mctgt_type_thp(struct vm_area_struct *vma,
		unsigned long addr, pmd_t pmd, union mc_target *target)
{
	struct page *page;

	if (unlikely(is_swap_pmd(pmd))) {
		VM_BUG_ON(thp_migration_supported() &&
			  !is_pmd_migration_entry(pmd));
		return MC_TARGET_NONE;
	}

	page = pmd_page(pmd);
	VM_BUG_ON_PAGE(!page || !PageHead(page), page);

	if (!(mc.flags & MOVE_ANON))
		return MC_TARGET_NONE;

	if (page->mem_cgroup != mc.from)
		return MC_TARGET_NONE;

	if (target) {
		get_page(page);
		target->page = page;
	}

	return MC_TARGET_PAGE;
}
#else
/* Without CONFIG_TRANSPARENT_HUGEPAGE a pmd is never a move target. */
static inline enum mc_target_type get_mctgt_type_thp(struct vm_area_struct *vma,
		unsigned long addr, pmd_t pmd, union mc_target *target)
{
	return MC_TARGET_NONE;
}
#endif
/*
 * Page-walk pmd callback: count in mc.precharge how many pages in
 * [@addr, @end) are targets for charge moving.  Always returns 0.
 */
static int mem_cgroup_count_precharge_pte_range(pmd_t *pmd,
					unsigned long addr, unsigned long end,
					struct mm_walk *walk)
{
	struct vm_area_struct *vma = walk->vma;
	spinlock_t *ptl;
	pte_t *pte;

	ptl = pmd_trans_huge_lock(pmd, vma);
	if (ptl) {
		/*
		 * Note there can not be MC_TARGET_DEVICE for now as we do not
		 * support transparent huge page with MEMORY_DEVICE_PRIVATE but
		 * this might change.
		 */
		if (get_mctgt_type_thp(vma, addr, *pmd, NULL) == MC_TARGET_PAGE)
			mc.precharge += HPAGE_PMD_NR;
		spin_unlock(ptl);
		return 0;
	}

	if (pmd_trans_unstable(pmd))
		return 0;

	pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
	while (addr != end) {
		if (get_mctgt_type(vma, addr, *pte, NULL))
			mc.precharge++;	/* increment precharge temporarily */
		pte++;
		addr += PAGE_SIZE;
	}
	pte_unmap_unlock(pte - 1, ptl);
	cond_resched();

	return 0;
}
/* Page-walk callbacks used by mem_cgroup_count_precharge(). */
static const struct mm_walk_ops precharge_walk_ops = {
.pmd_entry = mem_cgroup_count_precharge_pte_range,
};
/*
 * Walk all of @mm under mmap_sem (read) and return the number of pages
 * that are targets for charge moving.  mc.precharge is used as scratch
 * space by the walk and is reset to zero before returning.
 */
static unsigned long mem_cgroup_count_precharge(struct mm_struct *mm)
{
	unsigned long nr_targets;

	down_read(&mm->mmap_sem);
	walk_page_range(mm, 0, mm->highest_vm_end, &precharge_walk_ops, NULL);
	up_read(&mm->mmap_sem);

	nr_targets = mc.precharge;
	mc.precharge = 0;

	return nr_targets;
}
/*
 * Count the move targets in @mm, register current as the moving task
 * and precharge that many pages to mc.to.  Returns the result of
 * mem_cgroup_do_precharge().
 */
static int mem_cgroup_precharge_mc(struct mm_struct *mm)
{
	unsigned long nr_targets = mem_cgroup_count_precharge(mm);

	VM_BUG_ON(mc.moving_task);
	mc.moving_task = current;

	return mem_cgroup_do_precharge(nr_targets);
}
/* cancels all extra charges on mc.from and mc.to, and wakes up all waiters. */
static void __mem_cgroup_clear_mc(void)
{
	struct mem_cgroup *from = mc.from;
	struct mem_cgroup *to = mc.to;

	/* we must uncharge all the leftover precharges from mc.to */
	if (mc.precharge) {
		cancel_charge(mc.to, mc.precharge);
		mc.precharge = 0;
	}

	/*
	 * we didn't uncharge from mc.from at mem_cgroup_move_account(), so
	 * we must uncharge here.
	 */
	if (mc.moved_charge) {
		cancel_charge(mc.from, mc.moved_charge);
		mc.moved_charge = 0;
	}

	/* we must fixup refcnts and charges */
	if (mc.moved_swap) {
		/* uncharge swap account from the old cgroup */
		if (!mem_cgroup_is_root(mc.from))
			page_counter_uncharge(&mc.from->memsw, mc.moved_swap);

		mem_cgroup_id_put_many(mc.from, mc.moved_swap);

		/*
		 * we charged both to->memory and to->memsw, so we
		 * should uncharge to->memory.
		 */
		if (!mem_cgroup_is_root(mc.to))
			page_counter_uncharge(&mc.to->memory, mc.moved_swap);

		mc.moved_swap = 0;
	}

	memcg_oom_recover(from);
	memcg_oom_recover(to);
	wake_up_all(&mc.waitq);
}
static void mem_cgroup_clear_mc(void)
{
struct mm_struct *mm = mc.mm;
/*
* we must clear moving_task before waking up waiters at the end of
* task migration.
*/
mc.moving_task = NULL;
__mem_cgroup_clear_mc();
spin_lock(&mc.lock);
mc.from = NULL;
mc.to = NULL;
mc.mm = NULL;
spin_unlock(&mc.lock);
mmput(mm);
}
/*
 * cgroup can_attach hook: when the destination memcg asks for charges
 * to be moved at immigration and the migrating leader owns its mm, set
 * up the move context and precharge the destination.
 *
 * Returns 0 when there is nothing to do or the precharge succeeded,
 * otherwise the precharge error (the move context is cleared again).
 */
static int mem_cgroup_can_attach(struct cgroup_taskset *tset)
{
	struct cgroup_subsys_state *css;
	struct mem_cgroup *memcg = NULL; /* unneeded init to make gcc happy */
	struct mem_cgroup *from;
	struct task_struct *leader, *p = NULL;
	struct mm_struct *mm;
	unsigned long move_flags;
	int ret;

	/* charge immigration isn't supported on the default hierarchy */
	if (cgroup_subsys_on_dfl(memory_cgrp_subsys))
		return 0;

	/*
	 * Multi-process migrations only happen on the default hierarchy
	 * where charge immigration is not used.  Perform charge
	 * immigration if @tset contains a leader and whine if there are
	 * multiple.
	 */
	cgroup_taskset_for_each_leader(leader, css, tset) {
		WARN_ON_ONCE(p);
		p = leader;
		memcg = mem_cgroup_from_css(css);
	}
	if (!p)
		return 0;

	/*
	 * We are now committed to this value whatever it is. Changes in this
	 * tunable will only affect upcoming migrations, not the current one.
	 * So we need to save it, and keep it going.
	 */
	move_flags = READ_ONCE(memcg->move_charge_at_immigrate);
	if (!move_flags)
		return 0;

	from = mem_cgroup_from_task(p);
	VM_BUG_ON(from == memcg);

	mm = get_task_mm(p);
	if (!mm)
		return 0;

	/* We move charges only when we move a owner of the mm */
	if (mm->owner != p) {
		mmput(mm);
		return 0;
	}

	VM_BUG_ON(mc.from);
	VM_BUG_ON(mc.to);
	VM_BUG_ON(mc.precharge);
	VM_BUG_ON(mc.moved_charge);
	VM_BUG_ON(mc.moved_swap);

	spin_lock(&mc.lock);
	mc.mm = mm;
	mc.from = from;
	mc.to = memcg;
	mc.flags = move_flags;
	spin_unlock(&mc.lock);
	/* We set mc.moving_task later */

	ret = mem_cgroup_precharge_mc(mm);
	if (ret)
		mem_cgroup_clear_mc();

	return ret;
}
/* cgroup cancel_attach hook: undo a move context set up by can_attach. */
static void mem_cgroup_cancel_attach(struct cgroup_taskset *tset)
{
	if (!mc.to)
		return;

	mem_cgroup_clear_mc();
}
/*
* Page-walk pmd callback that moves the charges of the pages and swap
* entries mapped in [@addr, @end) from mc.from to mc.to, consuming
* mc.precharge as it goes.
*
* Returns 0, or the error from mem_cgroup_do_precharge() when the
* precharges ran out mid-range and one more page could not be charged.
*/
static int mem_cgroup_move_charge_pte_range(pmd_t *pmd,
unsigned long addr, unsigned long end,
struct mm_walk *walk)
{
int ret = 0;
struct vm_area_struct *vma = walk->vma;
pte_t *pte;
spinlock_t *ptl;
enum mc_target_type target_type;
union mc_target target;
struct page *page;
ptl = pmd_trans_huge_lock(pmd, vma);
if (ptl) {
/* Not enough precharge left to cover a whole huge page: skip it. */
if (mc.precharge < HPAGE_PMD_NR) {
spin_unlock(ptl);
return 0;
}
target_type = get_mctgt_type_thp(vma, addr, *pmd, &target);
if (target_type == MC_TARGET_PAGE) {
page = target.page;
if (!isolate_lru_page(page)) {
if (!mem_cgroup_move_account(page, true,
mc.from, mc.to)) {
mc.precharge -= HPAGE_PMD_NR;
mc.moved_charge += HPAGE_PMD_NR;
}
putback_lru_page(page);
}
/* Drop the reference taken by get_mctgt_type_thp(). */
put_page(page);
} else if (target_type == MC_TARGET_DEVICE) {
page = target.page;
if (!mem_cgroup_move_account(page, true,
mc.from, mc.to)) {
mc.precharge -= HPAGE_PMD_NR;
mc.moved_charge += HPAGE_PMD_NR;
}
put_page(page);
}
spin_unlock(ptl);
return 0;
}
if (pmd_trans_unstable(pmd))
return 0;
retry:
pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
for (; addr != end; addr += PAGE_SIZE) {
pte_t ptent = *(pte++);
bool device = false;
swp_entry_t ent;
/* Out of precharges: leave the loop with addr != end. */
if (!mc.precharge)
break;
switch (get_mctgt_type(vma, addr, ptent, &target)) {
case MC_TARGET_DEVICE:
device = true;
/* fall through */
case MC_TARGET_PAGE:
page = target.page;
/*
* We can have a part of the split pmd here. Moving it
* can be done but it would be too convoluted so simply
* ignore such a partial THP and keep it in original
* memcg. There should be somebody mapping the head.
*/
if (PageTransCompound(page))
goto put;
/* Device pages are not isolated from / put back to the LRU. */
if (!device && isolate_lru_page(page))
goto put;
if (!mem_cgroup_move_account(page, false,
mc.from, mc.to)) {
mc.precharge--;
/* we uncharge from mc.from later. */
mc.moved_charge++;
}
if (!device)
putback_lru_page(page);
put: /* get_mctgt_type() gets the page */
put_page(page);
break;
case MC_TARGET_SWAP:
ent = target.ent;
if (!mem_cgroup_move_swap_account(ent, mc.from, mc.to)) {
mc.precharge--;
mem_cgroup_id_get_many(mc.to, 1);
/* we fixup other refcnts and charges later. */
mc.moved_swap++;
}
break;
default:
break;
}
}
pte_unmap_unlock(pte - 1, ptl);
cond_resched();
if (addr != end) {
/*
* We have consumed all precharges we got in can_attach().
* We try charge one by one, but don't do any additional
* charges to mc.to if we have failed in charge once in attach()
* phase.
*/
ret = mem_cgroup_do_precharge(1);
if (!ret)
goto retry;
}
return ret;
}
/* Page-walk callbacks used by mem_cgroup_move_charge(). */
static const struct mm_walk_ops charge_walk_ops = {
.pmd_entry = mem_cgroup_move_charge_pte_range,
};
/*
 * Walk mc.mm and move the charges of its pages from mc.from to mc.to.
 */
static void mem_cgroup_move_charge(void)
{
	lru_add_drain_all();

	/*
	 * Signal lock_page_memcg() to take the memcg's move_lock
	 * while we're moving its pages to another memcg. Then wait
	 * for already started RCU-only updates to finish.
	 */
	atomic_inc(&mc.from->moving_account);
	synchronize_rcu();

	/*
	 * Someone who are holding the mmap_sem might be waiting in
	 * waitq. So we cancel all extra charges, wake up all waiters,
	 * and retry. Because we cancel precharges, we might not be able
	 * to move enough charges, but moving charge is a best-effort
	 * feature anyway, so it wouldn't be a big problem.
	 */
	while (unlikely(!down_read_trylock(&mc.mm->mmap_sem))) {
		__mem_cgroup_clear_mc();
		cond_resched();
	}

	/*
	 * When we have consumed all precharges and failed in doing
	 * additional charge, the page walk just aborts.
	 */
	walk_page_range(mc.mm, 0, mc.mm->highest_vm_end, &charge_walk_ops,
			NULL);
	up_read(&mc.mm->mmap_sem);

	atomic_dec(&mc.from->moving_account);
}
/* Move the charges and clear the move context, if one was set up. */
static void mem_cgroup_move_task(void)
{
	if (!mc.to)
		return;

	mem_cgroup_move_charge();
	mem_cgroup_clear_mc();
}
#else /* !CONFIG_MMU */
/* !CONFIG_MMU: there is nothing to prepare; always reports success. */
static int mem_cgroup_can_attach(struct cgroup_taskset *tset)
{
	return 0;
}
/* !CONFIG_MMU: nothing to undo. */
static void mem_cgroup_cancel_attach(struct cgroup_taskset *tset)
{
}
/* !CONFIG_MMU: no charges are moved. */
static void mem_cgroup_move_task(void)
{
}
#endif
/*
 * Cgroup retains root cgroups across [un]mount cycles making it necessary
 * to verify whether we're attached to the default hierarchy on each mount
 * attempt.
 */
static void mem_cgroup_bind(struct cgroup_subsys_state *root_css)
{
	/*
	 * use_hierarchy is forced on the default hierarchy.  cgroup core
	 * guarantees that @root doesn't have any children, so turning it
	 * on for the root memcg is enough.
	 */
	root_mem_cgroup->use_hierarchy =
		cgroup_subsys_on_dfl(memory_cgrp_subsys) ? true : false;
}
#ifdef CONFIG_LRU_GEN
/*
 * ->attach callback when CONFIG_LRU_GEN is enabled: look at the first
 * thread-group leader in @tset (if any) and, when that task owns its mm,
 * call lru_gen_migrate_mm() on that mm.
 */
static void mem_cgroup_attach(struct cgroup_taskset *tset)
{
struct task_struct *task;
struct cgroup_subsys_state *css;
/* find the first leader if there is any */
cgroup_taskset_for_each_leader(task, css, tset)
break;
/* NOTE(review): relies on the iterator leaving task NULL when no leader exists - confirm */
if (!task)
return;
/* task_lock() is held across the task->mm checks and the migrate call */
task_lock(task);
if (task->mm && READ_ONCE(task->mm->owner) == task)
lru_gen_migrate_mm(task->mm);
task_unlock(task);
}
#else
/* !CONFIG_LRU_GEN stub: intentionally empty. */
static void mem_cgroup_attach(struct cgroup_taskset *tset)
{
}
#endif /* CONFIG_LRU_GEN */
/*
 * Print a page-counter tunable to @m: the literal "max" when @value is
 * PAGE_COUNTER_MAX, otherwise @value converted from pages to bytes.
 * Always returns 0.
 */
static int seq_puts_memcg_tunable(struct seq_file *m, unsigned long value)
{
	if (value != PAGE_COUNTER_MAX) {
		seq_printf(m, "%llu\n", (u64)value * PAGE_SIZE);
		return 0;
	}

	seq_puts(m, "max\n");
	return 0;
}
static u64 memory_current_read(struct cgroup_subsys_state *css,
struct cftype *cft)
{
struct mem_cgroup *memcg = mem_cgroup_from_css(css);
return (u64)page_counter_read(&memcg->memory) * PAGE_SIZE;
}
static int memory_min_show(struct seq_file *m, void *v)
{
return seq_puts_memcg_tunable(m,
READ_ONCE(mem_cgroup_from_seq(m)->memory.min));
}
/*
 * Parse a new memory.min value ("max" is accepted) and apply it with
 * page_counter_set_min(). Returns @nbytes on success or the parse error.
 */
static ssize_t memory_min_write(struct kernfs_open_file *of,
				char *buf, size_t nbytes, loff_t off)
{
	struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
	unsigned long nr_pages;
	int ret;

	ret = page_counter_memparse(strstrip(buf), "max", &nr_pages);
	if (ret)
		return ret;

	page_counter_set_min(&memcg->memory, nr_pages);

	return nbytes;
}
static int memory_low_show(struct seq_file *m, void *v)
{
return seq_puts_memcg_tunable(m,
READ_ONCE(mem_cgroup_from_seq(m)->memory.low));
}
/*
 * Parse a new memory.low value ("max" is accepted) and apply it with
 * page_counter_set_low(). Returns @nbytes on success or the parse error.
 */
static ssize_t memory_low_write(struct kernfs_open_file *of,
				char *buf, size_t nbytes, loff_t off)
{
	struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
	unsigned long nr_pages;
	int ret;

	ret = page_counter_memparse(strstrip(buf), "max", &nr_pages);
	if (ret)
		return ret;

	page_counter_set_low(&memcg->memory, nr_pages);

	return nbytes;
}
static int memory_high_show(struct seq_file *m, void *v)
{
return seq_puts_memcg_tunable(m, READ_ONCE(mem_cgroup_from_seq(m)->high));
}
/*
 * Set memory.high. The new value ("max" is accepted) is stored first; if
 * current usage exceeds it, one reclaim pass for the excess is started.
 * Returns @nbytes on success or the parse error.
 */
static ssize_t memory_high_write(struct kernfs_open_file *of,
				 char *buf, size_t nbytes, loff_t off)
{
	struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
	unsigned long usage, limit;
	int ret;

	ret = page_counter_memparse(strstrip(buf), "max", &limit);
	if (ret)
		return ret;

	WRITE_ONCE(memcg->high, limit);

	/* The result of the reclaim attempt is deliberately not checked. */
	usage = page_counter_read(&memcg->memory);
	if (usage > limit)
		try_to_free_mem_cgroup_pages(memcg, usage - limit,
					     GFP_KERNEL, true);

#ifdef CONFIG_MEM_QOS
	/* Recompute the async watermarks and kick async reclaim if needed. */
	setup_async_wmark(memcg);
	if (need_memcg_async_reclaim(memcg))
		queue_work(memcg_async_reclaim_wq, &memcg->async_work);
#endif

	memcg_wb_domain_size_changed(memcg);

	return nbytes;
}
/*
 * Wrapper around memory_high_write() that is only usable while vm_emm is
 * set; otherwise it logs a warning and fails with -EINVAL.
 */
static ssize_t memory_high_write_cgv1(struct kernfs_open_file *of,
				      char *buf, size_t nbytes, loff_t off)
{
	if (vm_emm)
		return memory_high_write(of, buf, nbytes, off);

	printk(KERN_WARNING "you should open vm.wujing_enable.\n");
	return -EINVAL;
}
static int memory_max_show(struct seq_file *m, void *v)
{
return seq_puts_memcg_tunable(m,
READ_ONCE(mem_cgroup_from_seq(m)->memory.max));
}
/*
 * Set memory.max and try to bring usage below the new limit.
 *
 * The new limit ("max" is accepted) is installed first. Then, while usage
 * is above it, the loop escalates: drain the per-cpu stocks once, reclaim
 * until MEM_CGROUP_RECLAIM_RETRIES attempts have made no progress, and
 * finally raise MEMCG_OOM and invoke the memcg OOM killer.
 *
 * Returns the parse error if the input is invalid, otherwise @nbytes.
 * Note that the -EINTR stored in err when a signal is pending is never
 * returned: the write still reports @nbytes.
 */
static ssize_t memory_max_write(struct kernfs_open_file *of,
char *buf, size_t nbytes, loff_t off)
{
struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
unsigned int nr_reclaims = MEM_CGROUP_RECLAIM_RETRIES;
bool drained = false;
unsigned long max;
int err;
buf = strstrip(buf);
err = page_counter_memparse(buf, "max", &max);
if (err)
return err;
/* Install the new limit before trying to shrink usage below it. */
xchg(&memcg->memory.max, max);
for (;;) {
unsigned long nr_pages = page_counter_read(&memcg->memory);
/* Done as soon as usage fits under the new limit. */
if (nr_pages <= max)
break;
if (signal_pending(current)) {
err = -EINTR;
break;
}
/* First step: drain the stocks once, then re-check usage. */
if (!drained) {
drain_all_stock(memcg);
drained = true;
continue;
}
/* A retry is only consumed when a reclaim pass frees nothing. */
if (nr_reclaims) {
if (!try_to_free_mem_cgroup_pages(memcg, nr_pages - max,
GFP_KERNEL, true))
nr_reclaims--;
continue;
}
/* Last resort: OOM; give up when mem_cgroup_out_of_memory() returns false. */
memcg_memory_event(memcg, MEMCG_OOM);
if (!mem_cgroup_out_of_memory(memcg, GFP_KERNEL, 0))
break;
}
#ifdef CONFIG_MEM_QOS
setup_async_wmark(memcg);
if (need_memcg_async_reclaim(memcg))
queue_work(memcg_async_reclaim_wq, &memcg->async_work);
pagecache_set_limit(memcg);
#endif
memcg_wb_domain_size_changed(memcg);
return nbytes;
}
/*
 * Print the event counters in @events (indexed by the MEMCG_* event ids)
 * to @m, one "name value" line per event. The two CONFIG_MEM_QOS page
 * cache counters use a "name:value" form instead of "name value".
 */
static void __memory_events_show(struct seq_file *m, atomic_long_t *events)
{
seq_printf(m, "low %lu\n", atomic_long_read(&events[MEMCG_LOW]));
seq_printf(m, "high %lu\n", atomic_long_read(&events[MEMCG_HIGH]));
seq_printf(m, "max %lu\n", atomic_long_read(&events[MEMCG_MAX]));
seq_printf(m, "oom %lu\n", atomic_long_read(&events[MEMCG_OOM]));
seq_printf(m, "oom_kill %lu\n",
atomic_long_read(&events[MEMCG_OOM_KILL]));
#ifdef CONFIG_MEM_QOS
seq_printf(m, "pagecache_max:%lu\n", atomic_long_read(&events[MEMCG_PAGECACHE_MAX]));
seq_printf(m, "pagecache_oom:%lu\n", atomic_long_read(&events[MEMCG_PAGECACHE_OOM]));
#endif
}
static int memory_events_show(struct seq_file *m, void *v)
{
struct mem_cgroup *memcg = mem_cgroup_from_seq(m);
__memory_events_show(m, memcg->memory_events);
return 0;
}
static int memory_events_local_show(struct seq_file *m, void *v)
{
struct mem_cgroup *memcg = mem_cgroup_from_seq(m);
__memory_events_show(m, memcg->memory_events_local);
return 0;
}
/*
 * Emit the text produced by memory_stat_format() for this memcg and free
 * it afterwards. Returns -ENOMEM when no buffer could be produced.
 */
static int memory_stat_show(struct seq_file *m, void *v)
{
	char *text = memory_stat_format(mem_cgroup_from_seq(m));

	if (!text)
		return -ENOMEM;

	seq_puts(m, text);
	kfree(text);

	return 0;
}
static int memory_oom_group_show(struct seq_file *m, void *v)
{
struct mem_cgroup *memcg = mem_cgroup_from_seq(m);
seq_printf(m, "%d\n", memcg->oom_group);
return 0;
}
/*
 * Set oom_group from user input. Only the integers 0 and 1 are accepted;
 * anything else yields -EINVAL (or the kstrtoint() error).
 */
static ssize_t memory_oom_group_write(struct kernfs_open_file *of,
				      char *buf, size_t nbytes, loff_t off)
{
	struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
	int value;
	int err;

	buf = strstrip(buf);
	if (!buf)
		return -EINVAL;

	err = kstrtoint(buf, 0, &value);
	if (err)
		return err;

	if (value < 0 || value > 1)
		return -EINVAL;

	memcg->oom_group = value;

	return nbytes;
}
/*
 * Handler for writes to the "reclaim" file: reclaim the requested amount
 * of memory from the memcg.
 *
 * Fails with -EINVAL (after logging a warning) unless vm_emm is set. The
 * input is parsed with page_counter_memparse() using an empty "max"
 * keyword. Reclaim is repeated until the requested number of pages has
 * been reclaimed.
 *
 * Returns @nbytes on success, the parse error on bad input, -EINTR when a
 * signal is pending, or -EAGAIN when a pass reclaims nothing and no
 * retries are left.
 */
static ssize_t memory_reclaim(struct kernfs_open_file *of, char *buf,
size_t nbytes, loff_t off)
{
struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
unsigned int nr_retries = MAX_RECLAIM_RETRIES;
unsigned long nr_to_reclaim, nr_reclaimed = 0;
int err;
if (!vm_emm) {
printk(KERN_WARNING "you should open vm.wujing_enable.\n");
return -EINVAL;
}
buf = strstrip(buf);
err = page_counter_memparse(buf, "", &nr_to_reclaim);
if (err)
return err;
while (nr_reclaimed < nr_to_reclaim) {
unsigned long reclaimed;
if (signal_pending(current))
return -EINTR;
/*
* This is the final attempt, drain percpu lru caches in the
* hope of introducing more evictable pages for
* try_to_free_mem_cgroup_pages().
*/
if (!nr_retries)
lru_add_drain_all();
reclaimed = try_to_free_mem_cgroup_pages(memcg,
nr_to_reclaim - nr_reclaimed,
GFP_KERNEL, true);
/*
 * The retry budget is only touched when a pass made no progress
 * (short-circuit), and the post-decrement means -EAGAIN is returned
 * on a zero-progress pass that starts with nr_retries already at 0.
 */
if (!reclaimed && !nr_retries--)
return -EAGAIN;
nr_reclaimed += reclaimed;
}
return nbytes;
}
#ifdef CONFIG_MEM_QOS
static int memory_async_high_wmark_show(struct seq_file *m, void *v)
{
return seq_puts_memcg_tunable(m,
READ_ONCE(mem_cgroup_from_seq(m)->memory.async_high));
}
static int memory_async_low_wmark_show(struct seq_file *m, void *v)
{
return seq_puts_memcg_tunable(m,
READ_ONCE(mem_cgroup_from_seq(m)->memory.async_low));
}
static int memory_async_distance_delta_show(struct seq_file *m, void *v)
{
struct mem_cgroup *memcg = mem_cgroup_from_seq(m);
seq_printf(m, "%d\n", READ_ONCE(memcg->async_distance_delta));
return 0;
}
/*
 * Set async_distance_delta. Accepts an integer in [1, 50]; other values
 * yield -EINVAL. The async reclaim factor is reset afterwards.
 */
static ssize_t memory_async_distance_delta_write(struct kernfs_open_file *of,
				char *buf, size_t nbytes, loff_t off)
{
	struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
	int value;
	int err;

	buf = strstrip(buf);
	if (!buf)
		return -EINVAL;

	err = kstrtoint(buf, 0, &value);
	if (err)
		return err;

	if (value < 1 || value > 50)
		return -EINVAL;

	xchg(&memcg->async_distance_delta, value);
	async_reclaim_reset_factor(memcg, memcg_get_prio(memcg));

	return nbytes;
}
static int memory_async_wmark_delta_show(struct seq_file *m, void *v)
{
struct mem_cgroup *memcg = mem_cgroup_from_seq(m);
seq_printf(m, "%d\n", READ_ONCE(memcg->async_wmark_delta));
return 0;
}
/*
 * Set async_wmark_delta. Accepts -1 or an integer in [1, 10]; other
 * values yield -EINVAL. The async reclaim factor is reset afterwards.
 */
static ssize_t memory_async_wmark_delta_write(struct kernfs_open_file *of,
				char *buf, size_t nbytes, loff_t off)
{
	struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
	int value;
	int err;

	buf = strstrip(buf);
	if (!buf)
		return -EINVAL;

	err = kstrtoint(buf, 0, &value);
	if (err)
		return err;

	if (value != -1 && (value < 1 || value > 10))
		return -EINVAL;

	xchg(&memcg->async_wmark_delta, value);
	async_reclaim_reset_factor(memcg, memcg_get_prio(memcg));

	return nbytes;
}
#endif
/*
 * Control files registered through memory_cgrp_subsys.dfl_cftypes.
 *
 * Entries flagged CFTYPE_NOT_ON_ROOT carry that flag explicitly; a few
 * entries (use_priority_oom, reclaim, emm.*, async_fork, allow_xfork) do
 * not. Several groups are only built with their respective CONFIG_*
 * option. The table ends with an empty terminator entry.
 */
static struct cftype memory_files[] = {
#ifdef CONFIG_MEM_QOS
/* Page cache limit/usage files */
{
.name = "pagecache.reclaim_ratio",
.flags = CFTYPE_NOT_ON_ROOT,
.read_u64 = pagecache_reclaim_ratio_read,
.write = pagecache_reclaim_ratio_write,
},
{
.name = "pagecache.max_ratio",
.flags = CFTYPE_NOT_ON_ROOT,
.read_u64 = memory_pagecache_max_read,
.write = memory_pagecache_max_write,
},
{
.name = "pagecache.current",
.flags = CFTYPE_NOT_ON_ROOT,
.read_u64 = pagecache_current_read,
},
#endif
/* Usage, protections and limits */
{
.name = "current",
.flags = CFTYPE_NOT_ON_ROOT,
.read_u64 = memory_current_read,
},
{
.name = "min",
.flags = CFTYPE_NOT_ON_ROOT,
.seq_show = memory_min_show,
.write = memory_min_write,
},
{
.name = "low",
.flags = CFTYPE_NOT_ON_ROOT,
.seq_show = memory_low_show,
.write = memory_low_write,
},
{
.name = "high",
.flags = CFTYPE_NOT_ON_ROOT,
.seq_show = memory_high_show,
.write = memory_high_write,
},
{
.name = "max",
.flags = CFTYPE_NOT_ON_ROOT,
.seq_show = memory_max_show,
.write = memory_max_write,
},
#ifdef CONFIG_MEM_QOS
{
.name = "use_priority_oom",
.write_u64 = mem_cgroup_priority_oom_write,
.read_u64 = mem_cgroup_priority_oom_read,
},
{
.name = "latency",
.read_u64 = mem_cgroup_latency_read,
.write_u64 = mem_cgroup_latency_write,
.flags = CFTYPE_NOT_ON_ROOT,
},
{
.name = "exit_latency_stat",
.flags = CFTYPE_NOT_ON_ROOT | CFTYPE_NS_DELEGATABLE,
.seq_show = memory_exit_latency_stat_show,
},
{
.name = "priority_wmark_ratio",
.flags = CFTYPE_NOT_ON_ROOT,
.write = memcg_wmark_ratio_write,
.seq_show = memcg_wmark_ratio_show,
},
#endif
/* Event counters; file_offset points at the per-memcg cgroup_file */
{
.name = "events",
.flags = CFTYPE_NOT_ON_ROOT,
.file_offset = offsetof(struct mem_cgroup, events_file),
.seq_show = memory_events_show,
},
{
.name = "events.local",
.flags = CFTYPE_NOT_ON_ROOT,
.file_offset = offsetof(struct mem_cgroup, events_local_file),
.seq_show = memory_events_local_show,
},
{
.name = "stat",
.flags = CFTYPE_NOT_ON_ROOT,
.seq_show = memory_stat_show,
},
{
.name = "oom.group",
.flags = CFTYPE_NOT_ON_ROOT | CFTYPE_NS_DELEGATABLE,
.seq_show = memory_oom_group_show,
.write = memory_oom_group_write,
},
#ifdef CONFIG_MEM_SPEED_THROTTLE
{
.name = "alloc_bps",
.write_u64 = mem_cgroup_mem_spd_lmt_write,
.read_u64 = mem_cgroup_mem_spd_lmt_read,
.flags = CFTYPE_NOT_ON_ROOT,
},
#endif
#ifdef CONFIG_MEM_QOS
/* Async reclaim watermarks; async_high/async_low are read-only */
{
.name = "async_ratio",
.flags = CFTYPE_NOT_ON_ROOT,
.seq_show = memory_async_reclaim_wmark_show,
.write = memory_async_reclaim_wmark_write,
},
{
.name = "async_high",
.flags = CFTYPE_NOT_ON_ROOT,
.seq_show = memory_async_high_wmark_show,
},
{
.name = "async_low",
.flags = CFTYPE_NOT_ON_ROOT,
.seq_show = memory_async_low_wmark_show,
},
{
.name = "async_distance_factor",
.flags = CFTYPE_NOT_ON_ROOT,
.seq_show = memory_async_distance_factor_show,
.write = memory_async_distance_factor_write,
},
{
.name = "async_ratio_delta",
.flags = CFTYPE_NOT_ON_ROOT,
.seq_show = memory_async_wmark_delta_show,
.write = memory_async_wmark_delta_write,
},
{
.name = "async_distance_delta",
.flags = CFTYPE_NOT_ON_ROOT,
.seq_show = memory_async_distance_delta_show,
.write = memory_async_distance_delta_write,
},
#endif
/* Write-only trigger files */
{
.name = "sync",
.flags = CFTYPE_NOT_ON_ROOT,
.write = mem_cgroup_sync_write,
},
{
.name = "reclaim",
.flags = CFTYPE_NS_DELEGATABLE,
.write = memory_reclaim,
},
#ifdef CONFIG_EMM_MEMORY_RECLAIM
/* Both emm.* write files share one handler, distinguished by .private */
{
.name = "emm.age",
.private = EMM_AGE,
.flags = CFTYPE_NS_DELEGATABLE,
.write = mem_cgroup_emm_run,
},
{
.name = "emm.reclaim",
.private = EMM_RECLAIM,
.flags = CFTYPE_NS_DELEGATABLE,
.write = mem_cgroup_emm_run,
},
#ifdef CONFIG_LRU_GEN
{
.name = "emm.lru_gen",
.seq_show = memcg_lru_gen_emm_show,
},
#endif
#endif
#ifdef CONFIG_ASYNC_FORK
{
.name = "async_fork",
.read_u64 = mem_cgroup_async_fork_read,
.write_u64 = mem_cgroup_async_fork_write,
},
#endif
#ifdef CONFIG_XFORK
{
.name = "allow_xfork",
.read_u64 = mem_cgroup_xfork_read,
.write_u64 = mem_cgroup_xfork_write,
},
#endif
{ } /* terminate */
};
/*
 * The memory controller's cgroup subsystem descriptor: css lifecycle
 * callbacks, task attach/migration callbacks, and the control file tables
 * for the default (memory_files) and legacy (mem_cgroup_legacy_files)
 * hierarchies.
 */
struct cgroup_subsys memory_cgrp_subsys = {
.css_alloc = mem_cgroup_css_alloc,
.css_online = mem_cgroup_css_online,
.css_offline = mem_cgroup_css_offline,
.css_released = mem_cgroup_css_released,
.css_free = mem_cgroup_css_free,
.css_reset = mem_cgroup_css_reset,
.can_attach = mem_cgroup_can_attach,
.attach = mem_cgroup_attach,
.cancel_attach = mem_cgroup_cancel_attach,
/* charge moving runs from ->post_attach */
.post_attach = mem_cgroup_move_task,
.bind = mem_cgroup_bind,
#ifdef CONFIG_MEM_QOS
.css_priority_change = mem_cgroup_notify_prio_change,
#endif
.dfl_cftypes = memory_files,
.legacy_cftypes = mem_cgroup_legacy_files,
.early_init = 0,
};
/**
* mem_cgroup_protected - check if memory consumption is in the normal range
* @root: the top ancestor of the sub-tree being checked
* @memcg: the memory cgroup to check
*
* WARNING: This function is not stateless! It can only be used as part
* of a top-down tree iteration, not for isolated queries.
*
* Returns one of the following:
* MEMCG_PROT_NONE: cgroup memory is not protected
* MEMCG_PROT_LOW: cgroup memory is protected as long as there is
* an unprotected supply of reclaimable memory from other cgroups.
* MEMCG_PROT_MIN: cgroup memory is protected
*
* @root is exclusive; it is never protected when looked at directly
*
* To provide a proper hierarchical behavior, effective memory.min/low values
* are used. Below is the description of how effective memory.low is calculated.
* Effective memory.min values are calculated in the same way.
*
* Effective memory.low is always equal or less than the original memory.low.
* If there is no memory.low overcommitment (which is always true for
* top-level memory cgroups), these two values are equal.
* Otherwise, it's a part of parent's effective memory.low,
* calculated as a cgroup's memory.low usage divided by sum of sibling's
* memory.low usages, where memory.low usage is the size of actually
* protected memory.
*
* low_usage
* elow = min( memory.low, parent->elow * ------------------ ),
* siblings_low_usage
*
* | memory.current, if memory.current < memory.low
* low_usage = |
* | 0, otherwise.
*
*
* Such definition of the effective memory.low provides the expected
* hierarchical behavior: parent's memory.low value is limiting
* children, unprotected memory is reclaimed first and cgroups,
* which are not using their guarantee do not affect actual memory
* distribution.
*
* For example, if there are memcgs A, A/B, A/C, A/D and A/E:
*
* A A/memory.low = 2G, A/memory.current = 6G
* //\\
* BC DE B/memory.low = 3G B/memory.current = 2G
* C/memory.low = 1G C/memory.current = 2G
* D/memory.low = 0 D/memory.current = 2G
* E/memory.low = 10G E/memory.current = 0
*
* and the memory pressure is applied, the following memory distribution
* is expected (approximately):
*
* A/memory.current = 2G
*
* B/memory.current = 1.3G
* C/memory.current = 0.6G
* D/memory.current = 0
* E/memory.current = 0
*
* These calculations require constant tracking of the actual low usages
* (see propagate_protected_usage()), as well as recursive calculation of
* effective memory.low values. But as we do call mem_cgroup_protected()
* path for each memory cgroup top-down from the reclaim,
* it's possible to optimize this part, and save calculated elow
* for next usage. This part is intentionally racy, but it's ok,
* as memory.low is a best-effort mechanism.
*/
enum mem_cgroup_protection mem_cgroup_protected(struct mem_cgroup *root,
struct mem_cgroup *memcg)
{
struct mem_cgroup *parent;
unsigned long emin, parent_emin;
unsigned long elow, parent_elow;
unsigned long usage;
if (mem_cgroup_disabled())
return MEMCG_PROT_NONE;
/* A NULL @root means the whole hierarchy. */
if (!root)
root = root_mem_cgroup;
/*
* Effective values of the reclaim targets are ignored so they
* can be stale. Have a look at mem_cgroup_protection for more
* details.
* TODO: calculation should be more robust so that we do not need
* that special casing.
*/
if (memcg == root)
return MEMCG_PROT_NONE;
/* An empty cgroup has nothing to protect. */
usage = page_counter_read(&memcg->memory);
if (!usage)
return MEMCG_PROT_NONE;
/* Start from the configured values; they may only be lowered below. */
emin = memcg->memory.min;
elow = memcg->memory.low;
parent = parent_mem_cgroup(memcg);
/* No parent means a non-hierarchical mode on v1 memcg */
if (!parent)
return MEMCG_PROT_NONE;
/* Direct children of @root use their configured values unmodified. */
if (parent == root)
goto exit;
/* Effective min: capped by the parent's emin and by this memcg's share of it. */
parent_emin = READ_ONCE(parent->memory.emin);
emin = min(emin, parent_emin);
if (emin && parent_emin) {
unsigned long min_usage, siblings_min_usage;
min_usage = min(usage, memcg->memory.min);
siblings_min_usage = atomic_long_read(
&parent->memory.children_min_usage);
/* Both factors are checked non-zero, so the division is safe. */
if (min_usage && siblings_min_usage)
emin = min(emin, parent_emin * min_usage /
siblings_min_usage);
}
/* Effective low: same computation with the low values. */
parent_elow = READ_ONCE(parent->memory.elow);
elow = min(elow, parent_elow);
if (elow && parent_elow) {
unsigned long low_usage, siblings_low_usage;
low_usage = min(usage, memcg->memory.low);
siblings_low_usage = atomic_long_read(
&parent->memory.children_low_usage);
if (low_usage && siblings_low_usage)
elow = min(elow, parent_elow * low_usage /
siblings_low_usage);
}
exit:
/* Cache the results; children read them as parent->memory.emin/elow. */
memcg->memory.emin = emin;
memcg->memory.elow = elow;
if (usage <= emin)
return MEMCG_PROT_MIN;
else if (usage <= elow)
return MEMCG_PROT_LOW;
else
return MEMCG_PROT_NONE;
}
/**
* mem_cgroup_charge - charge a newly allocated page to a cgroup
* @page: page to charge
* @mm: mm context of the victim
* @gfp_mask: reclaim mode
*
* Try to charge @page to the memcg that @mm belongs to, reclaiming
* pages according to @gfp_mask if necessary.
*
* Returns 0 on success. Otherwise, an error code is returned.
*/
int mem_cgroup_charge(struct page *page, struct mm_struct *mm, gfp_t gfp_mask)
{
unsigned int nr_pages = thp_nr_pages(page);
struct mem_cgroup *memcg = NULL;
int ret = 0;
if (mem_cgroup_disabled())
goto out;
/* Swap cache pages: prefer the memcg recorded for the swap entry. */
if (PageSwapCache(page)) {
swp_entry_t ent = { .val = page_private(page), };
unsigned short id;
/*
* Every swap fault against a single page tries to charge the
* page, bail as early as possible. shmem_unuse() encounters
* already charged pages, too. page->mem_cgroup is protected
* by the page lock, which serializes swap cache removal, which
* in turn serializes uncharging.
*/
VM_BUG_ON_PAGE(!PageLocked(page), page);
if (compound_head(page)->mem_cgroup)
goto out;
id = lookup_swap_cgroup_id(ent);
rcu_read_lock();
memcg = mem_cgroup_from_id(id);
/* Fall back to the mm's memcg below if no online reference can be taken. */
if (memcg && !css_tryget_online(&memcg->css))
memcg = NULL;
rcu_read_unlock();
}
if (!memcg)
memcg = get_mem_cgroup_from_mm(mm);
ret = try_charge(memcg, gfp_mask, nr_pages);
if (ret)
goto out_put;
/* Extra css reference taken just before the page is committed to memcg. */
css_get(&memcg->css);
commit_charge(page, memcg);
/* Statistics and event checks run with interrupts disabled. */
local_irq_disable();
mem_cgroup_charge_statistics(memcg, nr_pages);
memcg_check_events(memcg, page);
local_irq_enable();
/*
* Cgroup1's unified memory+swap counter has been charged with the
* new swapcache page, finish the transfer by uncharging the swap
* slot. The swap slot would also get uncharged when it dies, but
* it can stick around indefinitely and we'd count the page twice
* the entire time.
*
* Cgroup2 has separate resource counters for memory and swap,
* so this is a non-issue here. Memory and swap charge lifetimes
* correspond 1:1 to page and swap slot lifetimes: we charge the
* page to memory here, and uncharge swap when the slot is freed.
*/
if (do_memsw_account() && PageSwapCache(page)) {
swp_entry_t entry = { .val = page_private(page) };
/*
* The swap entry might not get freed for a long time,
* let's not wait for it. The page already received a
* memory+swap charge, drop the swap entry duplicate.
*/
mem_cgroup_uncharge_swap(entry, nr_pages);
}
out_put:
/* Drop the lookup reference; reached on both success and try_charge() failure. */
css_put(&memcg->css);
out:
return ret;
}
/*
 * Accumulator used by uncharge_page()/uncharge_batch() to collect the
 * uncharges of consecutive pages that belong to the same memcg.
 */
struct uncharge_gather {
struct mem_cgroup *memcg; /* memcg the gathered pages belong to, NULL if none yet */
unsigned long nr_pages; /* total pages gathered (kmem pages included) */
unsigned long pgpgout; /* number of non-kmem pages gathered, reported as PGPGOUT */
unsigned long nr_kmem; /* pages gathered from PageKmemcg pages */
struct page *dummy_page; /* last gathered page, passed to memcg_check_events() */
};
static inline void uncharge_gather_clear(struct uncharge_gather *ug)
{
memset(ug, 0, sizeof(*ug));
}
static void uncharge_batch(const struct uncharge_gather *ug)
{
unsigned long flags;
if (!mem_cgroup_is_root(ug->memcg)) {
page_counter_uncharge(&ug->memcg->memory, ug->nr_pages);
if (do_memsw_account())
page_counter_uncharge(&ug->memcg->memsw, ug->nr_pages);
if (!cgroup_subsys_on_dfl(memory_cgrp_subsys) && ug->nr_kmem)
page_counter_uncharge(&ug->memcg->kmem, ug->nr_kmem);
memcg_oom_recover(ug->memcg);
}
local_irq_save(flags);
__count_memcg_events(ug->memcg, PGPGOUT, ug->pgpgout);
__this_cpu_add(ug->memcg->vmstats_percpu->nr_page_events, ug->nr_pages);
memcg_check_events(ug->memcg, ug->dummy_page);
local_irq_restore(flags);
/* drop reference from uncharge_page */
css_put(&ug->memcg->css);
}
/*
 * Add @page's charge to the batch in @ug and detach the page from its
 * memcg. If @page belongs to a different memcg than the current batch,
 * the current batch is flushed first and a new one is started.
 * Pages without a memcg are ignored.
 */
static void uncharge_page(struct page *page, struct uncharge_gather *ug)
{
unsigned long nr_pages;
VM_BUG_ON_PAGE(PageLRU(page), page);
if (!page->mem_cgroup)
return;
/*
* Nobody should be changing or seriously looking at
* page->mem_cgroup at this point, we have fully
* exclusive access to the page.
*/
if (ug->memcg != page->mem_cgroup) {
if (ug->memcg) {
uncharge_batch(ug);
uncharge_gather_clear(ug);
}
ug->memcg = page->mem_cgroup;
/* pairs with css_put in uncharge_batch */
css_get(&ug->memcg->css);
}
nr_pages = compound_nr(page);
ug->nr_pages += nr_pages;
/* Non-kmem pages count one pgpgout each; kmem pages are tallied in pages. */
if (!PageKmemcg(page)) {
ug->pgpgout++;
} else {
ug->nr_kmem += nr_pages;
__ClearPageKmemcg(page);
}
ug->dummy_page = page;
page->mem_cgroup = NULL;
/*
 * Separate from the batch reference taken above.
 * NOTE(review): presumably this drops the reference held on behalf of
 * the page's charge - confirm against commit_charge() callers.
 */
css_put(&ug->memcg->css);
}
/*
 * Uncharge every page linked on @page_list through page->lru, batching
 * consecutive pages of the same memcg, and flush the final batch.
 */
static void uncharge_list(struct list_head *page_list)
{
struct uncharge_gather ug;
struct list_head *next;
uncharge_gather_clear(&ug);
/*
* Note that the list can be a single page->lru; hence the
* do-while loop instead of a simple list_for_each_entry().
*/
next = page_list->next;
do {
struct page *page;
page = list_entry(next, struct page, lru);
/* Fetch the successor before the page is handed to uncharge_page(). */
next = page->lru.next;
uncharge_page(page, &ug);
} while (next != page_list);
/* ug.memcg stays NULL when no page on the list had a memcg. */
if (ug.memcg)
uncharge_batch(&ug);
}
/**
* mem_cgroup_uncharge - uncharge a page
* @page: page to uncharge
*
* Uncharge a page previously charged with mem_cgroup_charge().
*/
void mem_cgroup_uncharge(struct page *page)
{
	struct uncharge_gather ug;

	/*
	 * Nothing to do when the controller is disabled. The
	 * page->mem_cgroup pre-check also keeps us from touching the
	 * page->lru of any random page.
	 */
	if (mem_cgroup_disabled() || !page->mem_cgroup)
		return;

	/* A one-page batch: gather the page, then flush immediately. */
	uncharge_gather_clear(&ug);
	uncharge_page(page, &ug);
	uncharge_batch(&ug);
}
/**
* mem_cgroup_uncharge_list - uncharge a list of pages
* @page_list: list of pages to uncharge
*
* Uncharge a list of pages previously charged with
* mem_cgroup_charge().
*/
void mem_cgroup_uncharge_list(struct list_head *page_list)
{
	/* Skip the work when the controller is off or the list is empty. */
	if (mem_cgroup_disabled() || list_empty(page_list))
		return;

	uncharge_list(page_list);
}
/**
* mem_cgroup_migrate - charge a page's replacement
* @oldpage: currently circulating page
* @newpage: replacement page
*
* Charge @newpage as a replacement page for @oldpage. @oldpage will
* be uncharged upon free.
*
* Both pages must be locked, @newpage->mapping must be set up.
*/
void mem_cgroup_migrate(struct page *oldpage, struct page *newpage)
{
struct mem_cgroup *memcg;
unsigned int nr_pages;
unsigned long flags;
/* Both pages must be locked and agree on anon/THP-ness. */
VM_BUG_ON_PAGE(!PageLocked(oldpage), oldpage);
VM_BUG_ON_PAGE(!PageLocked(newpage), newpage);
VM_BUG_ON_PAGE(PageAnon(oldpage) != PageAnon(newpage), newpage);
VM_BUG_ON_PAGE(PageTransHuge(oldpage) != PageTransHuge(newpage),
newpage);
if (mem_cgroup_disabled())
return;
/* Page cache replacement: new page already charged? */
if (newpage->mem_cgroup)
return;
/* An uncharged old page is unexpected (warn once) and leaves nothing to transfer. */
memcg = page_memcg(oldpage);
VM_WARN_ON_ONCE_PAGE(!memcg, oldpage);
if (!memcg)
return;
/* Force-charge the new page. The old one will be freed soon */
nr_pages = thp_nr_pages(newpage);
page_counter_charge(&memcg->memory, nr_pages);
if (do_memsw_account())
page_counter_charge(&memcg->memsw, nr_pages);
/* Take a css reference before binding the new page to memcg. */
css_get(&memcg->css);
commit_charge(newpage, memcg);
local_irq_save(flags);
mem_cgroup_charge_statistics(memcg, nr_pages);
memcg_check_events(memcg, newpage);
local_irq_restore(flags);
}
/* Static key for memcg socket accounting; defined initially false and exported. */
DEFINE_STATIC_KEY_FALSE(memcg_sockets_enabled_key);
EXPORT_SYMBOL(memcg_sockets_enabled_key);
/*
 * Associate @sk with the current task's memcg by setting sk->sk_memcg and
 * holding a css reference. Nothing is done when socket accounting is
 * disabled, in interrupt context, for the root memcg, on the legacy
 * hierarchy without tcpmem_active, or when the memcg is not online.
 */
void mem_cgroup_sk_alloc(struct sock *sk)
{
	struct mem_cgroup *memcg;

	if (!mem_cgroup_sockets_enabled)
		return;

	/* Do not associate the sock with unrelated interrupted task's memcg. */
	if (in_interrupt())
		return;

	rcu_read_lock();
	memcg = mem_cgroup_from_task(current);
	if (memcg != root_mem_cgroup &&
	    (cgroup_subsys_on_dfl(memory_cgrp_subsys) ||
	     memcg->tcpmem_active) &&
	    css_tryget_online(&memcg->css))
		sk->sk_memcg = memcg;
	rcu_read_unlock();
}
void mem_cgroup_sk_free(struct sock *sk)
{
if (sk->sk_memcg)
css_put(&sk->sk_memcg->css);
}
/**
* mem_cgroup_charge_skmem - charge socket memory
* @memcg: memcg to charge
* @nr_pages: number of pages to charge
*
* Charges @nr_pages to @memcg. Returns %true if the charge fit within
* @memcg's configured limit, %false if the charge had to be forced.
*/
bool mem_cgroup_charge_skmem(struct mem_cgroup *memcg, unsigned int nr_pages)
{
	gfp_t gfp_mask;

	/* Legacy hierarchy: socket memory has its own tcpmem counter. */
	if (!cgroup_subsys_on_dfl(memory_cgrp_subsys)) {
		struct page_counter *fail;
		bool fits;

		fits = page_counter_try_charge(&memcg->tcpmem, nr_pages, &fail);
		/* Over the limit: force the charge and flag pressure. */
		if (!fits)
			page_counter_charge(&memcg->tcpmem, nr_pages);
		memcg->tcpmem_pressure = fits ? 0 : 1;

		return fits;
	}

	/* Don't block in the packet receive path */
	gfp_mask = in_softirq() ? GFP_NOWAIT : GFP_KERNEL;

	mod_memcg_state(memcg, MEMCG_SOCK, nr_pages);

	if (!try_charge(memcg, gfp_mask, nr_pages))
		return true;

	/* The regular charge failed: force it and report that to the caller. */
	try_charge(memcg, gfp_mask | __GFP_NOFAIL, nr_pages);
	return false;
}
/**
* mem_cgroup_uncharge_skmem - uncharge socket memory
* @memcg: memcg to uncharge
* @nr_pages: number of pages to uncharge
*/
void mem_cgroup_uncharge_skmem(struct mem_cgroup *memcg, unsigned int nr_pages)
{
	if (cgroup_subsys_on_dfl(memory_cgrp_subsys)) {
		/* Default hierarchy: undo the MEMCG_SOCK stat and refill the stock. */
		mod_memcg_state(memcg, MEMCG_SOCK, -nr_pages);
		refill_stock(memcg, nr_pages);
	} else {
		/* Legacy hierarchy: give the pages back to the tcpmem counter. */
		page_counter_uncharge(&memcg->tcpmem, nr_pages);
	}
}
/*
 * Parse the "cgroup.memory=" boot parameter: a comma-separated list in
 * which "nosocket", "nokmem" and "kmem" are recognised. Empty and unknown
 * tokens are ignored. Always returns 1.
 */
static int __init cgroup_memory(char *s)
{
	char *token;

	for (token = strsep(&s, ","); token != NULL; token = strsep(&s, ",")) {
		if (!*token)
			continue;

		if (!strcmp(token, "nosocket"))
			cgroup_memory_nosocket = true;
		else if (!strcmp(token, "nokmem"))
			cgroup_memory_nokmem = true;
		else if (!strcmp(token, "kmem"))
			cgroup_memory_nokmem = false;
	}

	return 1;
}
__setup("cgroup.memory=", cgroup_memory);
/*
 * Span tuning constants: a default percentage plus upper (10 GiB) and
 * lower (2 GiB) byte bounds.
 * NOTE(review): no user of these macros is visible in this part of the
 * file - confirm what "span" refers to before relying on this comment.
 */
#define DEFAULT_SPAN_PERCENT 10
#define MAX_SPAN_SIZE (10ull * SZ_1G)
#define MIN_SPAN_SIZE (2ull * SZ_1G)
/*
 * subsys_initcall() for memory controller.
 *
 * Some parts like memcg_hotplug_cpu_dead() have to be initialized from this
 * context because of lock dependencies (cgroup_lock -> cpu hotplug) but
 * basically everything that doesn't depend on a specific mem_cgroup structure
 * should be initialized from here.
 *
 * Sets up, in order: the async reclaim workqueue (CONFIG_MEM_QOS), the CPU
 * hotplug "dead" callback, the per-cpu stock drain work items, the per-node
 * soft limit trees and, for CONFIG_MEM_QOS, the priority lists and the global
 * reclaim list.
 *
 * Returns 0 on success or -ENOMEM if the workqueue cannot be allocated.
 */
static int __init mem_cgroup_init(void)
{
	int cpu, node;

#ifdef CONFIG_MEM_QOS
	memcg_async_reclaim_wq = alloc_workqueue("memcg_async_reclaim",
				WQ_MEM_RECLAIM | WQ_UNBOUND | WQ_FREEZABLE,
				WQ_UNBOUND_MAX_ACTIVE);
	if (!memcg_async_reclaim_wq)
		return -ENOMEM;
#endif

	cpuhp_setup_state_nocalls(CPUHP_MM_MEMCQ_DEAD, "mm/memctrl:dead", NULL,
				  memcg_hotplug_cpu_dead);

	for_each_possible_cpu(cpu)
		INIT_WORK(&per_cpu_ptr(&memcg_stock, cpu)->work,
			  drain_local_stock);

	for_each_node(node) {
		struct mem_cgroup_tree_per_node *rtpn;

		rtpn = kzalloc_node(sizeof(*rtpn), GFP_KERNEL,
				    node_online(node) ? node : NUMA_NO_NODE);
		/*
		 * Do not dereference a failed allocation. The node's slot in
		 * soft_limit_tree is simply left unset and the remaining
		 * initialisation still runs.
		 * NOTE(review): assumes the soft limit tree users tolerate a
		 * NULL per-node tree - confirm.
		 */
		if (!rtpn) {
			printk(KERN_WARNING
			       "memcg: failed to allocate soft limit tree for node %d\n",
			       node);
			continue;
		}

		rtpn->rb_root = RB_ROOT;
		rtpn->rb_rightmost = NULL;
		spin_lock_init(&rtpn->lock);
		soft_limit_tree.rb_tree_per_node[node] = rtpn;
	}

#ifdef CONFIG_MEM_QOS
	{
		int i;

		memcg_prio_reclaimd_run();
		for (i = 0; i < CGROUP_PRIORITY_MAX; i++) {
			INIT_LIST_HEAD(&memcg_prios[i].head);
			spin_lock_init(&memcg_prios[i].lock);
		}
		INIT_LIST_HEAD(&memcg_global_reclaim_list.list);
		mutex_init(&memcg_global_reclaim_list.mutex);
	}
#endif

	return 0;
}
subsys_initcall(mem_cgroup_init);
#ifdef CONFIG_MEMCG_SWAP
/*
 * Take a reference on the ID of @memcg or, if its ID refcount has already
 * dropped to zero, on the ID of the closest ancestor for which the
 * reference can still be taken. Returns the memcg whose ID was pinned.
 */
static struct mem_cgroup *mem_cgroup_id_get_online(struct mem_cgroup *memcg)
{
while (!refcount_inc_not_zero(&memcg->id.ref)) {
/*
* The root cgroup cannot be destroyed, so its refcount must
* always be >= 1.
*/
if (WARN_ON_ONCE(memcg == root_mem_cgroup)) {
VM_BUG_ON(1);
break;
}
/* Walk up; a memcg without a parent falls back to the root memcg. */
memcg = parent_mem_cgroup(memcg);
if (!memcg)
memcg = root_mem_cgroup;
}
return memcg;
}
/**
* mem_cgroup_swapout - transfer a memsw charge to swap
* @page: page whose memsw charge to transfer
* @entry: swap entry to move the charge to
*
* Transfer the memsw charge of @page to @entry.
*/
void mem_cgroup_swapout(struct page *page, swp_entry_t entry)
{
struct mem_cgroup *memcg, *swap_memcg;
unsigned int nr_entries;
unsigned short oldid;
VM_BUG_ON_PAGE(PageLRU(page), page);
VM_BUG_ON_PAGE(page_count(page), page);
if (mem_cgroup_disabled())
return;
/* Only the legacy hierarchy transfers charges here; the default hierarchy returns early. */
if (cgroup_subsys_on_dfl(memory_cgrp_subsys))
return;
memcg = page->mem_cgroup;
VM_WARN_ON_ONCE_PAGE(!memcg, page);
if (!memcg)
return;
/*
* In case the memcg owning these pages has been offlined and doesn't
* have an ID allocated to it anymore, charge the closest online
* ancestor for the swap instead and transfer the memory+swap charge.
*/
swap_memcg = mem_cgroup_id_get_online(memcg);
nr_entries = thp_nr_pages(page);
/* Get references for the tail pages, too */
if (nr_entries > 1)
mem_cgroup_id_get_many(swap_memcg, nr_entries - 1);
/* Record the owner ID for the swap entries; no previous owner is expected. */
oldid = swap_cgroup_record(entry, mem_cgroup_id(swap_memcg),
nr_entries);
VM_BUG_ON_PAGE(oldid, page);
mod_memcg_state(swap_memcg, MEMCG_SWAP, nr_entries);
page->mem_cgroup = NULL;
if (!mem_cgroup_is_root(memcg))
page_counter_uncharge(&memcg->memory, nr_entries);
/* Move the memsw charge only when an ancestor took over the swap entries. */
if (!cgroup_memory_noswap && memcg != swap_memcg) {
if (!mem_cgroup_is_root(swap_memcg))
page_counter_charge(&swap_memcg->memsw, nr_entries);
page_counter_uncharge(&memcg->memsw, nr_entries);
}
/*
* Interrupts should be disabled here because the caller holds the
* i_pages lock which is taken with interrupts-off. It is
* important here to have the interrupts disabled because it is the
* only synchronisation we have for updating the per-CPU variables.
*/
VM_BUG_ON(!irqs_disabled());
mem_cgroup_charge_statistics(memcg, -nr_entries);
memcg_check_events(memcg, page);
/* The page no longer points at memcg, so drop a css reference on it. */
css_put(&memcg->css);
}
/**
* mem_cgroup_try_charge_swap - try charging swap space for a page
* @page: page being added to swap
* @entry: swap entry to charge
*
* Try to charge @page's memcg for the swap space at @entry.
*
* Returns 0 on success, -ENOMEM on failure.
*/
int mem_cgroup_try_charge_swap(struct page *page, swp_entry_t entry)
{
unsigned int nr_pages = thp_nr_pages(page);
struct page_counter *counter;
struct mem_cgroup *memcg;
unsigned short oldid;
if (mem_cgroup_disabled())
return 0;
/* Swap is only charged separately on the default hierarchy. */
if (!cgroup_subsys_on_dfl(memory_cgrp_subsys))
return 0;
memcg = page->mem_cgroup;
VM_WARN_ON_ONCE_PAGE(!memcg, page);
if (!memcg)
return 0;
/* A zero entry is counted as a swap failure event but still returns 0. */
if (!entry.val) {
memcg_memory_event(memcg, MEMCG_SWAP_FAIL);
return 0;
}
/* From here on memcg is whichever memcg mem_cgroup_id_get_online() pinned. */
memcg = mem_cgroup_id_get_online(memcg);
if (!cgroup_memory_noswap && !mem_cgroup_is_root(memcg) &&
!page_counter_try_charge(&memcg->swap, nr_pages, &counter)) {
memcg_memory_event(memcg, MEMCG_SWAP_MAX);
memcg_memory_event(memcg, MEMCG_SWAP_FAIL);
/* Undo the ID reference taken above before failing. */
mem_cgroup_id_put(memcg);
return -ENOMEM;
}
/* Get references for the tail pages, too */
if (nr_pages > 1)
mem_cgroup_id_get_many(memcg, nr_pages - 1);
oldid = swap_cgroup_record(entry, mem_cgroup_id(memcg), nr_pages);
VM_BUG_ON_PAGE(oldid, page);
mod_memcg_state(memcg, MEMCG_SWAP, nr_pages);
return 0;
}
/**
* mem_cgroup_uncharge_swap - uncharge swap space
* @entry: swap entry to uncharge
* @nr_pages: the amount of swap space to uncharge
*/
void mem_cgroup_uncharge_swap(swp_entry_t entry, unsigned int nr_pages)
{
	struct mem_cgroup *memcg;
	unsigned short id;

	if (mem_cgroup_disabled())
		return;

	/* Clear the recorded owner and remember who it was. */
	id = swap_cgroup_record(entry, 0, nr_pages);

	rcu_read_lock();
	memcg = mem_cgroup_from_id(id);
	if (!memcg)
		goto out_unlock;

	if (!cgroup_memory_noswap && !mem_cgroup_is_root(memcg)) {
		struct page_counter *counter;

		/* swap counter on the default hierarchy, memsw on legacy */
		if (cgroup_subsys_on_dfl(memory_cgrp_subsys))
			counter = &memcg->swap;
		else
			counter = &memcg->memsw;
		page_counter_uncharge(counter, nr_pages);
	}
	mod_memcg_state(memcg, MEMCG_SWAP, -nr_pages);
	mem_cgroup_id_put_many(memcg, nr_pages);

out_unlock:
	rcu_read_unlock();
}
/*
 * Return the number of swap pages available to @memcg: the global figure,
 * further limited (on the default hierarchy with swap accounting) by the
 * remaining swap headroom of @memcg and each ancestor below the root.
 */
long mem_cgroup_get_nr_swap_pages(struct mem_cgroup *memcg)
{
	long nr_swap_pages = get_nr_swap_pages();

	if (cgroup_memory_noswap || !cgroup_subsys_on_dfl(memory_cgrp_subsys))
		return nr_swap_pages;

	while (memcg != root_mem_cgroup) {
		nr_swap_pages = min_t(long, nr_swap_pages,
				      READ_ONCE(memcg->swap.max) -
				      page_counter_read(&memcg->swap));
		memcg = parent_mem_cgroup(memcg);
	}

	return nr_swap_pages;
}
/*
 * Report whether swap should be treated as full for @page, which must be
 * locked. True when vm_swap_full() says so globally, or when the page's
 * memcg or any ancestor below the root uses at least half of its swap.max.
 */
bool mem_cgroup_swap_full(struct page *page)
{
	struct mem_cgroup *iter;

	VM_BUG_ON_PAGE(!PageLocked(page), page);

	if (vm_swap_full())
		return true;
	if (cgroup_memory_noswap || !cgroup_subsys_on_dfl(memory_cgrp_subsys))
		return false;

	iter = page->mem_cgroup;
	if (!iter)
		return false;

	while (iter != root_mem_cgroup) {
		if (page_counter_read(&iter->swap) * 2 >=
		    READ_ONCE(iter->swap.max))
			return true;
		iter = parent_mem_cgroup(iter);
	}

	return false;
}
/*
 * Parse the "swapaccount=" boot parameter: "1" clears
 * cgroup_memory_noswap, "0" sets it, and any other value leaves it
 * untouched. Always returns 1.
 */
static int __init setup_swap_account(char *s)
{
	if (strcmp(s, "1") == 0)
		cgroup_memory_noswap = 0;
	else if (strcmp(s, "0") == 0)
		cgroup_memory_noswap = 1;

	return 1;
}
__setup("swapaccount=", setup_swap_account);
static u64 swap_current_read(struct cgroup_subsys_state *css,
struct cftype *cft)
{
struct mem_cgroup *memcg = mem_cgroup_from_css(css);
return (u64)page_counter_read(&memcg->swap) * PAGE_SIZE;
}
static int swap_max_show(struct seq_file *m, void *v)
{
return seq_puts_memcg_tunable(m,
READ_ONCE(mem_cgroup_from_seq(m)->swap.max));
}
/*
 * write handler for "swap.max": parse the stripped input with
 * page_counter_memparse() (which is told that "max" is the keyword for
 * no limit) and store the result in swap.max.
 *
 * Returns @nbytes on success or the parse error.
 */
static ssize_t swap_max_write(struct kernfs_open_file *of,
			      char *buf, size_t nbytes, loff_t off)
{
	struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
	unsigned long new_max;
	int ret;

	ret = page_counter_memparse(strstrip(buf), "max", &new_max);
	if (ret)
		return ret;

	xchg(&memcg->swap.max, new_max);

	return nbytes;
}
/*
 * seq_show handler for "swap.events": prints the MEMCG_SWAP_MAX and
 * MEMCG_SWAP_FAIL event counters of the memcg, one per line.
 */
static int swap_events_show(struct seq_file *m, void *v)
{
	struct mem_cgroup *memcg = mem_cgroup_from_seq(m);
	long nr_events;

	nr_events = atomic_long_read(&memcg->memory_events[MEMCG_SWAP_MAX]);
	seq_printf(m, "max %lu\n", nr_events);

	nr_events = atomic_long_read(&memcg->memory_events[MEMCG_SWAP_FAIL]);
	seq_printf(m, "fail %lu\n", nr_events);

	return 0;
}
/*
 * Swap control files for the default hierarchy; registered by
 * mem_cgroup_swap_init() via cgroup_add_dfl_cftypes(). Every entry carries
 * CFTYPE_NOT_ON_ROOT.
 */
static struct cftype swap_files[] = {
	{
		/* current swap usage in bytes */
		.name = "swap.current",
		.flags = CFTYPE_NOT_ON_ROOT,
		.read_u64 = swap_current_read,
	},
	{
		/* swap limit; readable and writable */
		.name = "swap.max",
		.flags = CFTYPE_NOT_ON_ROOT,
		.seq_show = swap_max_show,
		.write = swap_max_write,
	},
	{
		/*
		 * "max"/"fail" event counters.
		 * NOTE(review): file_offset presumably lets the cgroup core
		 * store the file handle in memcg->swap_events_file for
		 * change notification -- confirm.
		 */
		.name = "swap.events",
		.flags = CFTYPE_NOT_ON_ROOT,
		.file_offset = offsetof(struct mem_cgroup, swap_events_file),
		.seq_show = swap_events_show,
	},
	{ }	/* terminate */
};
/*
 * memory+swap ("memsw") control files for the legacy hierarchy; registered
 * by mem_cgroup_swap_init() via cgroup_add_legacy_cftypes(). All entries
 * share the generic mem_cgroup_read_u64/mem_cgroup_write/mem_cgroup_reset
 * handlers and select the counter and attribute through
 * MEMFILE_PRIVATE(_MEMSWAP, RES_*).
 */
static struct cftype memsw_files[] = {
	{
		.name = "memsw.usage_in_bytes",
		.private = MEMFILE_PRIVATE(_MEMSWAP, RES_USAGE),
		.read_u64 = mem_cgroup_read_u64,
	},
	{
		/* writes go to mem_cgroup_reset */
		.name = "memsw.max_usage_in_bytes",
		.private = MEMFILE_PRIVATE(_MEMSWAP, RES_MAX_USAGE),
		.write = mem_cgroup_reset,
		.read_u64 = mem_cgroup_read_u64,
	},
	{
		.name = "memsw.limit_in_bytes",
		.private = MEMFILE_PRIVATE(_MEMSWAP, RES_LIMIT),
		.write = mem_cgroup_write,
		.read_u64 = mem_cgroup_read_u64,
	},
	{
		/* writes go to mem_cgroup_reset */
		.name = "memsw.failcnt",
		.private = MEMFILE_PRIVATE(_MEMSWAP, RES_FAILCNT),
		.write = mem_cgroup_reset,
		.read_u64 = mem_cgroup_read_u64,
	},
	{ },	/* terminate */
};
/*
 * This has to stay a core_initcall(). Were it a subsys_initcall(),
 * cgroup_memory_noswap could still be false while memcg is disabled through
 * the "cgroup_disable=memory" boot parameter, which may lead to a premature
 * OOPS inside mem_cgroup_get_nr_swap_pages() in corner cases.
 *
 * Registers the swap control files on both hierarchies unless swap
 * accounting is (or has just been forced) off. Always returns 0.
 */
static int __init mem_cgroup_swap_init(void)
{
	/* No memory control -> no swap control */
	if (mem_cgroup_disabled())
		cgroup_memory_noswap = true;

	if (!cgroup_memory_noswap) {
		WARN_ON(cgroup_add_dfl_cftypes(&memory_cgrp_subsys, swap_files));
		WARN_ON(cgroup_add_legacy_cftypes(&memory_cgrp_subsys, memsw_files));
	}

	return 0;
}
core_initcall(mem_cgroup_swap_init);
#endif /* CONFIG_MEMCG_SWAP */
#ifdef CONFIG_MEM_QOS
#define RMEM_UPDATE_FREQ 10

/*
 * Recompute the rmem watermarks for a reserve of @rmem_size, which is
 * compared against total RAM expressed in bytes.
 *
 * setpoint is total RAM minus the reserve. When the setpoint is at least
 * as large as the reserve, freerun sits one reserve below the setpoint and
 * limit is total RAM. Otherwise freerun is 0 and limit is twice the
 * setpoint. Nothing changes if the reserve exceeds total RAM or the
 * setpoint is already up to date.
 */
static void memcg_rmem_update_wmark(unsigned long rmem_size)
{
	unsigned long total_bytes = totalram_pages() << PAGE_SHIFT;
	unsigned long target;

	/* XXX fix error case */
	if (rmem_size > total_bytes)
		return;

	target = total_bytes - rmem_size;
	if (target == rmem_wmark_setpoint)
		return;

	rmem_wmark_setpoint = target;

	if (target < rmem_size) {
		rmem_wmark_freerun = 0;
		rmem_wmark_limit = target + target;
		return;
	}

	rmem_wmark_freerun = target - rmem_size;
	rmem_wmark_limit = total_bytes;
}
/* Refresh the rmem watermarks from the current prio_reclaim_bytes value. */
static void memcg_rmem_wmark_adjust(void)
{
	memcg_rmem_update_wmark(prio_reclaim_bytes);
}
/*
 *                           usage - setpoint  3
 *        f(usage) := 1.0 + (----------------)
 *                           limit - setpoint
 *
 * It is a 3rd order polynomial that is subject to
 *
 * (1) f(limit)    = 2.0
 * (2) f(setpoint) = 1.0
 * (3) f(freerun)  = 0
 */
#define POS_RATIO_SETPOINT_VAL 1024l
#define POS_RATIO_WARN_OFFSET 16l
#define RATELIMIT_CALC_SHIFT 10

/*
 * Fixed-point evaluation of 1 + ((usage - setpoint) / (limit - setpoint))^3
 * with RATELIMIT_CALC_SHIFT fractional bits. The divisor is OR-ed with 1 so
 * it can never be zero, and the result is clamped to
 * [0, 2 << RATELIMIT_CALC_SHIFT].
 */
static long long pos_ratio_polynom(unsigned long setpoint,
				   unsigned long usage,
				   unsigned long limit)
{
	long long ratio;
	long frac;
	int i;

	frac = div64_s64(((s64)usage - (s64)setpoint) << RATELIMIT_CALC_SHIFT,
			 (limit - setpoint) | 1);

	/* raise frac to the third power, rescaling after each multiply */
	ratio = frac;
	for (i = 0; i < 2; i++)
		ratio = (ratio * frac) >> RATELIMIT_CALC_SHIFT;

	ratio += 1 << RATELIMIT_CALC_SHIFT;

	return clamp(ratio, 0LL, 2LL << RATELIMIT_CALC_SHIFT);
}
/*
 * Update memcg_pos_ratio from current memory usage (total RAM minus free
 * pages, in bytes): 0 while usage is at or below the freerun watermark,
 * otherwise the polynomial value for usage capped at the limit watermark.
 */
static void memcg_rmem_calc_pos_ratio(void)
{
	unsigned long used_bytes;

	used_bytes = (totalram_pages() - global_zone_page_state(NR_FREE_PAGES))
			<< PAGE_SHIFT;

	if (used_bytes <= rmem_wmark_freerun) {
		memcg_pos_ratio = 0;
		return;
	}

	if (used_bytes > rmem_wmark_limit)
		used_bytes = rmem_wmark_limit;

	memcg_pos_ratio = pos_ratio_polynom(rmem_wmark_setpoint,
					    used_bytes,
					    rmem_wmark_limit);
}
/*
 * Step memcg_cur_reclaim_prio downwards until it reaches a priority whose
 * memcg_prios[] count is non-zero, or until it is no longer above
 * sysctl_vm_qos_highest_reclaim_prio.
 */
static void memcg_expand_reclaim_prio(void)
{
	while (memcg_cur_reclaim_prio > sysctl_vm_qos_highest_reclaim_prio) {
		memcg_cur_reclaim_prio--;

		if (atomic_long_read(&memcg_prios[memcg_cur_reclaim_prio].count) != 0)
			return;
	}
}
/*
 * Step memcg_cur_reclaim_prio upwards until it reaches a priority whose
 * memcg_prios[] count is non-zero, or until it hits
 * CGROUP_PRIORITY_MAX - 1.
 */
static void memcg_shrink_reclaim_prio(void)
{
	while (memcg_cur_reclaim_prio < CGROUP_PRIORITY_MAX - 1) {
		memcg_cur_reclaim_prio++;

		if (atomic_long_read(&memcg_prios[memcg_cur_reclaim_prio].count) != 0)
			return;
	}
}
/*
 * Move the current reclaim priority according to memcg_pos_ratio, under
 * memcg_reclaim_prio_lock. The priority is expanded when the ratio is above
 * the setpoint band and has risen since the previous call, and shrunk when
 * it is below the band and has fallen. The ratio seen is remembered for the
 * next call.
 */
static void memcg_update_reclaim_prio(void)
{
	static long last_pos_ratio;
	const long band_high = POS_RATIO_SETPOINT_VAL + POS_RATIO_WARN_OFFSET;
	const long band_low = POS_RATIO_SETPOINT_VAL - POS_RATIO_WARN_OFFSET;

	spin_lock(&memcg_reclaim_prio_lock);

	if (memcg_pos_ratio > band_high && memcg_pos_ratio > last_pos_ratio)
		memcg_expand_reclaim_prio();
	else if (memcg_pos_ratio < band_low && memcg_pos_ratio < last_pos_ratio)
		memcg_shrink_reclaim_prio();

	last_pos_ratio = memcg_pos_ratio;

	spin_unlock(&memcg_reclaim_prio_lock);
}
/* Upper bound on the number of extra passes memcg_prio_reclaim_async() makes. */
static int max_retry_times = 5;

/*
 * One round of asynchronous priority reclaim.
 *
 * Walks memcg_global_reclaim_list and reclaims up to memcg_reclaim_goal
 * pages from every memcg whose priority is not below
 * memcg_cur_reclaim_prio, until the global reclaimed counter exceeds the
 * goal.
 *
 * The return value is the time, in jiffies, the calling kthread passes to
 * schedule_timeout():
 *   HZ      - the goal was already met on entry, or every memcg that was
 *             tried reclaimed nothing (after a congestion_wait())
 *   HZ / 2  - the goal was reached during this round
 *   HZ / 10 - the retries were exhausted without reaching the goal
 *
 * NOTE(review): try_to_free_mem_cgroup_pages() is called with GFP_KERNEL
 * while rcu_read_lock() is held. If that call can sleep, it is not allowed
 * inside an RCU read-side critical section -- confirm and, if so, pin the
 * memcg and drop the RCU lock around the reclaim.
 */
static long memcg_prio_reclaim_async(void)
{
	struct mem_cgroup *memcg;
	int prio;
	bool reclaim_succeed = false;
	int nr_reclaimed;
	int retry_times = 0;
	int nr_reclaim_memcg, zero_reclaim_memcg;

	/* Nothing to do while the reclaimed count already covers the goal. */
	if (atomic_long_read(&memcg_reclaimed_count) >= memcg_reclaim_goal)
		return HZ;
retry:
	retry_times++;
	/* Per-pass statistics: memcgs tried, and memcgs that yielded nothing. */
	nr_reclaim_memcg = 0;
	zero_reclaim_memcg = 0;
	rcu_read_lock();
	list_for_each_entry_rcu(memcg, &memcg_global_reclaim_list.list, prio_list_async) {
		prio = memcg_get_prio(memcg);
		/* Skip memcgs whose priority is below the current reclaim level. */
		if (prio < memcg_cur_reclaim_prio)
			continue;
		if (memcg_reclaim_goal > 0) {
			nr_reclaim_memcg++;
			nr_reclaimed = try_to_free_mem_cgroup_pages(memcg, memcg_reclaim_goal, GFP_KERNEL, true);
			/*
			 * A false return means this memcg has failed
			 * repeatedly; abandon the pass. The memcg is then
			 * not counted in zero_reclaim_memcg.
			 */
			if (!mem_cgroup_notify_reclaim(memcg, nr_reclaimed))
				break;
			if (nr_reclaimed == 0)
				zero_reclaim_memcg++;
			if (atomic_long_read(&memcg_reclaimed_count) > memcg_reclaim_goal) {
				reclaim_succeed = true;
				break;
			}
		}
	}
	rcu_read_unlock();
	if (reclaim_succeed)
		return HZ / 2;
	/*
	 * Every attempted memcg reclaimed zero pages (this also covers the
	 * case where none was attempted): back off instead of retrying.
	 */
	if (nr_reclaim_memcg == zero_reclaim_memcg) {
		congestion_wait(BLK_RW_ASYNC, HZ/10);
		return HZ;
	}
	if (retry_times <= max_retry_times)
		goto retry;
	return HZ / 10;
}
static int memcg_prio_reclaimd_async(void *data)
{
DEFINE_WAIT(wait_async);
// XXX simplify kthread
for ( ; ; ) {
long timeout;
timeout = memcg_prio_reclaim_async();
prepare_to_wait(&memcg_prio_reclaim_wq, &wait_async, TASK_INTERRUPTIBLE);
if (!kthread_should_stop())
schedule_timeout(timeout);
else {
finish_wait(&memcg_prio_reclaim_wq, &wait_async);
break;
}
finish_wait(&memcg_prio_reclaim_wq, &wait_async);
}
return 0;
}
/*
 * Compute the reclaim goal for the next period and publish it in
 * memcg_reclaim_goal.
 *
 * Returns how long the caller should sleep, in jiffies:
 * MAX_SCHEDULE_TIMEOUT when memory QoS is off or no reclaim priority
 * exists, HZ / 2 when the position ratio is not positive, and
 * RMEM_CHECK_PERIOD_MS otherwise. In the first two cases the goal is 0.
 */
static long memcg_prio_strategy(void)
{
#define RMEM_CHECK_PERIOD_MS 100
#define STRATEG_UPDATE_PERIOD_MS 1000
	static unsigned long last_time;
	unsigned long nr_alloc;
	unsigned long goal = 0;
	long timeout = MAX_SCHEDULE_TIMEOUT;

	if (!sysctl_vm_memory_qos || !memcg_reclaim_prio_exist())
		goto publish;

	memcg_rmem_wmark_adjust();
	memcg_rmem_calc_pos_ratio();

	/* fetch and reset the pages allocated since the previous run */
	nr_alloc = atomic_long_xchg(&memcg_allocated_count, 0);

	if (memcg_pos_ratio <= 0) {
		timeout = HZ / 2;
		goto publish;
	}
	timeout = msecs_to_jiffies(RMEM_CHECK_PERIOD_MS);

	memcg_update_reclaim_prio();

	/* the sample is replaced by 1 when the previous run was over HZ ago */
	if (time_after(jiffies, last_time + HZ))
		nr_alloc = 1;

	goal = (nr_alloc * memcg_pos_ratio) >> RATELIMIT_CALC_SHIFT;
	atomic_long_xchg(&memcg_reclaimed_count, 0);

publish:
	memcg_reclaim_goal = goal;
	last_time = jiffies;
	return timeout;
}
static int memcg_prio_reclaimd(void *data)
{
DEFINE_WAIT(wait);
// XXX simplify kthread
for ( ; ; ) {
long timeout;
timeout = memcg_prio_strategy();
prepare_to_wait(&memcg_prio_reclaim_wq, &wait, TASK_INTERRUPTIBLE);
if (!kthread_should_stop())
schedule_timeout(timeout);
else {
finish_wait(&memcg_prio_reclaim_wq, &wait);
break;
}
finish_wait(&memcg_prio_reclaim_wq, &wait);
}
return 0;
}
/*
 * Start the "memcg_priod" and "memcg_priod_async" kthreads if they are not
 * running yet.
 *
 * Returns 0 on success. If a thread fails to start, its pointer is reset
 * to NULL and the error is returned; when both fail, the error of the
 * async thread is the one reported.
 */
static int memcg_prio_reclaimd_run(void)
{
	int err = 0;

	if (!memcg_priod) {
		memcg_priod = kthread_run(memcg_prio_reclaimd, NULL,
					  "memcg_priod");
		if (IS_ERR(memcg_priod)) {
			pr_err("Failed to start memcg_prio_reclaimd thread\n");
			err = PTR_ERR(memcg_priod);
			memcg_priod = NULL;
		}
	}

	if (!memcg_priod_async) {
		memcg_priod_async = kthread_run(memcg_prio_reclaimd_async, NULL,
						"memcg_priod_async");
		if (IS_ERR(memcg_priod_async)) {
			pr_err("Failed to start memcg_prio_reclaimd_async thread\n");
			err = PTR_ERR(memcg_priod_async);
			memcg_priod_async = NULL;
		}
	}

	return err;
}
/*
 * Decide whether the current task should keep reclaiming from @memcg.
 *
 * Reclaim continues only while the memcg's priority is not below
 * memcg_cur_reclaim_prio, the task has reclaimed fewer than MEM_128M_PAGES
 * pages, and the global reclaimed count is still under the goal. Whenever
 * the answer is "no", the task's mem_reclaimed counter is reset.
 */
static bool mem_cgroup_prio_need_reclaim(struct mem_cgroup *memcg)
{
	int prio = memcg_get_prio(memcg);
	unsigned long reclaimed = atomic_long_read(&memcg_reclaimed_count);
	bool keep_going;

	keep_going = prio >= memcg_cur_reclaim_prio &&
		     current->mem_reclaimed < MEM_128M_PAGES &&
		     reclaimed < memcg_reclaim_goal;

	if (!keep_going)
		current->mem_reclaimed = 0;

	return keep_going;
}
/*
 * Account @nr_pages in the global memcg_allocated_count counter.
 * @memcg is not used by the body.
 */
static void mem_cgroup_notify_alloc(struct mem_cgroup *memcg, unsigned int nr_pages)
{
	atomic_long_add(nr_pages, &memcg_allocated_count);
}
#define PRIORITY_RECLAIM_RETRY_MAX 3

/*
 * Record the outcome of one reclaim attempt on @memcg.
 *
 * A non-zero @nr_pages clears the memcg's failure streak and is added to
 * both the global reclaimed count and the current task's mem_reclaimed.
 * A zero @nr_pages extends the streak; once it reaches
 * PRIORITY_RECLAIM_RETRY_MAX the streak and the task counter are reset.
 *
 * Returns false only in that last case, true otherwise.
 */
static bool mem_cgroup_notify_reclaim(struct mem_cgroup *memcg, unsigned int nr_pages)
{
	if (nr_pages) {
		memcg->reclaim_failed = 0;
		atomic_long_add(nr_pages, &memcg_reclaimed_count);
		current->mem_reclaimed += nr_pages;
		return true;
	}

	memcg->reclaim_failed++;
	if (memcg->reclaim_failed < PRIORITY_RECLAIM_RETRY_MAX)
		return true;

	memcg->reclaim_failed = 0;
	current->mem_reclaimed = 0;
	return false;
}
extern unsigned long shrink_slab(gfp_t gfp_mask, int nid,
				 struct mem_cgroup *memcg,
				 int priority);

/*
 * Shrink the slab caches of @memcg's parent on every online node.
 *
 * An offline memcg's kmem_cache has been moved to its parent memcg, so the
 * parent is the one that must be shrunk. Each node is shrunk repeatedly,
 * at most 10 times, for as long as a pass frees more than 10 objects.
 * Does nothing when @memcg has no parent.
 */
void reap_slab(struct mem_cgroup *memcg)
{
	struct mem_cgroup *parent = parent_mem_cgroup(memcg);
	int nid;

	if (!parent)
		return;

	for_each_online_node(nid) {
		unsigned long freed;
		unsigned long passes = 0;

		do {
			passes++;
			freed = shrink_slab(GFP_KERNEL, nid, parent, 0);
		} while (freed > 10 && passes < 10);
	}
}
/*
 * Try to reclaim everything still charged to the dying @memcg.
 *
 * The memcg is left alone while it has dirty or writeback pages and less
 * than dirty_expire_interval has passed since memcg->offline_times.
 * Otherwise reclaim is repeated until the memory counter reads zero. On the
 * first pass that frees nothing, the charge stocks are drained (and slab is
 * reaped if buff_wb_enabled()); a second such pass ends the loop.
 */
static void clean_each_dying_memcg(struct mem_cgroup *memcg)
{
	unsigned long remaining;
	int drained = 0;
	unsigned int jiff_dirty_exp = HZ * dirty_expire_interval / 100;

	if ((memcg_page_state(memcg, NR_WRITEBACK) +
	     memcg_page_state(memcg, NR_FILE_DIRTY)) &&
	    time_after(memcg->offline_times + jiff_dirty_exp, jiffies))
		return;

	for (remaining = page_counter_read(&memcg->memory);
	     remaining;
	     remaining = page_counter_read(&memcg->memory)) {
		unsigned int freed;

		freed = try_to_free_mem_cgroup_pages(memcg, remaining,
						     GFP_KERNEL, true);
		if (freed)
			continue;

		if (buff_wb_enabled())
			reap_slab(memcg);

		if (drained)
			break;

		drain_all_stock(memcg);
		drained = 1;
	}
}
/*
 * Walk the whole memcg tree and clean every memcg that is no longer
 * online, offering to reschedule after each one visited.
 */
static void clean_all_dying_memcgs(void)
{
	struct mem_cgroup *iter;

	for_each_mem_cgroup_tree(iter, NULL) {
		bool dying = !mem_cgroup_online(iter);

		if (dying)
			clean_each_dying_memcg(iter);

		cond_resched();
	}
}
static int kclean_dying_memcgs(void *data)
{
DEFINE_WAIT(wait);
if (waitqueue_active(&kclean_dying_memcg_wq)) /* .. */
wake_up_interruptible(&kclean_dying_memcg_wq);
for ( ; ; ) {
clean_all_dying_memcgs();
prepare_to_wait(&kclean_dying_memcg_wq,
&wait, TASK_INTERRUPTIBLE);
if (!kthread_should_stop())
schedule();
else {
finish_wait(&kclean_dying_memcg_wq, &wait);
break;
}
finish_wait(&kclean_dying_memcg_wq, &wait);
}
return 0;
}
/*
 * Start the "kclean_dying_memcgs" kthread unless it is already running.
 *
 * Returns 0 when the thread is running (already or newly started), or the
 * negative errno from kthread_run() on failure, in which case
 * kclean_dying_memcg is reset to NULL so a later call can retry.
 */
int kclean_dying_memcg_run(void)
{
	int ret = 0;

	if (kclean_dying_memcg)
		return 0;

	kclean_dying_memcg = kthread_run(kclean_dying_memcgs,
					 NULL, "kclean_dying_memcgs");
	if (IS_ERR(kclean_dying_memcg)) {
		pr_err("Failed to start kclean_dying_memcgs kthread.\n");
		/*
		 * Decode the error from the task pointer, not from the
		 * thread function kclean_dying_memcgs.
		 */
		ret = PTR_ERR(kclean_dying_memcg);
		kclean_dying_memcg = NULL;
	}

	return ret;
}
/* Stop the dying-memcg cleaner kthread, if one is running, and forget it. */
void kclean_dying_memcg_stop(void)
{
	if (!kclean_dying_memcg)
		return;

	kthread_stop(kclean_dying_memcg);
	kclean_dying_memcg = NULL;
}
#endif /* CONFIG_MEM_QOS */
#ifdef CONFIG_NEED_MEMCG_ZRAM
/*
 * Report whether zram memory should be accounted: true exactly when
 * cgroup_memory_nokmem is not set.
 */
bool mem_cgroup_do_zram_account(void)
{
	return !cgroup_memory_nokmem;
}
#endif /* CONFIG_NEED_MEMCG_ZRAM */