memcg: use CSS ID
Assign a CSS ID to each memcg and use css_get_next() to scan the hierarchy.

Assume the following tree:

group_A (ID=3)
    /01 (ID=4)
        /0A (ID=7)
    /02 (ID=10)
group_B (ID=5)

and a task in group_A/01/0A hits the limit at group_A.

Reclaim will be done in the following order (round-robin):
group_A (3) -> group_A/01 (4) -> group_A/01/0A (7) -> group_A/02 (10) -> group_A -> .....

Round robin by ID. The last visited cgroup is recorded, and the scan restarts from it when reclaim starts again. (A smarter algorithm could be implemented.)

No cgroup_mutex or hierarchy_mutex is required.

Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Paul Menage <menage@google.com>
Cc: Li Zefan <lizf@cn.fujitsu.com>
Cc: Balbir Singh <balbir@in.ibm.com>
Cc: Daisuke Nishimura <nishimura@mxp.nes.nec.co.jp>
Cc: David Rientjes <rientjes@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
This commit is contained in:
parent
b4046f00ee
commit
04046e1a0a
218
mm/memcontrol.c
218
mm/memcontrol.c
|
@ -95,6 +95,15 @@ static s64 mem_cgroup_read_stat(struct mem_cgroup_stat *stat,
|
||||||
return ret;
|
return ret;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
static s64 mem_cgroup_local_usage(struct mem_cgroup_stat *stat)
|
||||||
|
{
|
||||||
|
s64 ret;
|
||||||
|
|
||||||
|
ret = mem_cgroup_read_stat(stat, MEM_CGROUP_STAT_CACHE);
|
||||||
|
ret += mem_cgroup_read_stat(stat, MEM_CGROUP_STAT_RSS);
|
||||||
|
return ret;
|
||||||
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* per-zone information in memory controller.
|
* per-zone information in memory controller.
|
||||||
*/
|
*/
|
||||||
|
@ -154,9 +163,9 @@ struct mem_cgroup {
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* While reclaiming in a hierarchy, we cache the last child we
|
* While reclaiming in a hierarchy, we cache the last child we
|
||||||
* reclaimed from. Protected by hierarchy_mutex
|
* reclaimed from.
|
||||||
*/
|
*/
|
||||||
struct mem_cgroup *last_scanned_child;
|
int last_scanned_child;
|
||||||
/*
|
/*
|
||||||
* Should the accounting and control be hierarchical, per subtree?
|
* Should the accounting and control be hierarchical, per subtree?
|
||||||
*/
|
*/
|
||||||
|
@ -629,103 +638,6 @@ unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan,
|
||||||
#define mem_cgroup_from_res_counter(counter, member) \
|
#define mem_cgroup_from_res_counter(counter, member) \
|
||||||
container_of(counter, struct mem_cgroup, member)
|
container_of(counter, struct mem_cgroup, member)
|
||||||
|
|
||||||
/*
|
|
||||||
* This routine finds the DFS walk successor. This routine should be
|
|
||||||
* called with hierarchy_mutex held
|
|
||||||
*/
|
|
||||||
static struct mem_cgroup *
|
|
||||||
__mem_cgroup_get_next_node(struct mem_cgroup *curr, struct mem_cgroup *root_mem)
|
|
||||||
{
|
|
||||||
struct cgroup *cgroup, *curr_cgroup, *root_cgroup;
|
|
||||||
|
|
||||||
curr_cgroup = curr->css.cgroup;
|
|
||||||
root_cgroup = root_mem->css.cgroup;
|
|
||||||
|
|
||||||
if (!list_empty(&curr_cgroup->children)) {
|
|
||||||
/*
|
|
||||||
* Walk down to children
|
|
||||||
*/
|
|
||||||
cgroup = list_entry(curr_cgroup->children.next,
|
|
||||||
struct cgroup, sibling);
|
|
||||||
curr = mem_cgroup_from_cont(cgroup);
|
|
||||||
goto done;
|
|
||||||
}
|
|
||||||
|
|
||||||
visit_parent:
|
|
||||||
if (curr_cgroup == root_cgroup) {
|
|
||||||
/* caller handles NULL case */
|
|
||||||
curr = NULL;
|
|
||||||
goto done;
|
|
||||||
}
|
|
||||||
|
|
||||||
/*
|
|
||||||
* Goto next sibling
|
|
||||||
*/
|
|
||||||
if (curr_cgroup->sibling.next != &curr_cgroup->parent->children) {
|
|
||||||
cgroup = list_entry(curr_cgroup->sibling.next, struct cgroup,
|
|
||||||
sibling);
|
|
||||||
curr = mem_cgroup_from_cont(cgroup);
|
|
||||||
goto done;
|
|
||||||
}
|
|
||||||
|
|
||||||
/*
|
|
||||||
* Go up to next parent and next parent's sibling if need be
|
|
||||||
*/
|
|
||||||
curr_cgroup = curr_cgroup->parent;
|
|
||||||
goto visit_parent;
|
|
||||||
|
|
||||||
done:
|
|
||||||
return curr;
|
|
||||||
}
|
|
||||||
|
|
||||||
/*
|
|
||||||
* Visit the first child (need not be the first child as per the ordering
|
|
||||||
* of the cgroup list, since we track last_scanned_child) of @mem and use
|
|
||||||
* that to reclaim free pages from.
|
|
||||||
*/
|
|
||||||
static struct mem_cgroup *
|
|
||||||
mem_cgroup_get_next_node(struct mem_cgroup *root_mem)
|
|
||||||
{
|
|
||||||
struct cgroup *cgroup;
|
|
||||||
struct mem_cgroup *orig, *next;
|
|
||||||
bool obsolete;
|
|
||||||
|
|
||||||
/*
|
|
||||||
* Scan all children under the mem_cgroup mem
|
|
||||||
*/
|
|
||||||
mutex_lock(&mem_cgroup_subsys.hierarchy_mutex);
|
|
||||||
|
|
||||||
orig = root_mem->last_scanned_child;
|
|
||||||
obsolete = mem_cgroup_is_obsolete(orig);
|
|
||||||
|
|
||||||
if (list_empty(&root_mem->css.cgroup->children)) {
|
|
||||||
/*
|
|
||||||
* root_mem might have children before and last_scanned_child
|
|
||||||
* may point to one of them. We put it later.
|
|
||||||
*/
|
|
||||||
if (orig)
|
|
||||||
VM_BUG_ON(!obsolete);
|
|
||||||
next = NULL;
|
|
||||||
goto done;
|
|
||||||
}
|
|
||||||
|
|
||||||
if (!orig || obsolete) {
|
|
||||||
cgroup = list_first_entry(&root_mem->css.cgroup->children,
|
|
||||||
struct cgroup, sibling);
|
|
||||||
next = mem_cgroup_from_cont(cgroup);
|
|
||||||
} else
|
|
||||||
next = __mem_cgroup_get_next_node(orig, root_mem);
|
|
||||||
|
|
||||||
done:
|
|
||||||
if (next)
|
|
||||||
mem_cgroup_get(next);
|
|
||||||
root_mem->last_scanned_child = next;
|
|
||||||
if (orig)
|
|
||||||
mem_cgroup_put(orig);
|
|
||||||
mutex_unlock(&mem_cgroup_subsys.hierarchy_mutex);
|
|
||||||
return (next) ? next : root_mem;
|
|
||||||
}
|
|
||||||
|
|
||||||
static bool mem_cgroup_check_under_limit(struct mem_cgroup *mem)
|
static bool mem_cgroup_check_under_limit(struct mem_cgroup *mem)
|
||||||
{
|
{
|
||||||
if (do_swap_account) {
|
if (do_swap_account) {
|
||||||
|
@ -755,46 +667,79 @@ static unsigned int get_swappiness(struct mem_cgroup *memcg)
|
||||||
}
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Dance down the hierarchy if needed to reclaim memory. We remember the
|
* Visit the first child (need not be the first child as per the ordering
|
||||||
* last child we reclaimed from, so that we don't end up penalizing
|
* of the cgroup list, since we track last_scanned_child) of @mem and use
|
||||||
* one child extensively based on its position in the children list.
|
* that to reclaim free pages from.
|
||||||
|
*/
|
||||||
|
static struct mem_cgroup *
|
||||||
|
mem_cgroup_select_victim(struct mem_cgroup *root_mem)
|
||||||
|
{
|
||||||
|
struct mem_cgroup *ret = NULL;
|
||||||
|
struct cgroup_subsys_state *css;
|
||||||
|
int nextid, found;
|
||||||
|
|
||||||
|
if (!root_mem->use_hierarchy) {
|
||||||
|
css_get(&root_mem->css);
|
||||||
|
ret = root_mem;
|
||||||
|
}
|
||||||
|
|
||||||
|
while (!ret) {
|
||||||
|
rcu_read_lock();
|
||||||
|
nextid = root_mem->last_scanned_child + 1;
|
||||||
|
css = css_get_next(&mem_cgroup_subsys, nextid, &root_mem->css,
|
||||||
|
&found);
|
||||||
|
if (css && css_tryget(css))
|
||||||
|
ret = container_of(css, struct mem_cgroup, css);
|
||||||
|
|
||||||
|
rcu_read_unlock();
|
||||||
|
/* Updates scanning parameter */
|
||||||
|
spin_lock(&root_mem->reclaim_param_lock);
|
||||||
|
if (!css) {
|
||||||
|
/* this means start scan from ID:1 */
|
||||||
|
root_mem->last_scanned_child = 0;
|
||||||
|
} else
|
||||||
|
root_mem->last_scanned_child = found;
|
||||||
|
spin_unlock(&root_mem->reclaim_param_lock);
|
||||||
|
}
|
||||||
|
|
||||||
|
return ret;
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Scan the hierarchy if needed to reclaim memory. We remember the last child
|
||||||
|
* we reclaimed from, so that we don't end up penalizing one child extensively
|
||||||
|
* based on its position in the children list.
|
||||||
*
|
*
|
||||||
* root_mem is the original ancestor that we've been reclaiming from.
|
* root_mem is the original ancestor that we've been reclaiming from.
|
||||||
|
*
|
||||||
|
* We give up and return to the caller when we visit root_mem twice.
|
||||||
|
* (other groups can be removed while we're walking....)
|
||||||
*/
|
*/
|
||||||
static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem,
|
static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem,
|
||||||
gfp_t gfp_mask, bool noswap)
|
gfp_t gfp_mask, bool noswap)
|
||||||
{
|
{
|
||||||
struct mem_cgroup *next_mem;
|
struct mem_cgroup *victim;
|
||||||
int ret = 0;
|
int ret, total = 0;
|
||||||
|
int loop = 0;
|
||||||
|
|
||||||
/*
|
while (loop < 2) {
|
||||||
* Reclaim unconditionally and don't check for return value.
|
victim = mem_cgroup_select_victim(root_mem);
|
||||||
* We need to reclaim in the current group and down the tree.
|
if (victim == root_mem)
|
||||||
* One might think about checking for children before reclaiming,
|
loop++;
|
||||||
* but there might be left over accounting, even after children
|
if (!mem_cgroup_local_usage(&victim->stat)) {
|
||||||
* have left.
|
/* this cgroup's local usage == 0 */
|
||||||
*/
|
css_put(&victim->css);
|
||||||
ret += try_to_free_mem_cgroup_pages(root_mem, gfp_mask, noswap,
|
|
||||||
get_swappiness(root_mem));
|
|
||||||
if (mem_cgroup_check_under_limit(root_mem))
|
|
||||||
return 1; /* indicate reclaim has succeeded */
|
|
||||||
if (!root_mem->use_hierarchy)
|
|
||||||
return ret;
|
|
||||||
|
|
||||||
next_mem = mem_cgroup_get_next_node(root_mem);
|
|
||||||
|
|
||||||
while (next_mem != root_mem) {
|
|
||||||
if (mem_cgroup_is_obsolete(next_mem)) {
|
|
||||||
next_mem = mem_cgroup_get_next_node(root_mem);
|
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
ret += try_to_free_mem_cgroup_pages(next_mem, gfp_mask, noswap,
|
/* we use swappiness of local cgroup */
|
||||||
get_swappiness(next_mem));
|
ret = try_to_free_mem_cgroup_pages(victim, gfp_mask, noswap,
|
||||||
|
get_swappiness(victim));
|
||||||
|
css_put(&victim->css);
|
||||||
|
total += ret;
|
||||||
if (mem_cgroup_check_under_limit(root_mem))
|
if (mem_cgroup_check_under_limit(root_mem))
|
||||||
return 1; /* indicate reclaim has succeeded */
|
return 1 + total;
|
||||||
next_mem = mem_cgroup_get_next_node(root_mem);
|
|
||||||
}
|
}
|
||||||
return ret;
|
return total;
|
||||||
}
|
}
|
||||||
|
|
||||||
bool mem_cgroup_oom_called(struct task_struct *task)
|
bool mem_cgroup_oom_called(struct task_struct *task)
|
||||||
|
@ -1324,8 +1269,8 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype)
|
||||||
res_counter_uncharge(&mem->res, PAGE_SIZE);
|
res_counter_uncharge(&mem->res, PAGE_SIZE);
|
||||||
if (do_swap_account && (ctype != MEM_CGROUP_CHARGE_TYPE_SWAPOUT))
|
if (do_swap_account && (ctype != MEM_CGROUP_CHARGE_TYPE_SWAPOUT))
|
||||||
res_counter_uncharge(&mem->memsw, PAGE_SIZE);
|
res_counter_uncharge(&mem->memsw, PAGE_SIZE);
|
||||||
|
|
||||||
mem_cgroup_charge_statistics(mem, pc, false);
|
mem_cgroup_charge_statistics(mem, pc, false);
|
||||||
|
|
||||||
ClearPageCgroupUsed(pc);
|
ClearPageCgroupUsed(pc);
|
||||||
/*
|
/*
|
||||||
* pc->mem_cgroup is not cleared here. It will be accessed when it's
|
* pc->mem_cgroup is not cleared here. It will be accessed when it's
|
||||||
|
@ -2178,6 +2123,8 @@ static void __mem_cgroup_free(struct mem_cgroup *mem)
|
||||||
{
|
{
|
||||||
int node;
|
int node;
|
||||||
|
|
||||||
|
free_css_id(&mem_cgroup_subsys, &mem->css);
|
||||||
|
|
||||||
for_each_node_state(node, N_POSSIBLE)
|
for_each_node_state(node, N_POSSIBLE)
|
||||||
free_mem_cgroup_per_zone_info(mem, node);
|
free_mem_cgroup_per_zone_info(mem, node);
|
||||||
|
|
||||||
|
@ -2228,11 +2175,12 @@ static struct cgroup_subsys_state * __ref
|
||||||
mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont)
|
mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont)
|
||||||
{
|
{
|
||||||
struct mem_cgroup *mem, *parent;
|
struct mem_cgroup *mem, *parent;
|
||||||
|
long error = -ENOMEM;
|
||||||
int node;
|
int node;
|
||||||
|
|
||||||
mem = mem_cgroup_alloc();
|
mem = mem_cgroup_alloc();
|
||||||
if (!mem)
|
if (!mem)
|
||||||
return ERR_PTR(-ENOMEM);
|
return ERR_PTR(error);
|
||||||
|
|
||||||
for_each_node_state(node, N_POSSIBLE)
|
for_each_node_state(node, N_POSSIBLE)
|
||||||
if (alloc_mem_cgroup_per_zone_info(mem, node))
|
if (alloc_mem_cgroup_per_zone_info(mem, node))
|
||||||
|
@ -2260,7 +2208,7 @@ mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont)
|
||||||
res_counter_init(&mem->res, NULL);
|
res_counter_init(&mem->res, NULL);
|
||||||
res_counter_init(&mem->memsw, NULL);
|
res_counter_init(&mem->memsw, NULL);
|
||||||
}
|
}
|
||||||
mem->last_scanned_child = NULL;
|
mem->last_scanned_child = 0;
|
||||||
spin_lock_init(&mem->reclaim_param_lock);
|
spin_lock_init(&mem->reclaim_param_lock);
|
||||||
|
|
||||||
if (parent)
|
if (parent)
|
||||||
|
@ -2269,7 +2217,7 @@ mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont)
|
||||||
return &mem->css;
|
return &mem->css;
|
||||||
free_out:
|
free_out:
|
||||||
__mem_cgroup_free(mem);
|
__mem_cgroup_free(mem);
|
||||||
return ERR_PTR(-ENOMEM);
|
return ERR_PTR(error);
|
||||||
}
|
}
|
||||||
|
|
||||||
static int mem_cgroup_pre_destroy(struct cgroup_subsys *ss,
|
static int mem_cgroup_pre_destroy(struct cgroup_subsys *ss,
|
||||||
|
@ -2284,12 +2232,7 @@ static void mem_cgroup_destroy(struct cgroup_subsys *ss,
|
||||||
struct cgroup *cont)
|
struct cgroup *cont)
|
||||||
{
|
{
|
||||||
struct mem_cgroup *mem = mem_cgroup_from_cont(cont);
|
struct mem_cgroup *mem = mem_cgroup_from_cont(cont);
|
||||||
struct mem_cgroup *last_scanned_child = mem->last_scanned_child;
|
|
||||||
|
|
||||||
if (last_scanned_child) {
|
|
||||||
VM_BUG_ON(!mem_cgroup_is_obsolete(last_scanned_child));
|
|
||||||
mem_cgroup_put(last_scanned_child);
|
|
||||||
}
|
|
||||||
mem_cgroup_put(mem);
|
mem_cgroup_put(mem);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -2328,6 +2271,7 @@ struct cgroup_subsys mem_cgroup_subsys = {
|
||||||
.populate = mem_cgroup_populate,
|
.populate = mem_cgroup_populate,
|
||||||
.attach = mem_cgroup_move_task,
|
.attach = mem_cgroup_move_task,
|
||||||
.early_init = 0,
|
.early_init = 0,
|
||||||
|
.use_id = 1,
|
||||||
};
|
};
|
||||||
|
|
||||||
#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP
|
#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP
|
||||||
|
|
Loading…
Reference in New Issue