memcg: remove the overhead associated with the root cgroup
Change the memory cgroup to remove the overhead associated with accounting all pages in the root cgroup. As a side-effect, we can no longer set a memory hard limit in the root cgroup. A new flag to track whether the page has been accounted or not has been added as well. Flags are now set atomically for page_cgroup, pcg_default_flags is now obsolete and removed. [akpm@linux-foundation.org: fix a few documentation glitches] Signed-off-by: Balbir Singh <balbir@linux.vnet.ibm.com> Signed-off-by: Daisuke Nishimura <nishimura@mxp.nes.nec.co.jp> Reviewed-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com> Cc: Daisuke Nishimura <nishimura@mxp.nes.nec.co.jp> Cc: Li Zefan <lizf@cn.fujitsu.com> Cc: Paul Menage <menage@google.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
This commit is contained in:
parent
be367d0992
commit
4b3bde4c98
|
@ -179,6 +179,9 @@ The reclaim algorithm has not been modified for cgroups, except that
|
||||||
pages that are selected for reclaiming come from the per cgroup LRU
|
pages that are selected for reclaiming come from the per cgroup LRU
|
||||||
list.
|
list.
|
||||||
|
|
||||||
|
NOTE: Reclaim does not work for the root cgroup, since we cannot set any
|
||||||
|
limits on the root cgroup.
|
||||||
|
|
||||||
2. Locking
|
2. Locking
|
||||||
|
|
||||||
The memory controller uses the following hierarchy
|
The memory controller uses the following hierarchy
|
||||||
|
@ -210,6 +213,7 @@ We can alter the memory limit:
|
||||||
NOTE: We can use a suffix (k, K, m, M, g or G) to indicate values in kilo,
|
NOTE: We can use a suffix (k, K, m, M, g or G) to indicate values in kilo,
|
||||||
mega or gigabytes.
|
mega or gigabytes.
|
||||||
NOTE: We can write "-1" to reset the *.limit_in_bytes(unlimited).
|
NOTE: We can write "-1" to reset the *.limit_in_bytes(unlimited).
|
||||||
|
NOTE: We cannot set limits on the root cgroup any more.
|
||||||
|
|
||||||
# cat /cgroups/0/memory.limit_in_bytes
|
# cat /cgroups/0/memory.limit_in_bytes
|
||||||
4194304
|
4194304
|
||||||
|
|
|
@ -38,6 +38,7 @@ enum {
|
||||||
PCG_LOCK, /* page cgroup is locked */
|
PCG_LOCK, /* page cgroup is locked */
|
||||||
PCG_CACHE, /* charged as cache */
|
PCG_CACHE, /* charged as cache */
|
||||||
PCG_USED, /* this object is in use. */
|
PCG_USED, /* this object is in use. */
|
||||||
|
PCG_ACCT_LRU, /* page has been accounted for */
|
||||||
};
|
};
|
||||||
|
|
||||||
#define TESTPCGFLAG(uname, lname) \
|
#define TESTPCGFLAG(uname, lname) \
|
||||||
|
@ -52,11 +53,23 @@ static inline void SetPageCgroup##uname(struct page_cgroup *pc)\
|
||||||
static inline void ClearPageCgroup##uname(struct page_cgroup *pc) \
|
static inline void ClearPageCgroup##uname(struct page_cgroup *pc) \
|
||||||
{ clear_bit(PCG_##lname, &pc->flags); }
|
{ clear_bit(PCG_##lname, &pc->flags); }
|
||||||
|
|
||||||
|
#define TESTCLEARPCGFLAG(uname, lname) \
|
||||||
|
static inline int TestClearPageCgroup##uname(struct page_cgroup *pc) \
|
||||||
|
{ return test_and_clear_bit(PCG_##lname, &pc->flags); }
|
||||||
|
|
||||||
/* Cache flag is set only once (at allocation) */
|
/* Cache flag is set only once (at allocation) */
|
||||||
TESTPCGFLAG(Cache, CACHE)
|
TESTPCGFLAG(Cache, CACHE)
|
||||||
|
CLEARPCGFLAG(Cache, CACHE)
|
||||||
|
SETPCGFLAG(Cache, CACHE)
|
||||||
|
|
||||||
TESTPCGFLAG(Used, USED)
|
TESTPCGFLAG(Used, USED)
|
||||||
CLEARPCGFLAG(Used, USED)
|
CLEARPCGFLAG(Used, USED)
|
||||||
|
SETPCGFLAG(Used, USED)
|
||||||
|
|
||||||
|
SETPCGFLAG(AcctLRU, ACCT_LRU)
|
||||||
|
CLEARPCGFLAG(AcctLRU, ACCT_LRU)
|
||||||
|
TESTPCGFLAG(AcctLRU, ACCT_LRU)
|
||||||
|
TESTCLEARPCGFLAG(AcctLRU, ACCT_LRU)
|
||||||
|
|
||||||
static inline int page_cgroup_nid(struct page_cgroup *pc)
|
static inline int page_cgroup_nid(struct page_cgroup *pc)
|
||||||
{
|
{
|
||||||
|
|
|
@ -43,6 +43,7 @@
|
||||||
|
|
||||||
struct cgroup_subsys mem_cgroup_subsys __read_mostly;
|
struct cgroup_subsys mem_cgroup_subsys __read_mostly;
|
||||||
#define MEM_CGROUP_RECLAIM_RETRIES 5
|
#define MEM_CGROUP_RECLAIM_RETRIES 5
|
||||||
|
struct mem_cgroup *root_mem_cgroup __read_mostly;
|
||||||
|
|
||||||
#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP
|
#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP
|
||||||
/* Turned on only when memory cgroup is enabled && really_do_swap_account = 1 */
|
/* Turned on only when memory cgroup is enabled && really_do_swap_account = 1 */
|
||||||
|
@ -200,13 +201,8 @@ enum charge_type {
|
||||||
#define PCGF_CACHE (1UL << PCG_CACHE)
|
#define PCGF_CACHE (1UL << PCG_CACHE)
|
||||||
#define PCGF_USED (1UL << PCG_USED)
|
#define PCGF_USED (1UL << PCG_USED)
|
||||||
#define PCGF_LOCK (1UL << PCG_LOCK)
|
#define PCGF_LOCK (1UL << PCG_LOCK)
|
||||||
static const unsigned long
|
/* Not used, but added here for completeness */
|
||||||
pcg_default_flags[NR_CHARGE_TYPE] = {
|
#define PCGF_ACCT (1UL << PCG_ACCT_LRU) /* mask for the PCG_ACCT_LRU flag bit; the enum has no PCG_ACCT */
|
||||||
PCGF_CACHE | PCGF_USED | PCGF_LOCK, /* File Cache */
|
|
||||||
PCGF_USED | PCGF_LOCK, /* Anon */
|
|
||||||
PCGF_CACHE | PCGF_USED | PCGF_LOCK, /* Shmem */
|
|
||||||
0, /* FORCE */
|
|
||||||
};
|
|
||||||
|
|
||||||
/* for encoding cft->private value on file */
|
/* for encoding cft->private value on file */
|
||||||
#define _MEM (0)
|
#define _MEM (0)
|
||||||
|
@ -354,6 +350,11 @@ static int mem_cgroup_walk_tree(struct mem_cgroup *root, void *data,
|
||||||
return ret;
|
return ret;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
static inline bool mem_cgroup_is_root(struct mem_cgroup *mem)
|
||||||
|
{
|
||||||
|
return (mem == root_mem_cgroup);
|
||||||
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Following LRU functions are allowed to be used without PCG_LOCK.
|
* Following LRU functions are allowed to be used without PCG_LOCK.
|
||||||
* Operations are called by routine of global LRU independently from memcg.
|
* Operations are called by routine of global LRU independently from memcg.
|
||||||
|
@ -371,22 +372,24 @@ static int mem_cgroup_walk_tree(struct mem_cgroup *root, void *data,
|
||||||
void mem_cgroup_del_lru_list(struct page *page, enum lru_list lru)
|
void mem_cgroup_del_lru_list(struct page *page, enum lru_list lru)
|
||||||
{
|
{
|
||||||
struct page_cgroup *pc;
|
struct page_cgroup *pc;
|
||||||
struct mem_cgroup *mem;
|
|
||||||
struct mem_cgroup_per_zone *mz;
|
struct mem_cgroup_per_zone *mz;
|
||||||
|
|
||||||
if (mem_cgroup_disabled())
|
if (mem_cgroup_disabled())
|
||||||
return;
|
return;
|
||||||
pc = lookup_page_cgroup(page);
|
pc = lookup_page_cgroup(page);
|
||||||
/* can happen while we handle swapcache. */
|
/* can happen while we handle swapcache. */
|
||||||
if (list_empty(&pc->lru) || !pc->mem_cgroup)
|
if (!TestClearPageCgroupAcctLRU(pc))
|
||||||
return;
|
return;
|
||||||
|
VM_BUG_ON(!pc->mem_cgroup);
|
||||||
/*
|
/*
|
||||||
* We don't check PCG_USED bit. It's cleared when the "page" is finally
|
* We don't check PCG_USED bit. It's cleared when the "page" is finally
|
||||||
* removed from global LRU.
|
* removed from global LRU.
|
||||||
*/
|
*/
|
||||||
mz = page_cgroup_zoneinfo(pc);
|
mz = page_cgroup_zoneinfo(pc);
|
||||||
mem = pc->mem_cgroup;
|
|
||||||
MEM_CGROUP_ZSTAT(mz, lru) -= 1;
|
MEM_CGROUP_ZSTAT(mz, lru) -= 1;
|
||||||
|
if (mem_cgroup_is_root(pc->mem_cgroup))
|
||||||
|
return;
|
||||||
|
VM_BUG_ON(list_empty(&pc->lru));
|
||||||
list_del_init(&pc->lru);
|
list_del_init(&pc->lru);
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
@ -410,8 +413,8 @@ void mem_cgroup_rotate_lru_list(struct page *page, enum lru_list lru)
|
||||||
* For making pc->mem_cgroup visible, insert smp_rmb() here.
|
* For making pc->mem_cgroup visible, insert smp_rmb() here.
|
||||||
*/
|
*/
|
||||||
smp_rmb();
|
smp_rmb();
|
||||||
/* unused page is not rotated. */
|
/* unused or root page is not rotated. */
|
||||||
if (!PageCgroupUsed(pc))
|
if (!PageCgroupUsed(pc) || mem_cgroup_is_root(pc->mem_cgroup))
|
||||||
return;
|
return;
|
||||||
mz = page_cgroup_zoneinfo(pc);
|
mz = page_cgroup_zoneinfo(pc);
|
||||||
list_move(&pc->lru, &mz->lists[lru]);
|
list_move(&pc->lru, &mz->lists[lru]);
|
||||||
|
@ -425,6 +428,7 @@ void mem_cgroup_add_lru_list(struct page *page, enum lru_list lru)
|
||||||
if (mem_cgroup_disabled())
|
if (mem_cgroup_disabled())
|
||||||
return;
|
return;
|
||||||
pc = lookup_page_cgroup(page);
|
pc = lookup_page_cgroup(page);
|
||||||
|
VM_BUG_ON(PageCgroupAcctLRU(pc));
|
||||||
/*
|
/*
|
||||||
* Used bit is set without atomic ops but after smp_wmb().
|
* Used bit is set with an atomic bit operation, after smp_wmb().
|
||||||
* For making pc->mem_cgroup visible, insert smp_rmb() here.
|
* For making pc->mem_cgroup visible, insert smp_rmb() here.
|
||||||
|
@ -435,6 +439,9 @@ void mem_cgroup_add_lru_list(struct page *page, enum lru_list lru)
|
||||||
|
|
||||||
mz = page_cgroup_zoneinfo(pc);
|
mz = page_cgroup_zoneinfo(pc);
|
||||||
MEM_CGROUP_ZSTAT(mz, lru) += 1;
|
MEM_CGROUP_ZSTAT(mz, lru) += 1;
|
||||||
|
SetPageCgroupAcctLRU(pc);
|
||||||
|
if (mem_cgroup_is_root(pc->mem_cgroup))
|
||||||
|
return;
|
||||||
list_add(&pc->lru, &mz->lists[lru]);
|
list_add(&pc->lru, &mz->lists[lru]);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -469,7 +476,7 @@ static void mem_cgroup_lru_add_after_commit_swapcache(struct page *page)
|
||||||
|
|
||||||
spin_lock_irqsave(&zone->lru_lock, flags);
|
spin_lock_irqsave(&zone->lru_lock, flags);
|
||||||
/* link when the page is linked to LRU but page_cgroup isn't */
|
/* link when the page is linked to LRU but page_cgroup isn't */
|
||||||
if (PageLRU(page) && list_empty(&pc->lru))
|
if (PageLRU(page) && !PageCgroupAcctLRU(pc))
|
||||||
mem_cgroup_add_lru_list(page, page_lru(page));
|
mem_cgroup_add_lru_list(page, page_lru(page));
|
||||||
spin_unlock_irqrestore(&zone->lru_lock, flags);
|
spin_unlock_irqrestore(&zone->lru_lock, flags);
|
||||||
}
|
}
|
||||||
|
@ -1125,9 +1132,22 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *mem,
|
||||||
css_put(&mem->css);
|
css_put(&mem->css);
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
pc->mem_cgroup = mem;
|
pc->mem_cgroup = mem;
|
||||||
smp_wmb();
|
smp_wmb();
|
||||||
pc->flags = pcg_default_flags[ctype];
|
switch (ctype) {
|
||||||
|
case MEM_CGROUP_CHARGE_TYPE_CACHE:
|
||||||
|
case MEM_CGROUP_CHARGE_TYPE_SHMEM:
|
||||||
|
SetPageCgroupCache(pc);
|
||||||
|
SetPageCgroupUsed(pc);
|
||||||
|
break;
|
||||||
|
case MEM_CGROUP_CHARGE_TYPE_MAPPED:
|
||||||
|
ClearPageCgroupCache(pc);
|
||||||
|
SetPageCgroupUsed(pc);
|
||||||
|
break;
|
||||||
|
default:
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
|
||||||
mem_cgroup_charge_statistics(mem, pc, true);
|
mem_cgroup_charge_statistics(mem, pc, true);
|
||||||
|
|
||||||
|
@ -2083,6 +2103,10 @@ static int mem_cgroup_write(struct cgroup *cont, struct cftype *cft,
|
||||||
name = MEMFILE_ATTR(cft->private);
|
name = MEMFILE_ATTR(cft->private);
|
||||||
switch (name) {
|
switch (name) {
|
||||||
case RES_LIMIT:
|
case RES_LIMIT:
|
||||||
|
if (mem_cgroup_is_root(memcg)) { /* Can't set limit on root */
|
||||||
|
ret = -EINVAL;
|
||||||
|
break;
|
||||||
|
}
|
||||||
/* This function does all necessary parse...reuse it */
|
/* This function does all necessary parse...reuse it */
|
||||||
ret = res_counter_memparse_write_strategy(buffer, &val);
|
ret = res_counter_memparse_write_strategy(buffer, &val);
|
||||||
if (ret)
|
if (ret)
|
||||||
|
@ -2549,6 +2573,7 @@ mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont)
|
||||||
if (cont->parent == NULL) {
|
if (cont->parent == NULL) {
|
||||||
enable_swap_cgroup();
|
enable_swap_cgroup();
|
||||||
parent = NULL;
|
parent = NULL;
|
||||||
|
root_mem_cgroup = mem;
|
||||||
} else {
|
} else {
|
||||||
parent = mem_cgroup_from_cont(cont->parent);
|
parent = mem_cgroup_from_cont(cont->parent);
|
||||||
mem->use_hierarchy = parent->use_hierarchy;
|
mem->use_hierarchy = parent->use_hierarchy;
|
||||||
|
@ -2577,6 +2602,7 @@ mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont)
|
||||||
return &mem->css;
|
return &mem->css;
|
||||||
free_out:
|
free_out:
|
||||||
__mem_cgroup_free(mem);
|
__mem_cgroup_free(mem);
|
||||||
|
root_mem_cgroup = NULL;
|
||||||
return ERR_PTR(error);
|
return ERR_PTR(error);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue