2008-02-07 16:13:50 +08:00
|
|
|
/* memcontrol.h - Memory Controller
|
|
|
|
*
|
|
|
|
* Copyright IBM Corporation, 2007
|
|
|
|
* Author Balbir Singh <balbir@linux.vnet.ibm.com>
|
|
|
|
*
|
2008-02-07 16:13:51 +08:00
|
|
|
* Copyright 2007 OpenVZ SWsoft Inc
|
|
|
|
* Author: Pavel Emelianov <xemul@openvz.org>
|
|
|
|
*
|
2008-02-07 16:13:50 +08:00
|
|
|
* This program is free software; you can redistribute it and/or modify
|
|
|
|
* it under the terms of the GNU General Public License as published by
|
|
|
|
* the Free Software Foundation; either version 2 of the License, or
|
|
|
|
* (at your option) any later version.
|
|
|
|
*
|
|
|
|
* This program is distributed in the hope that it will be useful,
|
|
|
|
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
|
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
|
|
* GNU General Public License for more details.
|
|
|
|
*/
|
|
|
|
|
|
|
|
#ifndef _LINUX_MEMCONTROL_H
|
|
|
|
#define _LINUX_MEMCONTROL_H
|
2009-01-08 10:08:02 +08:00
|
|
|
#include <linux/cgroup.h>
|
2011-05-27 07:25:38 +08:00
|
|
|
#include <linux/vm_event_item.h>
|
2012-12-19 06:21:56 +08:00
|
|
|
#include <linux/hardirq.h>
|
2012-12-19 06:22:09 +08:00
|
|
|
#include <linux/jump_label.h>
|
2011-05-27 07:25:38 +08:00
|
|
|
|
2008-02-07 16:13:51 +08:00
|
|
|
struct mem_cgroup;
|
|
|
|
struct page_cgroup;
|
2008-02-07 16:13:59 +08:00
|
|
|
struct page;
|
|
|
|
struct mm_struct;
|
2012-12-19 06:22:34 +08:00
|
|
|
struct kmem_cache;
|
2008-02-07 16:13:51 +08:00
|
|
|
|
2013-09-13 06:13:50 +08:00
|
|
|
/*
|
|
|
|
* The corresponding mem_cgroup_stat_names is defined in mm/memcontrol.c,
|
|
|
|
* These two lists should keep in accord with each other.
|
|
|
|
*/
|
|
|
|
enum mem_cgroup_stat_index {
|
|
|
|
/*
|
|
|
|
* For MEM_CONTAINER_TYPE_ALL, usage = pagecache + rss.
|
|
|
|
*/
|
|
|
|
MEM_CGROUP_STAT_CACHE, /* # of pages charged as cache */
|
|
|
|
MEM_CGROUP_STAT_RSS, /* # of pages charged as anon rss */
|
|
|
|
MEM_CGROUP_STAT_RSS_HUGE, /* # of pages charged as anon huge */
|
|
|
|
MEM_CGROUP_STAT_FILE_MAPPED, /* # of pages charged as file rss */
|
2013-09-13 06:13:53 +08:00
|
|
|
MEM_CGROUP_STAT_WRITEBACK, /* # of pages under writeback */
|
2013-09-13 06:13:50 +08:00
|
|
|
MEM_CGROUP_STAT_SWAP, /* # of pages, swapped out */
|
|
|
|
MEM_CGROUP_STAT_NSTATS,
|
2011-01-14 07:47:37 +08:00
|
|
|
};
|
|
|
|
|
2012-01-13 09:17:59 +08:00
|
|
|
struct mem_cgroup_reclaim_cookie {
|
|
|
|
struct zone *zone;
|
|
|
|
int priority;
|
|
|
|
unsigned int generation;
|
|
|
|
};
|
|
|
|
|
2012-08-01 07:43:02 +08:00
|
|
|
#ifdef CONFIG_MEMCG
|
2009-01-08 10:08:10 +08:00
|
|
|
/*
|
|
|
|
* All "charge" functions with gfp_mask should use GFP_KERNEL or
|
|
|
|
* (gfp_mask & GFP_RECLAIM_MASK). In current implementatin, memcg doesn't
|
|
|
|
* alloc memory but reclaims memory from all available zones. So, "where I want
|
|
|
|
* memory from" bits of gfp_mask has no meaning. So any bits of that field is
|
|
|
|
* available but adding a rule is better. charge functions' gfp_mask should
|
|
|
|
* be set to GFP_KERNEL or gfp_mask & GFP_RECLAIM_MASK for avoiding ambiguous
|
|
|
|
* codes.
|
|
|
|
* (Of course, if memcg does memory allocation in future, GFP_KERNEL is sane.)
|
|
|
|
*/
|
2008-02-07 16:13:51 +08:00
|
|
|
|
2009-01-08 10:07:48 +08:00
|
|
|
extern int mem_cgroup_newpage_charge(struct page *page, struct mm_struct *mm,
|
2008-02-07 16:14:02 +08:00
|
|
|
gfp_t gfp_mask);
|
2009-01-08 10:07:48 +08:00
|
|
|
/* for swap handling */
|
2009-01-08 10:08:00 +08:00
|
|
|
extern int mem_cgroup_try_charge_swapin(struct mm_struct *mm,
|
2012-01-13 09:18:32 +08:00
|
|
|
struct page *page, gfp_t mask, struct mem_cgroup **memcgp);
|
2009-01-08 10:07:48 +08:00
|
|
|
extern void mem_cgroup_commit_charge_swapin(struct page *page,
|
2012-01-13 09:18:32 +08:00
|
|
|
struct mem_cgroup *memcg);
|
|
|
|
extern void mem_cgroup_cancel_charge_swapin(struct mem_cgroup *memcg);
|
2009-01-08 10:07:48 +08:00
|
|
|
|
2008-03-05 06:29:08 +08:00
|
|
|
extern int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm,
|
|
|
|
gfp_t gfp_mask);
|
2012-01-13 09:18:15 +08:00
|
|
|
|
|
|
|
struct lruvec *mem_cgroup_zone_lruvec(struct zone *, struct mem_cgroup *);
|
2012-05-30 06:07:09 +08:00
|
|
|
struct lruvec *mem_cgroup_page_lruvec(struct page *, struct zone *);
|
2009-12-16 08:47:03 +08:00
|
|
|
|
|
|
|
/* For coalescing uncharge for reducing memcg' overhead*/
|
|
|
|
extern void mem_cgroup_uncharge_start(void);
|
|
|
|
extern void mem_cgroup_uncharge_end(void);
|
|
|
|
|
2008-02-07 16:14:41 +08:00
|
|
|
extern void mem_cgroup_uncharge_page(struct page *page);
|
memcg: remove refcnt from page_cgroup
memcg: performance improvements
Patch Description
1/5 ... remove refcnt fron page_cgroup patch (shmem handling is fixed)
2/5 ... swapcache handling patch
3/5 ... add helper function for shmem's memory reclaim patch
4/5 ... optimize by likely/unlikely ppatch
5/5 ... remove redundunt check patch (shmem handling is fixed.)
Unix bench result.
== 2.6.26-rc2-mm1 + memory resource controller
Execl Throughput 2915.4 lps (29.6 secs, 3 samples)
C Compiler Throughput 1019.3 lpm (60.0 secs, 3 samples)
Shell Scripts (1 concurrent) 5796.0 lpm (60.0 secs, 3 samples)
Shell Scripts (8 concurrent) 1097.7 lpm (60.0 secs, 3 samples)
Shell Scripts (16 concurrent) 565.3 lpm (60.0 secs, 3 samples)
File Read 1024 bufsize 2000 maxblocks 1022128.0 KBps (30.0 secs, 3 samples)
File Write 1024 bufsize 2000 maxblocks 544057.0 KBps (30.0 secs, 3 samples)
File Copy 1024 bufsize 2000 maxblocks 346481.0 KBps (30.0 secs, 3 samples)
File Read 256 bufsize 500 maxblocks 319325.0 KBps (30.0 secs, 3 samples)
File Write 256 bufsize 500 maxblocks 148788.0 KBps (30.0 secs, 3 samples)
File Copy 256 bufsize 500 maxblocks 99051.0 KBps (30.0 secs, 3 samples)
File Read 4096 bufsize 8000 maxblocks 2058917.0 KBps (30.0 secs, 3 samples)
File Write 4096 bufsize 8000 maxblocks 1606109.0 KBps (30.0 secs, 3 samples)
File Copy 4096 bufsize 8000 maxblocks 854789.0 KBps (30.0 secs, 3 samples)
Dc: sqrt(2) to 99 decimal places 126145.2 lpm (30.0 secs, 3 samples)
INDEX VALUES
TEST BASELINE RESULT INDEX
Execl Throughput 43.0 2915.4 678.0
File Copy 1024 bufsize 2000 maxblocks 3960.0 346481.0 875.0
File Copy 256 bufsize 500 maxblocks 1655.0 99051.0 598.5
File Copy 4096 bufsize 8000 maxblocks 5800.0 854789.0 1473.8
Shell Scripts (8 concurrent) 6.0 1097.7 1829.5
=========
FINAL SCORE 991.3
== 2.6.26-rc2-mm1 + this set ==
Execl Throughput 3012.9 lps (29.9 secs, 3 samples)
C Compiler Throughput 981.0 lpm (60.0 secs, 3 samples)
Shell Scripts (1 concurrent) 5872.0 lpm (60.0 secs, 3 samples)
Shell Scripts (8 concurrent) 1120.3 lpm (60.0 secs, 3 samples)
Shell Scripts (16 concurrent) 578.0 lpm (60.0 secs, 3 samples)
File Read 1024 bufsize 2000 maxblocks 1003993.0 KBps (30.0 secs, 3 samples)
File Write 1024 bufsize 2000 maxblocks 550452.0 KBps (30.0 secs, 3 samples)
File Copy 1024 bufsize 2000 maxblocks 347159.0 KBps (30.0 secs, 3 samples)
File Read 256 bufsize 500 maxblocks 314644.0 KBps (30.0 secs, 3 samples)
File Write 256 bufsize 500 maxblocks 151852.0 KBps (30.0 secs, 3 samples)
File Copy 256 bufsize 500 maxblocks 101000.0 KBps (30.0 secs, 3 samples)
File Read 4096 bufsize 8000 maxblocks 2033256.0 KBps (30.0 secs, 3 samples)
File Write 4096 bufsize 8000 maxblocks 1611814.0 KBps (30.0 secs, 3 samples)
File Copy 4096 bufsize 8000 maxblocks 847979.0 KBps (30.0 secs, 3 samples)
Dc: sqrt(2) to 99 decimal places 128148.7 lpm (30.0 secs, 3 samples)
INDEX VALUES
TEST BASELINE RESULT INDEX
Execl Throughput 43.0 3012.9 700.7
File Copy 1024 bufsize 2000 maxblocks 3960.0 347159.0 876.7
File Copy 256 bufsize 500 maxblocks 1655.0 101000.0 610.3
File Copy 4096 bufsize 8000 maxblocks 5800.0 847979.0 1462.0
Shell Scripts (8 concurrent) 6.0 1120.3 1867.2
=========
FINAL SCORE 1004.6
This patch:
Remove refcnt from page_cgroup().
After this,
* A page is charged only when !page_mapped() && no page_cgroup is assigned.
* Anon page is newly mapped.
* File page is added to mapping->tree.
* A page is uncharged only when
* Anon page is fully unmapped.
* File page is removed from LRU.
There is no change in behavior from user's view.
This patch also removes unnecessary calls in rmap.c which was used only for
refcnt mangement.
[akpm@linux-foundation.org: fix warning]
[hugh@veritas.com: fix shmem_unuse_inode charging]
Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Balbir Singh <balbir@in.ibm.com>
Cc: "Eric W. Biederman" <ebiederm@xmission.com>
Cc: Pavel Emelyanov <xemul@openvz.org>
Cc: Li Zefan <lizf@cn.fujitsu.com>
Cc: Hugh Dickins <hugh@veritas.com>
Cc: YAMAMOTO Takashi <yamamoto@valinux.co.jp>
Cc: Paul Menage <menage@google.com>
Cc: David Rientjes <rientjes@google.com>
Signed-off-by: Hugh Dickins <hugh@veritas.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2008-07-25 16:47:14 +08:00
|
|
|
extern void mem_cgroup_uncharge_cache_page(struct page *page);
|
2008-07-25 16:47:15 +08:00
|
|
|
|
2012-05-30 06:06:25 +08:00
|
|
|
bool __mem_cgroup_same_or_subtree(const struct mem_cgroup *root_memcg,
|
|
|
|
struct mem_cgroup *memcg);
|
2013-07-04 06:01:23 +08:00
|
|
|
bool task_in_mem_cgroup(struct task_struct *task,
|
|
|
|
const struct mem_cgroup *memcg);
|
2008-02-07 16:14:03 +08:00
|
|
|
|
2009-12-16 19:19:59 +08:00
|
|
|
extern struct mem_cgroup *try_get_mem_cgroup_from_page(struct page *page);
|
cgroups: add an owner to the mm_struct
Remove the mem_cgroup member from mm_struct and instead adds an owner.
This approach was suggested by Paul Menage. The advantage of this approach
is that, once the mm->owner is known, using the subsystem id, the cgroup
can be determined. It also allows several control groups that are
virtually grouped by mm_struct, to exist independent of the memory
controller i.e., without adding mem_cgroup's for each controller, to
mm_struct.
A new config option CONFIG_MM_OWNER is added and the memory resource
controller selects this config option.
This patch also adds cgroup callbacks to notify subsystems when mm->owner
changes. The mm_cgroup_changed callback is called with the task_lock() of
the new task held and is called just prior to changing the mm->owner.
I am indebted to Paul Menage for the several reviews of this patchset and
helping me make it lighter and simpler.
This patch was tested on a powerpc box, it was compiled with both the
MM_OWNER config turned on and off.
After the thread group leader exits, it's moved to init_css_state by
cgroup_exit(), thus all future charges from runnings threads would be
redirected to the init_css_set's subsystem.
Signed-off-by: Balbir Singh <balbir@linux.vnet.ibm.com>
Cc: Pavel Emelianov <xemul@openvz.org>
Cc: Hugh Dickins <hugh@veritas.com>
Cc: Sudhir Kumar <skumar@linux.vnet.ibm.com>
Cc: YAMAMOTO Takashi <yamamoto@valinux.co.jp>
Cc: Hirokazu Takahashi <taka@valinux.co.jp>
Cc: David Rientjes <rientjes@google.com>,
Cc: Balbir Singh <balbir@linux.vnet.ibm.com>
Acked-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Acked-by: Pekka Enberg <penberg@cs.helsinki.fi>
Reviewed-by: Paul Menage <menage@google.com>
Cc: Oleg Nesterov <oleg@tv-sign.ru>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2008-04-29 16:00:16 +08:00
|
|
|
extern struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p);
|
2011-06-16 06:08:13 +08:00
|
|
|
extern struct mem_cgroup *try_get_mem_cgroup_from_mm(struct mm_struct *mm);
|
cgroups: add an owner to the mm_struct
Remove the mem_cgroup member from mm_struct and instead adds an owner.
This approach was suggested by Paul Menage. The advantage of this approach
is that, once the mm->owner is known, using the subsystem id, the cgroup
can be determined. It also allows several control groups that are
virtually grouped by mm_struct, to exist independent of the memory
controller i.e., without adding mem_cgroup's for each controller, to
mm_struct.
A new config option CONFIG_MM_OWNER is added and the memory resource
controller selects this config option.
This patch also adds cgroup callbacks to notify subsystems when mm->owner
changes. The mm_cgroup_changed callback is called with the task_lock() of
the new task held and is called just prior to changing the mm->owner.
I am indebted to Paul Menage for the several reviews of this patchset and
helping me make it lighter and simpler.
This patch was tested on a powerpc box, it was compiled with both the
MM_OWNER config turned on and off.
After the thread group leader exits, it's moved to init_css_state by
cgroup_exit(), thus all future charges from runnings threads would be
redirected to the init_css_set's subsystem.
Signed-off-by: Balbir Singh <balbir@linux.vnet.ibm.com>
Cc: Pavel Emelianov <xemul@openvz.org>
Cc: Hugh Dickins <hugh@veritas.com>
Cc: Sudhir Kumar <skumar@linux.vnet.ibm.com>
Cc: YAMAMOTO Takashi <yamamoto@valinux.co.jp>
Cc: Hirokazu Takahashi <taka@valinux.co.jp>
Cc: David Rientjes <rientjes@google.com>,
Cc: Balbir Singh <balbir@linux.vnet.ibm.com>
Acked-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Acked-by: Pekka Enberg <penberg@cs.helsinki.fi>
Reviewed-by: Paul Menage <menage@google.com>
Cc: Oleg Nesterov <oleg@tv-sign.ru>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2008-04-29 16:00:16 +08:00
|
|
|
|
2011-12-12 05:47:03 +08:00
|
|
|
extern struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *memcg);
|
2013-08-09 08:11:24 +08:00
|
|
|
extern struct mem_cgroup *mem_cgroup_from_css(struct cgroup_subsys_state *css);
|
2011-12-12 05:47:03 +08:00
|
|
|
|
2009-01-08 10:08:07 +08:00
|
|
|
static inline
|
2012-10-09 07:34:12 +08:00
|
|
|
bool mm_match_cgroup(const struct mm_struct *mm, const struct mem_cgroup *memcg)
|
2009-01-08 10:08:07 +08:00
|
|
|
{
|
2012-10-09 07:34:12 +08:00
|
|
|
struct mem_cgroup *task_memcg;
|
|
|
|
bool match;
|
2012-05-30 06:06:25 +08:00
|
|
|
|
2009-01-08 10:08:07 +08:00
|
|
|
rcu_read_lock();
|
2012-10-09 07:34:12 +08:00
|
|
|
task_memcg = mem_cgroup_from_task(rcu_dereference(mm->owner));
|
|
|
|
match = __mem_cgroup_same_or_subtree(memcg, task_memcg);
|
2009-01-08 10:08:07 +08:00
|
|
|
rcu_read_unlock();
|
2012-05-30 06:06:25 +08:00
|
|
|
return match;
|
2009-01-08 10:08:07 +08:00
|
|
|
}
|
2008-02-07 16:13:53 +08:00
|
|
|
|
2011-11-03 04:38:15 +08:00
|
|
|
extern struct cgroup_subsys_state *mem_cgroup_css(struct mem_cgroup *memcg);
|
2009-12-16 19:19:59 +08:00
|
|
|
|
2012-08-01 07:45:25 +08:00
|
|
|
extern void
|
|
|
|
mem_cgroup_prepare_migration(struct page *page, struct page *newpage,
|
|
|
|
struct mem_cgroup **memcgp);
|
2011-11-03 04:38:15 +08:00
|
|
|
extern void mem_cgroup_end_migration(struct mem_cgroup *memcg,
|
2011-01-14 07:47:43 +08:00
|
|
|
struct page *oldpage, struct page *newpage, bool migration_ok);
|
2008-02-07 16:14:10 +08:00
|
|
|
|
2013-09-25 06:27:37 +08:00
|
|
|
struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *,
|
|
|
|
struct mem_cgroup *,
|
|
|
|
struct mem_cgroup_reclaim_cookie *);
|
2012-01-13 09:17:59 +08:00
|
|
|
void mem_cgroup_iter_break(struct mem_cgroup *, struct mem_cgroup *);
|
|
|
|
|
2008-02-07 16:14:32 +08:00
|
|
|
/*
|
|
|
|
* For memory reclaim.
|
|
|
|
*/
|
2012-05-30 06:07:00 +08:00
|
|
|
int mem_cgroup_inactive_anon_is_low(struct lruvec *lruvec);
|
2011-05-27 07:25:33 +08:00
|
|
|
int mem_cgroup_select_victim_node(struct mem_cgroup *memcg);
|
2012-05-30 06:07:08 +08:00
|
|
|
unsigned long mem_cgroup_get_lru_size(struct lruvec *lruvec, enum lru_list);
|
2012-05-30 06:07:09 +08:00
|
|
|
void mem_cgroup_update_lru_size(struct lruvec *, enum lru_list, int);
|
2009-04-03 07:57:39 +08:00
|
|
|
extern void mem_cgroup_print_oom_info(struct mem_cgroup *memcg,
|
|
|
|
struct task_struct *p);
|
memcg: add mem_cgroup_replace_page_cache() to fix LRU issue
Commit ef6a3c6311 ("mm: add replace_page_cache_page() function") added a
function replace_page_cache_page(). This function replaces a page in the
radix-tree with a new page. WHen doing this, memory cgroup needs to fix
up the accounting information. memcg need to check PCG_USED bit etc.
In some(many?) cases, 'newpage' is on LRU before calling
replace_page_cache(). So, memcg's LRU accounting information should be
fixed, too.
This patch adds mem_cgroup_replace_page_cache() and removes the old hooks.
In that function, old pages will be unaccounted without touching
res_counter and new page will be accounted to the memcg (of old page).
WHen overwriting pc->mem_cgroup of newpage, take zone->lru_lock and avoid
races with LRU handling.
Background:
replace_page_cache_page() is called by FUSE code in its splice() handling.
Here, 'newpage' is replacing oldpage but this newpage is not a newly allocated
page and may be on LRU. LRU mis-accounting will be critical for memory cgroup
because rmdir() checks the whole LRU is empty and there is no account leak.
If a page is on the other LRU than it should be, rmdir() will fail.
This bug was added in March 2011, but no bug report yet. I guess there
are not many people who use memcg and FUSE at the same time with upstream
kernels.
The result of this bug is that admin cannot destroy a memcg because of
account leak. So, no panic, no deadlock. And, even if an active cgroup
exist, umount can succseed. So no problem at shutdown.
Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Acked-by: Michal Hocko <mhocko@suse.cz>
Cc: Miklos Szeredi <mszeredi@suse.cz>
Cc: Hugh Dickins <hughd@google.com>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2012-01-13 09:17:44 +08:00
|
|
|
extern void mem_cgroup_replace_page_cache(struct page *oldpage,
|
|
|
|
struct page *newpage);
|
2008-02-07 16:14:32 +08:00
|
|
|
|
2013-10-17 04:46:59 +08:00
|
|
|
static inline void mem_cgroup_oom_enable(void)
|
2013-09-13 06:13:42 +08:00
|
|
|
{
|
2013-10-17 04:46:59 +08:00
|
|
|
WARN_ON(current->memcg_oom.may_oom);
|
|
|
|
current->memcg_oom.may_oom = 1;
|
2013-09-13 06:13:42 +08:00
|
|
|
}
|
|
|
|
|
2013-10-17 04:46:59 +08:00
|
|
|
static inline void mem_cgroup_oom_disable(void)
|
2013-09-13 06:13:42 +08:00
|
|
|
{
|
2013-10-17 04:46:59 +08:00
|
|
|
WARN_ON(!current->memcg_oom.may_oom);
|
|
|
|
current->memcg_oom.may_oom = 0;
|
2013-09-13 06:13:42 +08:00
|
|
|
}
|
|
|
|
|
mm: memcg: do not trap chargers with full callstack on OOM
The memcg OOM handling is incredibly fragile and can deadlock. When a
task fails to charge memory, it invokes the OOM killer and loops right
there in the charge code until it succeeds. Comparably, any other task
that enters the charge path at this point will go to a waitqueue right
then and there and sleep until the OOM situation is resolved. The problem
is that these tasks may hold filesystem locks and the mmap_sem; locks that
the selected OOM victim may need to exit.
For example, in one reported case, the task invoking the OOM killer was
about to charge a page cache page during a write(), which holds the
i_mutex. The OOM killer selected a task that was just entering truncate()
and trying to acquire the i_mutex:
OOM invoking task:
mem_cgroup_handle_oom+0x241/0x3b0
mem_cgroup_cache_charge+0xbe/0xe0
add_to_page_cache_locked+0x4c/0x140
add_to_page_cache_lru+0x22/0x50
grab_cache_page_write_begin+0x8b/0xe0
ext3_write_begin+0x88/0x270
generic_file_buffered_write+0x116/0x290
__generic_file_aio_write+0x27c/0x480
generic_file_aio_write+0x76/0xf0 # takes ->i_mutex
do_sync_write+0xea/0x130
vfs_write+0xf3/0x1f0
sys_write+0x51/0x90
system_call_fastpath+0x18/0x1d
OOM kill victim:
do_truncate+0x58/0xa0 # takes i_mutex
do_last+0x250/0xa30
path_openat+0xd7/0x440
do_filp_open+0x49/0xa0
do_sys_open+0x106/0x240
sys_open+0x20/0x30
system_call_fastpath+0x18/0x1d
The OOM handling task will retry the charge indefinitely while the OOM
killed task is not releasing any resources.
A similar scenario can happen when the kernel OOM killer for a memcg is
disabled and a userspace task is in charge of resolving OOM situations.
In this case, ALL tasks that enter the OOM path will be made to sleep on
the OOM waitqueue and wait for userspace to free resources or increase
the group's limit. But a userspace OOM handler is prone to deadlock
itself on the locks held by the waiting tasks. For example one of the
sleeping tasks may be stuck in a brk() call with the mmap_sem held for
writing but the userspace handler, in order to pick an optimal victim,
may need to read files from /proc/<pid>, which tries to acquire the same
mmap_sem for reading and deadlocks.
This patch changes the way tasks behave after detecting a memcg OOM and
makes sure nobody loops or sleeps with locks held:
1. When OOMing in a user fault, invoke the OOM killer and restart the
fault instead of looping on the charge attempt. This way, the OOM
victim can not get stuck on locks the looping task may hold.
2. When OOMing in a user fault but somebody else is handling it
(either the kernel OOM killer or a userspace handler), don't go to
sleep in the charge context. Instead, remember the OOMing memcg in
the task struct and then fully unwind the page fault stack with
-ENOMEM. pagefault_out_of_memory() will then call back into the
memcg code to check if the -ENOMEM came from the memcg, and then
either put the task to sleep on the memcg's OOM waitqueue or just
restart the fault. The OOM victim can no longer get stuck on any
lock a sleeping task may hold.
Debugged by Michal Hocko.
Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Reported-by: azurIt <azurit@pobox.sk>
Acked-by: Michal Hocko <mhocko@suse.cz>
Cc: David Rientjes <rientjes@google.com>
Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2013-09-13 06:13:44 +08:00
|
|
|
static inline bool task_in_memcg_oom(struct task_struct *p)
|
|
|
|
{
|
2013-10-17 04:46:59 +08:00
|
|
|
return p->memcg_oom.memcg;
|
mm: memcg: do not trap chargers with full callstack on OOM
The memcg OOM handling is incredibly fragile and can deadlock. When a
task fails to charge memory, it invokes the OOM killer and loops right
there in the charge code until it succeeds. Comparably, any other task
that enters the charge path at this point will go to a waitqueue right
then and there and sleep until the OOM situation is resolved. The problem
is that these tasks may hold filesystem locks and the mmap_sem; locks that
the selected OOM victim may need to exit.
For example, in one reported case, the task invoking the OOM killer was
about to charge a page cache page during a write(), which holds the
i_mutex. The OOM killer selected a task that was just entering truncate()
and trying to acquire the i_mutex:
OOM invoking task:
mem_cgroup_handle_oom+0x241/0x3b0
mem_cgroup_cache_charge+0xbe/0xe0
add_to_page_cache_locked+0x4c/0x140
add_to_page_cache_lru+0x22/0x50
grab_cache_page_write_begin+0x8b/0xe0
ext3_write_begin+0x88/0x270
generic_file_buffered_write+0x116/0x290
__generic_file_aio_write+0x27c/0x480
generic_file_aio_write+0x76/0xf0 # takes ->i_mutex
do_sync_write+0xea/0x130
vfs_write+0xf3/0x1f0
sys_write+0x51/0x90
system_call_fastpath+0x18/0x1d
OOM kill victim:
do_truncate+0x58/0xa0 # takes i_mutex
do_last+0x250/0xa30
path_openat+0xd7/0x440
do_filp_open+0x49/0xa0
do_sys_open+0x106/0x240
sys_open+0x20/0x30
system_call_fastpath+0x18/0x1d
The OOM handling task will retry the charge indefinitely while the OOM
killed task is not releasing any resources.
A similar scenario can happen when the kernel OOM killer for a memcg is
disabled and a userspace task is in charge of resolving OOM situations.
In this case, ALL tasks that enter the OOM path will be made to sleep on
the OOM waitqueue and wait for userspace to free resources or increase
the group's limit. But a userspace OOM handler is prone to deadlock
itself on the locks held by the waiting tasks. For example one of the
sleeping tasks may be stuck in a brk() call with the mmap_sem held for
writing but the userspace handler, in order to pick an optimal victim,
may need to read files from /proc/<pid>, which tries to acquire the same
mmap_sem for reading and deadlocks.
This patch changes the way tasks behave after detecting a memcg OOM and
makes sure nobody loops or sleeps with locks held:
1. When OOMing in a user fault, invoke the OOM killer and restart the
fault instead of looping on the charge attempt. This way, the OOM
victim can not get stuck on locks the looping task may hold.
2. When OOMing in a user fault but somebody else is handling it
(either the kernel OOM killer or a userspace handler), don't go to
sleep in the charge context. Instead, remember the OOMing memcg in
the task struct and then fully unwind the page fault stack with
-ENOMEM. pagefault_out_of_memory() will then call back into the
memcg code to check if the -ENOMEM came from the memcg, and then
either put the task to sleep on the memcg's OOM waitqueue or just
restart the fault. The OOM victim can no longer get stuck on any
lock a sleeping task may hold.
Debugged by Michal Hocko.
Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Reported-by: azurIt <azurit@pobox.sk>
Acked-by: Michal Hocko <mhocko@suse.cz>
Cc: David Rientjes <rientjes@google.com>
Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2013-09-13 06:13:44 +08:00
|
|
|
}
|
|
|
|
|
2013-10-17 04:46:59 +08:00
|
|
|
bool mem_cgroup_oom_synchronize(bool wait);
|
mm: memcg: do not trap chargers with full callstack on OOM
The memcg OOM handling is incredibly fragile and can deadlock. When a
task fails to charge memory, it invokes the OOM killer and loops right
there in the charge code until it succeeds. Comparably, any other task
that enters the charge path at this point will go to a waitqueue right
then and there and sleep until the OOM situation is resolved. The problem
is that these tasks may hold filesystem locks and the mmap_sem; locks that
the selected OOM victim may need to exit.
For example, in one reported case, the task invoking the OOM killer was
about to charge a page cache page during a write(), which holds the
i_mutex. The OOM killer selected a task that was just entering truncate()
and trying to acquire the i_mutex:
OOM invoking task:
mem_cgroup_handle_oom+0x241/0x3b0
mem_cgroup_cache_charge+0xbe/0xe0
add_to_page_cache_locked+0x4c/0x140
add_to_page_cache_lru+0x22/0x50
grab_cache_page_write_begin+0x8b/0xe0
ext3_write_begin+0x88/0x270
generic_file_buffered_write+0x116/0x290
__generic_file_aio_write+0x27c/0x480
generic_file_aio_write+0x76/0xf0 # takes ->i_mutex
do_sync_write+0xea/0x130
vfs_write+0xf3/0x1f0
sys_write+0x51/0x90
system_call_fastpath+0x18/0x1d
OOM kill victim:
do_truncate+0x58/0xa0 # takes i_mutex
do_last+0x250/0xa30
path_openat+0xd7/0x440
do_filp_open+0x49/0xa0
do_sys_open+0x106/0x240
sys_open+0x20/0x30
system_call_fastpath+0x18/0x1d
The OOM handling task will retry the charge indefinitely while the OOM
killed task is not releasing any resources.
A similar scenario can happen when the kernel OOM killer for a memcg is
disabled and a userspace task is in charge of resolving OOM situations.
In this case, ALL tasks that enter the OOM path will be made to sleep on
the OOM waitqueue and wait for userspace to free resources or increase
the group's limit. But a userspace OOM handler is prone to deadlock
itself on the locks held by the waiting tasks. For example one of the
sleeping tasks may be stuck in a brk() call with the mmap_sem held for
writing but the userspace handler, in order to pick an optimal victim,
may need to read files from /proc/<pid>, which tries to acquire the same
mmap_sem for reading and deadlocks.
This patch changes the way tasks behave after detecting a memcg OOM and
makes sure nobody loops or sleeps with locks held:
1. When OOMing in a user fault, invoke the OOM killer and restart the
fault instead of looping on the charge attempt. This way, the OOM
victim can not get stuck on locks the looping task may hold.
2. When OOMing in a user fault but somebody else is handling it
(either the kernel OOM killer or a userspace handler), don't go to
sleep in the charge context. Instead, remember the OOMing memcg in
the task struct and then fully unwind the page fault stack with
-ENOMEM. pagefault_out_of_memory() will then call back into the
memcg code to check if the -ENOMEM came from the memcg, and then
either put the task to sleep on the memcg's OOM waitqueue or just
restart the fault. The OOM victim can no longer get stuck on any
lock a sleeping task may hold.
Debugged by Michal Hocko.
Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Reported-by: azurIt <azurit@pobox.sk>
Acked-by: Michal Hocko <mhocko@suse.cz>
Cc: David Rientjes <rientjes@google.com>
Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2013-09-13 06:13:44 +08:00
|
|
|
|
2012-08-01 07:43:02 +08:00
|
|
|
#ifdef CONFIG_MEMCG_SWAP
|
2009-01-08 10:07:57 +08:00
|
|
|
extern int do_swap_account;
|
|
|
|
#endif
|
2009-01-08 10:08:02 +08:00
|
|
|
|
|
|
|
static inline bool mem_cgroup_disabled(void)
|
|
|
|
{
|
|
|
|
if (mem_cgroup_subsys.disabled)
|
|
|
|
return true;
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
|
memcg: use new logic for page stat accounting
Now, page-stat-per-memcg is recorded into per page_cgroup flag by
duplicating page's status into the flag. The reason is that memcg has a
feature to move a page from a group to another group and we have race
between "move" and "page stat accounting",
Under current logic, assume CPU-A and CPU-B. CPU-A does "move" and CPU-B
does "page stat accounting".
When CPU-A goes 1st,
CPU-A CPU-B
update "struct page" info.
move_lock_mem_cgroup(memcg)
see pc->flags
copy page stat to new group
overwrite pc->mem_cgroup.
move_unlock_mem_cgroup(memcg)
move_lock_mem_cgroup(mem)
set pc->flags
update page stat accounting
move_unlock_mem_cgroup(mem)
stat accounting is guarded by move_lock_mem_cgroup() and "move" logic
(CPU-A) doesn't see changes in "struct page" information.
But it's costly to have the same information both in 'struct page' and
'struct page_cgroup'. And, there is a potential problem.
For example, assume we have PG_dirty accounting in memcg.
PG_..is a flag for struct page.
PCG_ is a flag for struct page_cgroup.
(This is just an example. The same problem can be found in any
kind of page stat accounting.)
CPU-A CPU-B
TestSet PG_dirty
(delay) TestClear PG_dirty
if (TestClear(PCG_dirty))
memcg->nr_dirty--
if (TestSet(PCG_dirty))
memcg->nr_dirty++
Here, memcg->nr_dirty = +1, this is wrong. This race was reported by Greg
Thelen <gthelen@google.com>. Now, only FILE_MAPPED is supported but
fortunately, it's serialized by page table lock and this is not real bug,
_now_,
If this potential problem is caused by having duplicated information in
struct page and struct page_cgroup, we may be able to fix this by using
original 'struct page' information. But we'll have a problem in "move
account"
Assume we use only PG_dirty.
CPU-A CPU-B
TestSet PG_dirty
(delay) move_lock_mem_cgroup()
if (PageDirty(page))
new_memcg->nr_dirty++
pc->mem_cgroup = new_memcg;
move_unlock_mem_cgroup()
move_lock_mem_cgroup()
memcg = pc->mem_cgroup
new_memcg->nr_dirty++
accounting information may be double-counted. This was original reason to
have PCG_xxx flags but it seems PCG_xxx has another problem.
I think we need a bigger lock as
move_lock_mem_cgroup(page)
TestSetPageDirty(page)
update page stats (without any checks)
move_unlock_mem_cgroup(page)
This fixes both of problems and we don't have to duplicate page flag into
page_cgroup. Please note: move_lock_mem_cgroup() is held only when there
are possibility of "account move" under the system. So, in most path,
status update will go without atomic locks.
This patch introduces mem_cgroup_begin_update_page_stat() and
mem_cgroup_end_update_page_stat() both should be called at modifying
'struct page' information if memcg takes care of it. as
mem_cgroup_begin_update_page_stat()
modify page information
mem_cgroup_update_page_stat()
=> never check any 'struct page' info, just update counters.
mem_cgroup_end_update_page_stat().
This patch is slow because we need to call begin_update_page_stat()/
end_update_page_stat() regardless of accounted will be changed or not. A
following patch adds an easy optimization and reduces the cost.
[akpm@linux-foundation.org: s/lock/locked/]
[hughd@google.com: fix deadlock by avoiding stat lock when anon]
Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Greg Thelen <gthelen@google.com>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Cc: Michal Hocko <mhocko@suse.cz>
Cc: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Cc: Ying Han <yinghan@google.com>
Signed-off-by: Hugh Dickins <hughd@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2012-03-22 07:34:25 +08:00
|
|
|
void __mem_cgroup_begin_update_page_stat(struct page *page, bool *locked,
|
|
|
|
unsigned long *flags);
|
|
|
|
|
2012-03-22 07:34:26 +08:00
|
|
|
extern atomic_t memcg_moving;
|
|
|
|
|
memcg: use new logic for page stat accounting
Now, page-stat-per-memcg is recorded into per page_cgroup flag by
duplicating page's status into the flag. The reason is that memcg has a
feature to move a page from a group to another group and we have race
between "move" and "page stat accounting",
Under current logic, assume CPU-A and CPU-B. CPU-A does "move" and CPU-B
does "page stat accounting".
When CPU-A goes 1st,
CPU-A CPU-B
update "struct page" info.
move_lock_mem_cgroup(memcg)
see pc->flags
copy page stat to new group
overwrite pc->mem_cgroup.
move_unlock_mem_cgroup(memcg)
move_lock_mem_cgroup(mem)
set pc->flags
update page stat accounting
move_unlock_mem_cgroup(mem)
stat accounting is guarded by move_lock_mem_cgroup() and "move" logic
(CPU-A) doesn't see changes in "struct page" information.
But it's costly to have the same information both in 'struct page' and
'struct page_cgroup'. And, there is a potential problem.
For example, assume we have PG_dirty accounting in memcg.
PG_..is a flag for struct page.
PCG_ is a flag for struct page_cgroup.
(This is just an example. The same problem can be found in any
kind of page stat accounting.)
CPU-A CPU-B
TestSet PG_dirty
(delay) TestClear PG_dirty
if (TestClear(PCG_dirty))
memcg->nr_dirty--
if (TestSet(PCG_dirty))
memcg->nr_dirty++
Here, memcg->nr_dirty = +1, this is wrong. This race was reported by Greg
Thelen <gthelen@google.com>. Now, only FILE_MAPPED is supported but
fortunately, it's serialized by page table lock and this is not real bug,
_now_,
If this potential problem is caused by having duplicated information in
struct page and struct page_cgroup, we may be able to fix this by using
original 'struct page' information. But we'll have a problem in "move
account"
Assume we use only PG_dirty.
CPU-A CPU-B
TestSet PG_dirty
(delay) move_lock_mem_cgroup()
if (PageDirty(page))
new_memcg->nr_dirty++
pc->mem_cgroup = new_memcg;
move_unlock_mem_cgroup()
move_lock_mem_cgroup()
memcg = pc->mem_cgroup
new_memcg->nr_dirty++
accounting information may be double-counted. This was original reason to
have PCG_xxx flags but it seems PCG_xxx has another problem.
I think we need a bigger lock as
move_lock_mem_cgroup(page)
TestSetPageDirty(page)
update page stats (without any checks)
move_unlock_mem_cgroup(page)
This fixes both of problems and we don't have to duplicate page flag into
page_cgroup. Please note: move_lock_mem_cgroup() is held only when there
are possibility of "account move" under the system. So, in most path,
status update will go without atomic locks.
This patch introduces mem_cgroup_begin_update_page_stat() and
mem_cgroup_end_update_page_stat() both should be called at modifying
'struct page' information if memcg takes care of it. as
mem_cgroup_begin_update_page_stat()
modify page information
mem_cgroup_update_page_stat()
=> never check any 'struct page' info, just update counters.
mem_cgroup_end_update_page_stat().
This patch is slow because we need to call begin_update_page_stat()/
end_update_page_stat() regardless of accounted will be changed or not. A
following patch adds an easy optimization and reduces the cost.
[akpm@linux-foundation.org: s/lock/locked/]
[hughd@google.com: fix deadlock by avoiding stat lock when anon]
Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Greg Thelen <gthelen@google.com>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Cc: Michal Hocko <mhocko@suse.cz>
Cc: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Cc: Ying Han <yinghan@google.com>
Signed-off-by: Hugh Dickins <hughd@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2012-03-22 07:34:25 +08:00
|
|
|
static inline void mem_cgroup_begin_update_page_stat(struct page *page,
|
|
|
|
bool *locked, unsigned long *flags)
|
|
|
|
{
|
|
|
|
if (mem_cgroup_disabled())
|
|
|
|
return;
|
|
|
|
rcu_read_lock();
|
|
|
|
*locked = false;
|
2012-03-22 07:34:26 +08:00
|
|
|
if (atomic_read(&memcg_moving))
|
|
|
|
__mem_cgroup_begin_update_page_stat(page, locked, flags);
|
memcg: use new logic for page stat accounting
Now, page-stat-per-memcg is recorded into per page_cgroup flag by
duplicating page's status into the flag. The reason is that memcg has a
feature to move a page from a group to another group and we have race
between "move" and "page stat accounting",
Under current logic, assume CPU-A and CPU-B. CPU-A does "move" and CPU-B
does "page stat accounting".
When CPU-A goes 1st,
CPU-A CPU-B
update "struct page" info.
move_lock_mem_cgroup(memcg)
see pc->flags
copy page stat to new group
overwrite pc->mem_cgroup.
move_unlock_mem_cgroup(memcg)
move_lock_mem_cgroup(mem)
set pc->flags
update page stat accounting
move_unlock_mem_cgroup(mem)
stat accounting is guarded by move_lock_mem_cgroup() and "move" logic
(CPU-A) doesn't see changes in "struct page" information.
But it's costly to have the same information both in 'struct page' and
'struct page_cgroup'. And, there is a potential problem.
For example, assume we have PG_dirty accounting in memcg.
PG_..is a flag for struct page.
PCG_ is a flag for struct page_cgroup.
(This is just an example. The same problem can be found in any
kind of page stat accounting.)
CPU-A CPU-B
TestSet PG_dirty
(delay) TestClear PG_dirty
if (TestClear(PCG_dirty))
memcg->nr_dirty--
if (TestSet(PCG_dirty))
memcg->nr_dirty++
Here, memcg->nr_dirty = +1, this is wrong. This race was reported by Greg
Thelen <gthelen@google.com>. Now, only FILE_MAPPED is supported but
fortunately, it's serialized by page table lock and this is not real bug,
_now_,
If this potential problem is caused by having duplicated information in
struct page and struct page_cgroup, we may be able to fix this by using
original 'struct page' information. But we'll have a problem in "move
account"
Assume we use only PG_dirty.
CPU-A CPU-B
TestSet PG_dirty
(delay) move_lock_mem_cgroup()
if (PageDirty(page))
new_memcg->nr_dirty++
pc->mem_cgroup = new_memcg;
move_unlock_mem_cgroup()
move_lock_mem_cgroup()
memcg = pc->mem_cgroup
new_memcg->nr_dirty++
accounting information may be double-counted. This was original reason to
have PCG_xxx flags but it seems PCG_xxx has another problem.
I think we need a bigger lock as
move_lock_mem_cgroup(page)
TestSetPageDirty(page)
update page stats (without any checks)
move_unlock_mem_cgroup(page)
This fixes both of problems and we don't have to duplicate page flag into
page_cgroup. Please note: move_lock_mem_cgroup() is held only when there
are possibility of "account move" under the system. So, in most path,
status update will go without atomic locks.
This patch introduces mem_cgroup_begin_update_page_stat() and
mem_cgroup_end_update_page_stat() both should be called at modifying
'struct page' information if memcg takes care of it. as
mem_cgroup_begin_update_page_stat()
modify page information
mem_cgroup_update_page_stat()
=> never check any 'struct page' info, just update counters.
mem_cgroup_end_update_page_stat().
This patch is slow because we need to call begin_update_page_stat()/
end_update_page_stat() regardless of accounted will be changed or not. A
following patch adds an easy optimization and reduces the cost.
[akpm@linux-foundation.org: s/lock/locked/]
[hughd@google.com: fix deadlock by avoiding stat lock when anon]
Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Greg Thelen <gthelen@google.com>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Cc: Michal Hocko <mhocko@suse.cz>
Cc: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Cc: Ying Han <yinghan@google.com>
Signed-off-by: Hugh Dickins <hughd@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2012-03-22 07:34:25 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
void __mem_cgroup_end_update_page_stat(struct page *page,
|
|
|
|
unsigned long *flags);
|
|
|
|
static inline void mem_cgroup_end_update_page_stat(struct page *page,
|
|
|
|
bool *locked, unsigned long *flags)
|
|
|
|
{
|
|
|
|
if (mem_cgroup_disabled())
|
|
|
|
return;
|
|
|
|
if (*locked)
|
|
|
|
__mem_cgroup_end_update_page_stat(page, flags);
|
|
|
|
rcu_read_unlock();
|
|
|
|
}
|
|
|
|
|
2011-01-14 07:47:37 +08:00
|
|
|
void mem_cgroup_update_page_stat(struct page *page,
|
2013-09-13 06:13:50 +08:00
|
|
|
enum mem_cgroup_stat_index idx,
|
2011-01-14 07:47:37 +08:00
|
|
|
int val);
|
|
|
|
|
|
|
|
static inline void mem_cgroup_inc_page_stat(struct page *page,
|
2013-09-13 06:13:50 +08:00
|
|
|
enum mem_cgroup_stat_index idx)
|
2011-01-14 07:47:37 +08:00
|
|
|
{
|
|
|
|
mem_cgroup_update_page_stat(page, idx, 1);
|
|
|
|
}
|
|
|
|
|
|
|
|
static inline void mem_cgroup_dec_page_stat(struct page *page,
|
2013-09-13 06:13:50 +08:00
|
|
|
enum mem_cgroup_stat_index idx)
|
2011-01-14 07:47:37 +08:00
|
|
|
{
|
|
|
|
mem_cgroup_update_page_stat(page, idx, -1);
|
|
|
|
}
|
|
|
|
|
2013-09-25 06:27:41 +08:00
|
|
|
unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order,
|
|
|
|
gfp_t gfp_mask,
|
|
|
|
unsigned long *total_scanned);
|
oom: badness heuristic rewrite
This a complete rewrite of the oom killer's badness() heuristic which is
used to determine which task to kill in oom conditions. The goal is to
make it as simple and predictable as possible so the results are better
understood and we end up killing the task which will lead to the most
memory freeing while still respecting the fine-tuning from userspace.
Instead of basing the heuristic on mm->total_vm for each task, the task's
rss and swap space is used instead. This is a better indication of the
amount of memory that will be freeable if the oom killed task is chosen
and subsequently exits. This helps specifically in cases where KDE or
GNOME is chosen for oom kill on desktop systems instead of a memory
hogging task.
The baseline for the heuristic is a proportion of memory that each task is
currently using in memory plus swap compared to the amount of "allowable"
memory. "Allowable," in this sense, means the system-wide resources for
unconstrained oom conditions, the set of mempolicy nodes, the mems
attached to current's cpuset, or a memory controller's limit. The
proportion is given on a scale of 0 (never kill) to 1000 (always kill),
roughly meaning that if a task has a badness() score of 500 that the task
consumes approximately 50% of allowable memory resident in RAM or in swap
space.
The proportion is always relative to the amount of "allowable" memory and
not the total amount of RAM systemwide so that mempolicies and cpusets may
operate in isolation; they shall not need to know the true size of the
machine on which they are running if they are bound to a specific set of
nodes or mems, respectively.
Root tasks are given 3% extra memory just like __vm_enough_memory()
provides in LSMs. In the event of two tasks consuming similar amounts of
memory, it is generally better to save root's task.
Because of the change in the badness() heuristic's baseline, it is also
necessary to introduce a new user interface to tune it. It's not possible
to redefine the meaning of /proc/pid/oom_adj with a new scale since the
ABI cannot be changed for backward compatability. Instead, a new tunable,
/proc/pid/oom_score_adj, is added that ranges from -1000 to +1000. It may
be used to polarize the heuristic such that certain tasks are never
considered for oom kill while others may always be considered. The value
is added directly into the badness() score so a value of -500, for
example, means to discount 50% of its memory consumption in comparison to
other tasks either on the system, bound to the mempolicy, in the cpuset,
or sharing the same memory controller.
/proc/pid/oom_adj is changed so that its meaning is rescaled into the
units used by /proc/pid/oom_score_adj, and vice versa. Changing one of
these per-task tunables will rescale the value of the other to an
equivalent meaning. Although /proc/pid/oom_adj was originally defined as
a bitshift on the badness score, it now shares the same linear growth as
/proc/pid/oom_score_adj but with different granularity. This is required
so the ABI is not broken with userspace applications and allows oom_adj to
be deprecated for future removal.
Signed-off-by: David Rientjes <rientjes@google.com>
Cc: Nick Piggin <npiggin@suse.de>
Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Balbir Singh <balbir@in.ibm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2010-08-10 08:19:46 +08:00
|
|
|
|
2012-12-13 05:51:57 +08:00
|
|
|
void __mem_cgroup_count_vm_event(struct mm_struct *mm, enum vm_event_item idx);
|
|
|
|
static inline void mem_cgroup_count_vm_event(struct mm_struct *mm,
|
|
|
|
enum vm_event_item idx)
|
|
|
|
{
|
|
|
|
if (mem_cgroup_disabled())
|
|
|
|
return;
|
|
|
|
__mem_cgroup_count_vm_event(mm, idx);
|
|
|
|
}
|
2011-01-21 06:44:24 +08:00
|
|
|
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
|
2012-01-13 09:18:20 +08:00
|
|
|
void mem_cgroup_split_huge_fixup(struct page *head);
|
2011-01-21 06:44:24 +08:00
|
|
|
#endif
|
|
|
|
|
2011-03-24 07:42:25 +08:00
|
|
|
#ifdef CONFIG_DEBUG_VM
|
|
|
|
bool mem_cgroup_bad_page_check(struct page *page);
|
|
|
|
void mem_cgroup_print_bad_page(struct page *page);
|
|
|
|
#endif
|
2012-08-01 07:43:02 +08:00
|
|
|
#else /* CONFIG_MEMCG */
|
2009-01-08 10:07:48 +08:00
|
|
|
struct mem_cgroup;
|
|
|
|
|
|
|
|
static inline int mem_cgroup_newpage_charge(struct page *page,
|
2008-03-05 06:29:08 +08:00
|
|
|
struct mm_struct *mm, gfp_t gfp_mask)
|
2008-02-07 16:13:53 +08:00
|
|
|
{
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2008-03-05 06:29:08 +08:00
|
|
|
static inline int mem_cgroup_cache_charge(struct page *page,
|
|
|
|
struct mm_struct *mm, gfp_t gfp_mask)
|
2008-02-07 16:13:53 +08:00
|
|
|
{
|
2008-03-05 06:29:08 +08:00
|
|
|
return 0;
|
2008-02-07 16:13:53 +08:00
|
|
|
}
|
|
|
|
|
2009-01-08 10:08:00 +08:00
|
|
|
static inline int mem_cgroup_try_charge_swapin(struct mm_struct *mm,
|
2012-01-13 09:18:32 +08:00
|
|
|
struct page *page, gfp_t gfp_mask, struct mem_cgroup **memcgp)
|
2009-01-08 10:07:48 +08:00
|
|
|
{
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
static inline void mem_cgroup_commit_charge_swapin(struct page *page,
|
2012-01-13 09:18:32 +08:00
|
|
|
struct mem_cgroup *memcg)
|
2009-01-08 10:07:48 +08:00
|
|
|
{
|
|
|
|
}
|
|
|
|
|
2012-01-13 09:18:32 +08:00
|
|
|
static inline void mem_cgroup_cancel_charge_swapin(struct mem_cgroup *memcg)
|
2009-01-08 10:07:48 +08:00
|
|
|
{
|
|
|
|
}
|
|
|
|
|
2009-12-16 08:47:03 +08:00
|
|
|
static inline void mem_cgroup_uncharge_start(void)
|
|
|
|
{
|
|
|
|
}
|
|
|
|
|
|
|
|
static inline void mem_cgroup_uncharge_end(void)
|
|
|
|
{
|
|
|
|
}
|
|
|
|
|
2008-02-07 16:13:53 +08:00
|
|
|
static inline void mem_cgroup_uncharge_page(struct page *page)
|
|
|
|
{
|
|
|
|
}
|
|
|
|
|
memcg: remove refcnt from page_cgroup
memcg: performance improvements
Patch Description
1/5 ... remove refcnt fron page_cgroup patch (shmem handling is fixed)
2/5 ... swapcache handling patch
3/5 ... add helper function for shmem's memory reclaim patch
4/5 ... optimize by likely/unlikely ppatch
5/5 ... remove redundunt check patch (shmem handling is fixed.)
Unix bench result.
== 2.6.26-rc2-mm1 + memory resource controller
Execl Throughput 2915.4 lps (29.6 secs, 3 samples)
C Compiler Throughput 1019.3 lpm (60.0 secs, 3 samples)
Shell Scripts (1 concurrent) 5796.0 lpm (60.0 secs, 3 samples)
Shell Scripts (8 concurrent) 1097.7 lpm (60.0 secs, 3 samples)
Shell Scripts (16 concurrent) 565.3 lpm (60.0 secs, 3 samples)
File Read 1024 bufsize 2000 maxblocks 1022128.0 KBps (30.0 secs, 3 samples)
File Write 1024 bufsize 2000 maxblocks 544057.0 KBps (30.0 secs, 3 samples)
File Copy 1024 bufsize 2000 maxblocks 346481.0 KBps (30.0 secs, 3 samples)
File Read 256 bufsize 500 maxblocks 319325.0 KBps (30.0 secs, 3 samples)
File Write 256 bufsize 500 maxblocks 148788.0 KBps (30.0 secs, 3 samples)
File Copy 256 bufsize 500 maxblocks 99051.0 KBps (30.0 secs, 3 samples)
File Read 4096 bufsize 8000 maxblocks 2058917.0 KBps (30.0 secs, 3 samples)
File Write 4096 bufsize 8000 maxblocks 1606109.0 KBps (30.0 secs, 3 samples)
File Copy 4096 bufsize 8000 maxblocks 854789.0 KBps (30.0 secs, 3 samples)
Dc: sqrt(2) to 99 decimal places 126145.2 lpm (30.0 secs, 3 samples)
INDEX VALUES
TEST BASELINE RESULT INDEX
Execl Throughput 43.0 2915.4 678.0
File Copy 1024 bufsize 2000 maxblocks 3960.0 346481.0 875.0
File Copy 256 bufsize 500 maxblocks 1655.0 99051.0 598.5
File Copy 4096 bufsize 8000 maxblocks 5800.0 854789.0 1473.8
Shell Scripts (8 concurrent) 6.0 1097.7 1829.5
=========
FINAL SCORE 991.3
== 2.6.26-rc2-mm1 + this set ==
Execl Throughput 3012.9 lps (29.9 secs, 3 samples)
C Compiler Throughput 981.0 lpm (60.0 secs, 3 samples)
Shell Scripts (1 concurrent) 5872.0 lpm (60.0 secs, 3 samples)
Shell Scripts (8 concurrent) 1120.3 lpm (60.0 secs, 3 samples)
Shell Scripts (16 concurrent) 578.0 lpm (60.0 secs, 3 samples)
File Read 1024 bufsize 2000 maxblocks 1003993.0 KBps (30.0 secs, 3 samples)
File Write 1024 bufsize 2000 maxblocks 550452.0 KBps (30.0 secs, 3 samples)
File Copy 1024 bufsize 2000 maxblocks 347159.0 KBps (30.0 secs, 3 samples)
File Read 256 bufsize 500 maxblocks 314644.0 KBps (30.0 secs, 3 samples)
File Write 256 bufsize 500 maxblocks 151852.0 KBps (30.0 secs, 3 samples)
File Copy 256 bufsize 500 maxblocks 101000.0 KBps (30.0 secs, 3 samples)
File Read 4096 bufsize 8000 maxblocks 2033256.0 KBps (30.0 secs, 3 samples)
File Write 4096 bufsize 8000 maxblocks 1611814.0 KBps (30.0 secs, 3 samples)
File Copy 4096 bufsize 8000 maxblocks 847979.0 KBps (30.0 secs, 3 samples)
Dc: sqrt(2) to 99 decimal places 128148.7 lpm (30.0 secs, 3 samples)
INDEX VALUES
TEST BASELINE RESULT INDEX
Execl Throughput 43.0 3012.9 700.7
File Copy 1024 bufsize 2000 maxblocks 3960.0 347159.0 876.7
File Copy 256 bufsize 500 maxblocks 1655.0 101000.0 610.3
File Copy 4096 bufsize 8000 maxblocks 5800.0 847979.0 1462.0
Shell Scripts (8 concurrent) 6.0 1120.3 1867.2
=========
FINAL SCORE 1004.6
This patch:
Remove refcnt from page_cgroup().
After this,
* A page is charged only when !page_mapped() && no page_cgroup is assigned.
* Anon page is newly mapped.
* File page is added to mapping->tree.
* A page is uncharged only when
* Anon page is fully unmapped.
* File page is removed from LRU.
There is no change in behavior from user's view.
This patch also removes unnecessary calls in rmap.c which was used only for
refcnt mangement.
[akpm@linux-foundation.org: fix warning]
[hugh@veritas.com: fix shmem_unuse_inode charging]
Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Balbir Singh <balbir@in.ibm.com>
Cc: "Eric W. Biederman" <ebiederm@xmission.com>
Cc: Pavel Emelyanov <xemul@openvz.org>
Cc: Li Zefan <lizf@cn.fujitsu.com>
Cc: Hugh Dickins <hugh@veritas.com>
Cc: YAMAMOTO Takashi <yamamoto@valinux.co.jp>
Cc: Paul Menage <menage@google.com>
Cc: David Rientjes <rientjes@google.com>
Signed-off-by: Hugh Dickins <hugh@veritas.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2008-07-25 16:47:14 +08:00
|
|
|
static inline void mem_cgroup_uncharge_cache_page(struct page *page)
|
|
|
|
{
|
|
|
|
}
|
|
|
|
|
2012-01-13 09:18:15 +08:00
|
|
|
static inline struct lruvec *mem_cgroup_zone_lruvec(struct zone *zone,
|
|
|
|
struct mem_cgroup *memcg)
|
memcg: synchronized LRU
A big patch for changing memcg's LRU semantics.
Now,
- page_cgroup is linked to mem_cgroup's its own LRU (per zone).
- LRU of page_cgroup is not synchronous with global LRU.
- page and page_cgroup is one-to-one and statically allocated.
- To find page_cgroup is on what LRU, you have to check pc->mem_cgroup as
- lru = page_cgroup_zoneinfo(pc, nid_of_pc, zid_of_pc);
- SwapCache is handled.
And, when we handle LRU list of page_cgroup, we do following.
pc = lookup_page_cgroup(page);
lock_page_cgroup(pc); .....................(1)
mz = page_cgroup_zoneinfo(pc);
spin_lock(&mz->lru_lock);
.....add to LRU
spin_unlock(&mz->lru_lock);
unlock_page_cgroup(pc);
But (1) is spin_lock and we have to be afraid of dead-lock with zone->lru_lock.
So, trylock() is used at (1), now. Without (1), we can't trust "mz" is correct.
This is a trial to remove this dirty nesting of locks.
This patch changes mz->lru_lock to be zone->lru_lock.
Then, above sequence will be written as
spin_lock(&zone->lru_lock); # in vmscan.c or swap.c via global LRU
mem_cgroup_add/remove/etc_lru() {
pc = lookup_page_cgroup(page);
mz = page_cgroup_zoneinfo(pc);
if (PageCgroupUsed(pc)) {
....add to LRU
}
spin_lock(&zone->lru_lock); # in vmscan.c or swap.c via global LRU
This is much simpler.
(*) We're safe even if we don't take lock_page_cgroup(pc). Because..
1. When pc->mem_cgroup can be modified.
- at charge.
- at account_move().
2. at charge
the PCG_USED bit is not set before pc->mem_cgroup is fixed.
3. at account_move()
the page is isolated and not on LRU.
Pros.
- easy for maintenance.
- memcg can make use of laziness of pagevec.
- we don't have to duplicated LRU/Active/Unevictable bit in page_cgroup.
- LRU status of memcg will be synchronized with global LRU's one.
- # of locks are reduced.
- account_move() is simplified very much.
Cons.
- may increase cost of LRU rotation.
(no impact if memcg is not configured.)
Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Li Zefan <lizf@cn.fujitsu.com>
Cc: Balbir Singh <balbir@in.ibm.com>
Cc: Pavel Emelyanov <xemul@openvz.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2009-01-08 10:08:01 +08:00
|
|
|
{
|
2012-01-13 09:18:15 +08:00
|
|
|
return &zone->lruvec;
|
memcg: synchronized LRU
A big patch for changing memcg's LRU semantics.
Now,
- page_cgroup is linked to mem_cgroup's its own LRU (per zone).
- LRU of page_cgroup is not synchronous with global LRU.
- page and page_cgroup is one-to-one and statically allocated.
- To find page_cgroup is on what LRU, you have to check pc->mem_cgroup as
- lru = page_cgroup_zoneinfo(pc, nid_of_pc, zid_of_pc);
- SwapCache is handled.
And, when we handle LRU list of page_cgroup, we do following.
pc = lookup_page_cgroup(page);
lock_page_cgroup(pc); .....................(1)
mz = page_cgroup_zoneinfo(pc);
spin_lock(&mz->lru_lock);
.....add to LRU
spin_unlock(&mz->lru_lock);
unlock_page_cgroup(pc);
But (1) is spin_lock and we have to be afraid of dead-lock with zone->lru_lock.
So, trylock() is used at (1), now. Without (1), we can't trust "mz" is correct.
This is a trial to remove this dirty nesting of locks.
This patch changes mz->lru_lock to be zone->lru_lock.
Then, above sequence will be written as
spin_lock(&zone->lru_lock); # in vmscan.c or swap.c via global LRU
mem_cgroup_add/remove/etc_lru() {
pc = lookup_page_cgroup(page);
mz = page_cgroup_zoneinfo(pc);
if (PageCgroupUsed(pc)) {
....add to LRU
}
spin_lock(&zone->lru_lock); # in vmscan.c or swap.c via global LRU
This is much simpler.
(*) We're safe even if we don't take lock_page_cgroup(pc). Because..
1. When pc->mem_cgroup can be modified.
- at charge.
- at account_move().
2. at charge
the PCG_USED bit is not set before pc->mem_cgroup is fixed.
3. at account_move()
the page is isolated and not on LRU.
Pros.
- easy for maintenance.
- memcg can make use of laziness of pagevec.
- we don't have to duplicated LRU/Active/Unevictable bit in page_cgroup.
- LRU status of memcg will be synchronized with global LRU's one.
- # of locks are reduced.
- account_move() is simplified very much.
Cons.
- may increase cost of LRU rotation.
(no impact if memcg is not configured.)
Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Li Zefan <lizf@cn.fujitsu.com>
Cc: Balbir Singh <balbir@in.ibm.com>
Cc: Pavel Emelyanov <xemul@openvz.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2009-01-08 10:08:01 +08:00
|
|
|
}
|
|
|
|
|
2012-05-30 06:07:09 +08:00
|
|
|
static inline struct lruvec *mem_cgroup_page_lruvec(struct page *page,
|
|
|
|
struct zone *zone)
|
2008-02-07 16:13:56 +08:00
|
|
|
{
|
2012-01-13 09:18:15 +08:00
|
|
|
return &zone->lruvec;
|
2008-02-07 16:13:56 +08:00
|
|
|
}
|
|
|
|
|
2009-12-16 19:19:59 +08:00
|
|
|
static inline struct mem_cgroup *try_get_mem_cgroup_from_page(struct page *page)
|
|
|
|
{
|
|
|
|
return NULL;
|
|
|
|
}
|
|
|
|
|
2011-06-16 06:08:13 +08:00
|
|
|
static inline struct mem_cgroup *try_get_mem_cgroup_from_mm(struct mm_struct *mm)
|
|
|
|
{
|
|
|
|
return NULL;
|
|
|
|
}
|
|
|
|
|
2012-10-09 07:34:12 +08:00
|
|
|
static inline bool mm_match_cgroup(struct mm_struct *mm,
|
2011-11-03 04:38:15 +08:00
|
|
|
struct mem_cgroup *memcg)
|
2008-02-07 16:14:01 +08:00
|
|
|
{
|
2012-10-09 07:34:12 +08:00
|
|
|
return true;
|
2008-02-07 16:14:01 +08:00
|
|
|
}
|
|
|
|
|
2013-07-04 06:01:23 +08:00
|
|
|
static inline bool task_in_mem_cgroup(struct task_struct *task,
|
|
|
|
const struct mem_cgroup *memcg)
|
2008-02-07 16:14:06 +08:00
|
|
|
{
|
2013-07-04 06:01:23 +08:00
|
|
|
return true;
|
2008-02-07 16:14:06 +08:00
|
|
|
}
|
|
|
|
|
2011-11-03 04:38:15 +08:00
|
|
|
static inline struct cgroup_subsys_state
|
|
|
|
*mem_cgroup_css(struct mem_cgroup *memcg)
|
2009-12-16 19:19:59 +08:00
|
|
|
{
|
|
|
|
return NULL;
|
|
|
|
}
|
|
|
|
|
2012-08-01 07:45:25 +08:00
|
|
|
static inline void
|
memcg: fix mis-accounting of file mapped racy with migration
FILE_MAPPED per memcg of migrated file cache is not properly updated,
because our hook in page_add_file_rmap() can't know to which memcg
FILE_MAPPED should be counted.
Basically, this patch is for fixing the bug but includes some big changes
to fix up other messes.
Now, at migrating mapped file, events happen in following sequence.
1. allocate a new page.
2. get memcg of an old page.
3. charge ageinst a new page before migration. But at this point,
no changes to new page's page_cgroup, no commit for the charge.
(IOW, PCG_USED bit is not set.)
4. page migration replaces radix-tree, old-page and new-page.
5. page migration remaps the new page if the old page was mapped.
6. Here, the new page is unlocked.
7. memcg commits the charge for newpage, Mark the new page's page_cgroup
as PCG_USED.
Because "commit" happens after page-remap, we can count FILE_MAPPED
at "5", because we should avoid to trust page_cgroup->mem_cgroup.
if PCG_USED bit is unset.
(Note: memcg's LRU removal code does that but LRU-isolation logic is used
for helping it. When we overwrite page_cgroup->mem_cgroup, page_cgroup is
not on LRU or page_cgroup->mem_cgroup is NULL.)
We can lose file_mapped accounting information at 5 because FILE_MAPPED
is updated only when mapcount changes 0->1. So we should catch it.
BTW, historically, above implemntation comes from migration-failure
of anonymous page. Because we charge both of old page and new page
with mapcount=0, we can't catch
- the page is really freed before remap.
- migration fails but it's freed before remap
or .....corner cases.
New migration sequence with memcg is:
1. allocate a new page.
2. mark PageCgroupMigration to the old page.
3. charge against a new page onto the old page's memcg. (here, new page's pc
is marked as PageCgroupUsed.)
4. page migration replaces radix-tree, page table, etc...
5. At remapping, new page's page_cgroup is now makrked as "USED"
We can catch 0->1 event and FILE_MAPPED will be properly updated.
And we can catch SWAPOUT event after unlock this and freeing this
page by unmap() can be caught.
7. Clear PageCgroupMigration of the old page.
So, FILE_MAPPED will be correctly updated.
Then, for what MIGRATION flag is ?
Without it, at migration failure, we may have to charge old page again
because it may be fully unmapped. "charge" means that we have to dive into
memory reclaim or something complated. So, it's better to avoid
charge it again. Before this patch, __commit_charge() was working for
both of the old/new page and fixed up all. But this technique has some
racy condtion around FILE_MAPPED and SWAPOUT etc...
Now, the kernel use MIGRATION flag and don't uncharge old page until
the end of migration.
I hope this change will make memcg's page migration much simpler. This
page migration has caused several troubles. Worth to add a flag for
simplification.
Reviewed-by: Daisuke Nishimura <nishimura@mxp.nes.nec.co.jp>
Tested-by: Daisuke Nishimura <nishimura@mxp.nes.nec.co.jp>
Reported-by: Daisuke Nishimura <nishimura@mxp.nes.nec.co.jp>
Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Balbir Singh <balbir@in.ibm.com>
Cc: Christoph Lameter <cl@linux-foundation.org>
Cc: "Kirill A. Shutemov" <kirill@shutemov.name>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2010-05-27 05:42:46 +08:00
|
|
|
mem_cgroup_prepare_migration(struct page *page, struct page *newpage,
|
2012-08-01 07:45:25 +08:00
|
|
|
struct mem_cgroup **memcgp)
|
2008-02-07 16:14:10 +08:00
|
|
|
{
|
|
|
|
}
|
|
|
|
|
2011-11-03 04:38:15 +08:00
|
|
|
static inline void mem_cgroup_end_migration(struct mem_cgroup *memcg,
|
2011-01-14 07:47:43 +08:00
|
|
|
struct page *oldpage, struct page *newpage, bool migration_ok)
|
2008-02-07 16:14:10 +08:00
|
|
|
{
|
|
|
|
}
|
|
|
|
|
2012-01-13 09:17:59 +08:00
|
|
|
static inline struct mem_cgroup *
|
|
|
|
mem_cgroup_iter(struct mem_cgroup *root,
|
|
|
|
struct mem_cgroup *prev,
|
|
|
|
struct mem_cgroup_reclaim_cookie *reclaim)
|
|
|
|
{
|
|
|
|
return NULL;
|
|
|
|
}
|
|
|
|
|
|
|
|
static inline void mem_cgroup_iter_break(struct mem_cgroup *root,
|
|
|
|
struct mem_cgroup *prev)
|
|
|
|
{
|
|
|
|
}
|
|
|
|
|
2009-01-08 10:08:02 +08:00
|
|
|
static inline bool mem_cgroup_disabled(void)
|
|
|
|
{
|
|
|
|
return true;
|
|
|
|
}
|
2009-01-08 10:08:08 +08:00
|
|
|
|
2009-01-08 10:08:18 +08:00
|
|
|
static inline int
|
2012-05-30 06:07:00 +08:00
|
|
|
mem_cgroup_inactive_anon_is_low(struct lruvec *lruvec)
|
2009-01-08 10:08:18 +08:00
|
|
|
{
|
|
|
|
return 1;
|
|
|
|
}
|
|
|
|
|
2009-01-08 10:08:19 +08:00
|
|
|
static inline unsigned long
|
2012-05-30 06:07:08 +08:00
|
|
|
mem_cgroup_get_lru_size(struct lruvec *lruvec, enum lru_list lru)
|
2009-01-08 10:08:19 +08:00
|
|
|
{
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2012-05-30 06:07:09 +08:00
|
|
|
static inline void
|
|
|
|
mem_cgroup_update_lru_size(struct lruvec *lruvec, enum lru_list lru,
|
|
|
|
int increment)
|
2009-01-08 10:08:20 +08:00
|
|
|
{
|
|
|
|
}
|
|
|
|
|
2009-04-03 07:57:39 +08:00
|
|
|
static inline void
|
|
|
|
mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p)
|
|
|
|
{
|
|
|
|
}
|
|
|
|
|
memcg: use new logic for page stat accounting
Now, page-stat-per-memcg is recorded into per page_cgroup flag by
duplicating page's status into the flag. The reason is that memcg has a
feature to move a page from a group to another group and we have race
between "move" and "page stat accounting",
Under current logic, assume CPU-A and CPU-B. CPU-A does "move" and CPU-B
does "page stat accounting".
When CPU-A goes 1st,
CPU-A CPU-B
update "struct page" info.
move_lock_mem_cgroup(memcg)
see pc->flags
copy page stat to new group
overwrite pc->mem_cgroup.
move_unlock_mem_cgroup(memcg)
move_lock_mem_cgroup(mem)
set pc->flags
update page stat accounting
move_unlock_mem_cgroup(mem)
stat accounting is guarded by move_lock_mem_cgroup() and "move" logic
(CPU-A) doesn't see changes in "struct page" information.
But it's costly to have the same information both in 'struct page' and
'struct page_cgroup'. And, there is a potential problem.
For example, assume we have PG_dirty accounting in memcg.
PG_..is a flag for struct page.
PCG_ is a flag for struct page_cgroup.
(This is just an example. The same problem can be found in any
kind of page stat accounting.)
CPU-A CPU-B
TestSet PG_dirty
(delay) TestClear PG_dirty
if (TestClear(PCG_dirty))
memcg->nr_dirty--
if (TestSet(PCG_dirty))
memcg->nr_dirty++
Here, memcg->nr_dirty = +1, this is wrong. This race was reported by Greg
Thelen <gthelen@google.com>. Now, only FILE_MAPPED is supported but
fortunately, it's serialized by page table lock and this is not real bug,
_now_,
If this potential problem is caused by having duplicated information in
struct page and struct page_cgroup, we may be able to fix this by using
original 'struct page' information. But we'll have a problem in "move
account"
Assume we use only PG_dirty.
CPU-A CPU-B
TestSet PG_dirty
(delay) move_lock_mem_cgroup()
if (PageDirty(page))
new_memcg->nr_dirty++
pc->mem_cgroup = new_memcg;
move_unlock_mem_cgroup()
move_lock_mem_cgroup()
memcg = pc->mem_cgroup
new_memcg->nr_dirty++
accounting information may be double-counted. This was original reason to
have PCG_xxx flags but it seems PCG_xxx has another problem.
I think we need a bigger lock as
move_lock_mem_cgroup(page)
TestSetPageDirty(page)
update page stats (without any checks)
move_unlock_mem_cgroup(page)
This fixes both of problems and we don't have to duplicate page flag into
page_cgroup. Please note: move_lock_mem_cgroup() is held only when there
are possibility of "account move" under the system. So, in most path,
status update will go without atomic locks.
This patch introduces mem_cgroup_begin_update_page_stat() and
mem_cgroup_end_update_page_stat() both should be called at modifying
'struct page' information if memcg takes care of it. as
mem_cgroup_begin_update_page_stat()
modify page information
mem_cgroup_update_page_stat()
=> never check any 'struct page' info, just update counters.
mem_cgroup_end_update_page_stat().
This patch is slow because we need to call begin_update_page_stat()/
end_update_page_stat() regardless of accounted will be changed or not. A
following patch adds an easy optimization and reduces the cost.
[akpm@linux-foundation.org: s/lock/locked/]
[hughd@google.com: fix deadlock by avoiding stat lock when anon]
Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Greg Thelen <gthelen@google.com>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Cc: Michal Hocko <mhocko@suse.cz>
Cc: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Cc: Ying Han <yinghan@google.com>
Signed-off-by: Hugh Dickins <hughd@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2012-03-22 07:34:25 +08:00
|
|
|
static inline void mem_cgroup_begin_update_page_stat(struct page *page,
|
|
|
|
bool *locked, unsigned long *flags)
|
|
|
|
{
|
|
|
|
}
|
|
|
|
|
|
|
|
static inline void mem_cgroup_end_update_page_stat(struct page *page,
|
|
|
|
bool *locked, unsigned long *flags)
|
|
|
|
{
|
|
|
|
}
|
|
|
|
|
2013-10-17 04:46:59 +08:00
|
|
|
static inline void mem_cgroup_oom_enable(void)
|
2013-09-13 06:13:42 +08:00
|
|
|
{
|
|
|
|
}
|
|
|
|
|
2013-10-17 04:46:59 +08:00
|
|
|
static inline void mem_cgroup_oom_disable(void)
|
2013-09-13 06:13:42 +08:00
|
|
|
{
|
|
|
|
}
|
|
|
|
|
mm: memcg: do not trap chargers with full callstack on OOM
The memcg OOM handling is incredibly fragile and can deadlock. When a
task fails to charge memory, it invokes the OOM killer and loops right
there in the charge code until it succeeds. Comparably, any other task
that enters the charge path at this point will go to a waitqueue right
then and there and sleep until the OOM situation is resolved. The problem
is that these tasks may hold filesystem locks and the mmap_sem; locks that
the selected OOM victim may need to exit.
For example, in one reported case, the task invoking the OOM killer was
about to charge a page cache page during a write(), which holds the
i_mutex. The OOM killer selected a task that was just entering truncate()
and trying to acquire the i_mutex:
OOM invoking task:
mem_cgroup_handle_oom+0x241/0x3b0
mem_cgroup_cache_charge+0xbe/0xe0
add_to_page_cache_locked+0x4c/0x140
add_to_page_cache_lru+0x22/0x50
grab_cache_page_write_begin+0x8b/0xe0
ext3_write_begin+0x88/0x270
generic_file_buffered_write+0x116/0x290
__generic_file_aio_write+0x27c/0x480
generic_file_aio_write+0x76/0xf0 # takes ->i_mutex
do_sync_write+0xea/0x130
vfs_write+0xf3/0x1f0
sys_write+0x51/0x90
system_call_fastpath+0x18/0x1d
OOM kill victim:
do_truncate+0x58/0xa0 # takes i_mutex
do_last+0x250/0xa30
path_openat+0xd7/0x440
do_filp_open+0x49/0xa0
do_sys_open+0x106/0x240
sys_open+0x20/0x30
system_call_fastpath+0x18/0x1d
The OOM handling task will retry the charge indefinitely while the OOM
killed task is not releasing any resources.
A similar scenario can happen when the kernel OOM killer for a memcg is
disabled and a userspace task is in charge of resolving OOM situations.
In this case, ALL tasks that enter the OOM path will be made to sleep on
the OOM waitqueue and wait for userspace to free resources or increase
the group's limit. But a userspace OOM handler is prone to deadlock
itself on the locks held by the waiting tasks. For example one of the
sleeping tasks may be stuck in a brk() call with the mmap_sem held for
writing but the userspace handler, in order to pick an optimal victim,
may need to read files from /proc/<pid>, which tries to acquire the same
mmap_sem for reading and deadlocks.
This patch changes the way tasks behave after detecting a memcg OOM and
makes sure nobody loops or sleeps with locks held:
1. When OOMing in a user fault, invoke the OOM killer and restart the
fault instead of looping on the charge attempt. This way, the OOM
victim can not get stuck on locks the looping task may hold.
2. When OOMing in a user fault but somebody else is handling it
(either the kernel OOM killer or a userspace handler), don't go to
sleep in the charge context. Instead, remember the OOMing memcg in
the task struct and then fully unwind the page fault stack with
-ENOMEM. pagefault_out_of_memory() will then call back into the
memcg code to check if the -ENOMEM came from the memcg, and then
either put the task to sleep on the memcg's OOM waitqueue or just
restart the fault. The OOM victim can no longer get stuck on any
lock a sleeping task may hold.
Debugged by Michal Hocko.
Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Reported-by: azurIt <azurit@pobox.sk>
Acked-by: Michal Hocko <mhocko@suse.cz>
Cc: David Rientjes <rientjes@google.com>
Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2013-09-13 06:13:44 +08:00
|
|
|
static inline bool task_in_memcg_oom(struct task_struct *p)
|
|
|
|
{
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
|
2013-10-17 04:46:59 +08:00
|
|
|
static inline bool mem_cgroup_oom_synchronize(bool wait)
|
mm: memcg: do not trap chargers with full callstack on OOM
The memcg OOM handling is incredibly fragile and can deadlock. When a
task fails to charge memory, it invokes the OOM killer and loops right
there in the charge code until it succeeds. Comparably, any other task
that enters the charge path at this point will go to a waitqueue right
then and there and sleep until the OOM situation is resolved. The problem
is that these tasks may hold filesystem locks and the mmap_sem; locks that
the selected OOM victim may need to exit.
For example, in one reported case, the task invoking the OOM killer was
about to charge a page cache page during a write(), which holds the
i_mutex. The OOM killer selected a task that was just entering truncate()
and trying to acquire the i_mutex:
OOM invoking task:
mem_cgroup_handle_oom+0x241/0x3b0
mem_cgroup_cache_charge+0xbe/0xe0
add_to_page_cache_locked+0x4c/0x140
add_to_page_cache_lru+0x22/0x50
grab_cache_page_write_begin+0x8b/0xe0
ext3_write_begin+0x88/0x270
generic_file_buffered_write+0x116/0x290
__generic_file_aio_write+0x27c/0x480
generic_file_aio_write+0x76/0xf0 # takes ->i_mutex
do_sync_write+0xea/0x130
vfs_write+0xf3/0x1f0
sys_write+0x51/0x90
system_call_fastpath+0x18/0x1d
OOM kill victim:
do_truncate+0x58/0xa0 # takes i_mutex
do_last+0x250/0xa30
path_openat+0xd7/0x440
do_filp_open+0x49/0xa0
do_sys_open+0x106/0x240
sys_open+0x20/0x30
system_call_fastpath+0x18/0x1d
The OOM handling task will retry the charge indefinitely while the OOM
killed task is not releasing any resources.
A similar scenario can happen when the kernel OOM killer for a memcg is
disabled and a userspace task is in charge of resolving OOM situations.
In this case, ALL tasks that enter the OOM path will be made to sleep on
the OOM waitqueue and wait for userspace to free resources or increase
the group's limit. But a userspace OOM handler is prone to deadlock
itself on the locks held by the waiting tasks. For example one of the
sleeping tasks may be stuck in a brk() call with the mmap_sem held for
writing but the userspace handler, in order to pick an optimal victim,
may need to read files from /proc/<pid>, which tries to acquire the same
mmap_sem for reading and deadlocks.
This patch changes the way tasks behave after detecting a memcg OOM and
makes sure nobody loops or sleeps with locks held:
1. When OOMing in a user fault, invoke the OOM killer and restart the
fault instead of looping on the charge attempt. This way, the OOM
victim can not get stuck on locks the looping task may hold.
2. When OOMing in a user fault but somebody else is handling it
(either the kernel OOM killer or a userspace handler), don't go to
sleep in the charge context. Instead, remember the OOMing memcg in
the task struct and then fully unwind the page fault stack with
-ENOMEM. pagefault_out_of_memory() will then call back into the
memcg code to check if the -ENOMEM came from the memcg, and then
either put the task to sleep on the memcg's OOM waitqueue or just
restart the fault. The OOM victim can no longer get stuck on any
lock a sleeping task may hold.
Debugged by Michal Hocko.
Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Reported-by: azurIt <azurit@pobox.sk>
Acked-by: Michal Hocko <mhocko@suse.cz>
Cc: David Rientjes <rientjes@google.com>
Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2013-09-13 06:13:44 +08:00
|
|
|
{
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
|
2011-01-14 07:47:37 +08:00
|
|
|
static inline void mem_cgroup_inc_page_stat(struct page *page,
|
2013-09-13 06:13:50 +08:00
|
|
|
enum mem_cgroup_stat_index idx)
|
2011-01-14 07:47:37 +08:00
|
|
|
{
|
|
|
|
}
|
|
|
|
|
|
|
|
static inline void mem_cgroup_dec_page_stat(struct page *page,
|
2013-09-13 06:13:50 +08:00
|
|
|
enum mem_cgroup_stat_index idx)
|
2009-06-18 07:26:34 +08:00
|
|
|
{
|
|
|
|
}
|
|
|
|
|
2009-09-24 06:56:39 +08:00
|
|
|
static inline
|
2013-09-25 06:27:41 +08:00
|
|
|
unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order,
|
|
|
|
gfp_t gfp_mask,
|
|
|
|
unsigned long *total_scanned)
|
2009-09-24 06:56:39 +08:00
|
|
|
{
|
2013-09-25 06:27:41 +08:00
|
|
|
return 0;
|
2009-09-24 06:56:39 +08:00
|
|
|
}
|
|
|
|
|
2012-01-13 09:18:20 +08:00
|
|
|
static inline void mem_cgroup_split_huge_fixup(struct page *head)
|
2011-01-21 06:44:24 +08:00
|
|
|
{
|
|
|
|
}
|
|
|
|
|
2011-05-27 07:25:38 +08:00
|
|
|
static inline
|
|
|
|
void mem_cgroup_count_vm_event(struct mm_struct *mm, enum vm_event_item idx)
|
|
|
|
{
|
|
|
|
}
|
memcg: add mem_cgroup_replace_page_cache() to fix LRU issue
Commit ef6a3c6311 ("mm: add replace_page_cache_page() function") added a
function replace_page_cache_page(). This function replaces a page in the
radix-tree with a new page. WHen doing this, memory cgroup needs to fix
up the accounting information. memcg need to check PCG_USED bit etc.
In some(many?) cases, 'newpage' is on LRU before calling
replace_page_cache(). So, memcg's LRU accounting information should be
fixed, too.
This patch adds mem_cgroup_replace_page_cache() and removes the old hooks.
In that function, old pages will be unaccounted without touching
res_counter and new page will be accounted to the memcg (of old page).
WHen overwriting pc->mem_cgroup of newpage, take zone->lru_lock and avoid
races with LRU handling.
Background:
replace_page_cache_page() is called by FUSE code in its splice() handling.
Here, 'newpage' is replacing oldpage but this newpage is not a newly allocated
page and may be on LRU. LRU mis-accounting will be critical for memory cgroup
because rmdir() checks the whole LRU is empty and there is no account leak.
If a page is on the other LRU than it should be, rmdir() will fail.
This bug was added in March 2011, but no bug report yet. I guess there
are not many people who use memcg and FUSE at the same time with upstream
kernels.
The result of this bug is that admin cannot destroy a memcg because of
account leak. So, no panic, no deadlock. And, even if an active cgroup
exist, umount can succseed. So no problem at shutdown.
Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Acked-by: Michal Hocko <mhocko@suse.cz>
Cc: Miklos Szeredi <mszeredi@suse.cz>
Cc: Hugh Dickins <hughd@google.com>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2012-01-13 09:17:44 +08:00
|
|
|
static inline void mem_cgroup_replace_page_cache(struct page *oldpage,
|
|
|
|
struct page *newpage)
|
|
|
|
{
|
|
|
|
}
|
2012-08-01 07:43:02 +08:00
|
|
|
#endif /* CONFIG_MEMCG */
|
2008-02-07 16:13:51 +08:00
|
|
|
|
2012-08-01 07:43:02 +08:00
|
|
|
#if !defined(CONFIG_MEMCG) || !defined(CONFIG_DEBUG_VM)
|
2011-03-24 07:42:25 +08:00
|
|
|
static inline bool
|
|
|
|
mem_cgroup_bad_page_check(struct page *page)
|
|
|
|
{
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
|
|
|
|
static inline void
|
|
|
|
mem_cgroup_print_bad_page(struct page *page)
|
|
|
|
{
|
|
|
|
}
|
|
|
|
#endif
|
|
|
|
|
2011-12-12 05:47:03 +08:00
|
|
|
enum {
|
|
|
|
UNDER_LIMIT,
|
|
|
|
SOFT_LIMIT,
|
|
|
|
OVER_LIMIT,
|
|
|
|
};
|
|
|
|
|
|
|
|
struct sock;
|
2012-10-11 06:54:08 +08:00
|
|
|
#if defined(CONFIG_INET) && defined(CONFIG_MEMCG_KMEM)
|
2011-12-12 05:47:03 +08:00
|
|
|
void sock_update_memcg(struct sock *sk);
|
|
|
|
void sock_release_memcg(struct sock *sk);
|
|
|
|
#else
|
|
|
|
static inline void sock_update_memcg(struct sock *sk)
|
|
|
|
{
|
|
|
|
}
|
|
|
|
static inline void sock_release_memcg(struct sock *sk)
|
|
|
|
{
|
|
|
|
}
|
2012-10-11 06:54:08 +08:00
|
|
|
#endif /* CONFIG_INET && CONFIG_MEMCG_KMEM */
|
2012-12-19 06:21:56 +08:00
|
|
|
|
|
|
|
#ifdef CONFIG_MEMCG_KMEM
|
2012-12-19 06:22:09 +08:00
|
|
|
extern struct static_key memcg_kmem_enabled_key;
|
2012-12-19 06:23:01 +08:00
|
|
|
|
|
|
|
extern int memcg_limited_groups_array_size;
|
2012-12-19 06:23:10 +08:00
|
|
|
|
|
|
|
/*
|
|
|
|
* Helper macro to loop through all memcg-specific caches. Callers must still
|
|
|
|
* check if the cache is valid (it is either valid or NULL).
|
|
|
|
* the slab_mutex must be held when looping through those caches
|
|
|
|
*/
|
2012-12-19 06:23:01 +08:00
|
|
|
#define for_each_memcg_cache_index(_idx) \
|
2013-02-05 06:28:49 +08:00
|
|
|
for ((_idx) = 0; (_idx) < memcg_limited_groups_array_size; (_idx)++)
|
2012-12-19 06:23:01 +08:00
|
|
|
|
2012-12-19 06:21:56 +08:00
|
|
|
static inline bool memcg_kmem_enabled(void)
|
|
|
|
{
|
2012-12-19 06:22:09 +08:00
|
|
|
return static_key_false(&memcg_kmem_enabled_key);
|
2012-12-19 06:21:56 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* In general, we'll do everything in our power to not incur in any overhead
|
|
|
|
* for non-memcg users for the kmem functions. Not even a function call, if we
|
|
|
|
* can avoid it.
|
|
|
|
*
|
|
|
|
* Therefore, we'll inline all those functions so that in the best case, we'll
|
|
|
|
* see that kmemcg is off for everybody and proceed quickly. If it is on,
|
|
|
|
* we'll still do most of the flag checking inline. We check a lot of
|
|
|
|
* conditions, but because they are pretty simple, they are expected to be
|
|
|
|
* fast.
|
|
|
|
*/
|
|
|
|
bool __memcg_kmem_newpage_charge(gfp_t gfp, struct mem_cgroup **memcg,
|
|
|
|
int order);
|
|
|
|
void __memcg_kmem_commit_charge(struct page *page,
|
|
|
|
struct mem_cgroup *memcg, int order);
|
|
|
|
void __memcg_kmem_uncharge_pages(struct page *page, int order);
|
|
|
|
|
2012-12-19 06:22:34 +08:00
|
|
|
int memcg_cache_id(struct mem_cgroup *memcg);
|
2014-01-24 07:52:56 +08:00
|
|
|
int memcg_alloc_cache_params(struct mem_cgroup *memcg, struct kmem_cache *s,
|
|
|
|
struct kmem_cache *root_cache);
|
|
|
|
void memcg_free_cache_params(struct kmem_cache *s);
|
2012-12-19 06:22:34 +08:00
|
|
|
void memcg_release_cache(struct kmem_cache *cachep);
|
|
|
|
void memcg_cache_list_add(struct mem_cgroup *memcg, struct kmem_cache *cachep);
|
|
|
|
|
memcg: allocate memory for memcg caches whenever a new memcg appears
Every cache that is considered a root cache (basically the "original"
caches, tied to the root memcg/no-memcg) will have an array that should be
large enough to store a cache pointer per each memcg in the system.
Theoreticaly, this is as high as 1 << sizeof(css_id), which is currently
in the 64k pointers range. Most of the time, we won't be using that much.
What goes in this patch, is a simple scheme to dynamically allocate such
an array, in order to minimize memory usage for memcg caches. Because we
would also like to avoid allocations all the time, at least for now, the
array will only grow. It will tend to be big enough to hold the maximum
number of kmem-limited memcgs ever achieved.
We'll allocate it to be a minimum of 64 kmem-limited memcgs. When we have
more than that, we'll start doubling the size of this array every time the
limit is reached.
Because we are only considering kmem limited memcgs, a natural point for
this to happen is when we write to the limit. At that point, we already
have set_limit_mutex held, so that will become our natural synchronization
mechanism.
Signed-off-by: Glauber Costa <glommer@parallels.com>
Cc: Christoph Lameter <cl@linux.com>
Cc: David Rientjes <rientjes@google.com>
Cc: Frederic Weisbecker <fweisbec@redhat.com>
Cc: Greg Thelen <gthelen@google.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: JoonSoo Kim <js1304@gmail.com>
Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Mel Gorman <mel@csn.ul.ie>
Cc: Michal Hocko <mhocko@suse.cz>
Cc: Pekka Enberg <penberg@cs.helsinki.fi>
Cc: Rik van Riel <riel@redhat.com>
Cc: Suleiman Souhlal <suleiman@google.com>
Cc: Tejun Heo <tj@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2012-12-19 06:22:38 +08:00
|
|
|
int memcg_update_cache_size(struct kmem_cache *s, int num_groups);
|
|
|
|
void memcg_update_array_size(int num_groups);
|
2012-12-19 06:22:40 +08:00
|
|
|
|
|
|
|
struct kmem_cache *
|
|
|
|
__memcg_kmem_get_cache(struct kmem_cache *cachep, gfp_t gfp);
|
|
|
|
|
2012-12-19 06:22:50 +08:00
|
|
|
void mem_cgroup_destroy_cache(struct kmem_cache *cachep);
|
2012-12-19 06:22:55 +08:00
|
|
|
void kmem_cache_destroy_memcg_children(struct kmem_cache *s);
|
2012-12-19 06:22:50 +08:00
|
|
|
|
2012-12-19 06:21:56 +08:00
|
|
|
/**
|
|
|
|
* memcg_kmem_newpage_charge: verify if a new kmem allocation is allowed.
|
|
|
|
* @gfp: the gfp allocation flags.
|
|
|
|
* @memcg: a pointer to the memcg this was charged against.
|
|
|
|
* @order: allocation order.
|
|
|
|
*
|
|
|
|
* returns true if the memcg where the current task belongs can hold this
|
|
|
|
* allocation.
|
|
|
|
*
|
|
|
|
* We return true automatically if this allocation is not to be accounted to
|
|
|
|
* any memcg.
|
|
|
|
*/
|
|
|
|
static inline bool
|
|
|
|
memcg_kmem_newpage_charge(gfp_t gfp, struct mem_cgroup **memcg, int order)
|
|
|
|
{
|
|
|
|
if (!memcg_kmem_enabled())
|
|
|
|
return true;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* __GFP_NOFAIL allocations will move on even if charging is not
|
|
|
|
* possible. Therefore we don't even try, and have this allocation
|
|
|
|
* unaccounted. We could in theory charge it with
|
|
|
|
* res_counter_charge_nofail, but we hope those allocations are rare,
|
|
|
|
* and won't be worth the trouble.
|
|
|
|
*/
|
|
|
|
if (!(gfp & __GFP_KMEMCG) || (gfp & __GFP_NOFAIL))
|
|
|
|
return true;
|
|
|
|
if (in_interrupt() || (!current->mm) || (current->flags & PF_KTHREAD))
|
|
|
|
return true;
|
|
|
|
|
|
|
|
/* If the test is dying, just let it go. */
|
|
|
|
if (unlikely(fatal_signal_pending(current)))
|
|
|
|
return true;
|
|
|
|
|
|
|
|
return __memcg_kmem_newpage_charge(gfp, memcg, order);
|
|
|
|
}
|
|
|
|
|
|
|
|
/**
|
|
|
|
* memcg_kmem_uncharge_pages: uncharge pages from memcg
|
|
|
|
* @page: pointer to struct page being freed
|
|
|
|
* @order: allocation order.
|
|
|
|
*
|
|
|
|
* there is no need to specify memcg here, since it is embedded in page_cgroup
|
|
|
|
*/
|
|
|
|
static inline void
|
|
|
|
memcg_kmem_uncharge_pages(struct page *page, int order)
|
|
|
|
{
|
|
|
|
if (memcg_kmem_enabled())
|
|
|
|
__memcg_kmem_uncharge_pages(page, order);
|
|
|
|
}
|
|
|
|
|
|
|
|
/**
|
|
|
|
* memcg_kmem_commit_charge: embeds correct memcg in a page
|
|
|
|
* @page: pointer to struct page recently allocated
|
|
|
|
* @memcg: the memcg structure we charged against
|
|
|
|
* @order: allocation order.
|
|
|
|
*
|
|
|
|
* Needs to be called after memcg_kmem_newpage_charge, regardless of success or
|
|
|
|
* failure of the allocation. if @page is NULL, this function will revert the
|
|
|
|
* charges. Otherwise, it will commit the memcg given by @memcg to the
|
|
|
|
* corresponding page_cgroup.
|
|
|
|
*/
|
|
|
|
static inline void
|
|
|
|
memcg_kmem_commit_charge(struct page *page, struct mem_cgroup *memcg, int order)
|
|
|
|
{
|
|
|
|
if (memcg_kmem_enabled() && memcg)
|
|
|
|
__memcg_kmem_commit_charge(page, memcg, order);
|
|
|
|
}
|
|
|
|
|
2012-12-19 06:22:40 +08:00
|
|
|
/**
|
|
|
|
* memcg_kmem_get_cache: selects the correct per-memcg cache for allocation
|
|
|
|
* @cachep: the original global kmem cache
|
|
|
|
* @gfp: allocation flags.
|
|
|
|
*
|
|
|
|
* This function assumes that the task allocating, which determines the memcg
|
|
|
|
* in the page allocator, belongs to the same cgroup throughout the whole
|
|
|
|
* process. Misacounting can happen if the task calls memcg_kmem_get_cache()
|
|
|
|
* while belonging to a cgroup, and later on changes. This is considered
|
|
|
|
* acceptable, and should only happen upon task migration.
|
|
|
|
*
|
|
|
|
* Before the cache is created by the memcg core, there is also a possible
|
|
|
|
* imbalance: the task belongs to a memcg, but the cache being allocated from
|
|
|
|
* is the global cache, since the child cache is not yet guaranteed to be
|
|
|
|
* ready. This case is also fine, since in this case the GFP_KMEMCG will not be
|
|
|
|
* passed and the page allocator will not attempt any cgroup accounting.
|
|
|
|
*/
|
|
|
|
static __always_inline struct kmem_cache *
|
|
|
|
memcg_kmem_get_cache(struct kmem_cache *cachep, gfp_t gfp)
|
|
|
|
{
|
|
|
|
if (!memcg_kmem_enabled())
|
|
|
|
return cachep;
|
|
|
|
if (gfp & __GFP_NOFAIL)
|
|
|
|
return cachep;
|
|
|
|
if (in_interrupt() || (!current->mm) || (current->flags & PF_KTHREAD))
|
|
|
|
return cachep;
|
|
|
|
if (unlikely(fatal_signal_pending(current)))
|
|
|
|
return cachep;
|
|
|
|
|
|
|
|
return __memcg_kmem_get_cache(cachep, gfp);
|
|
|
|
}
|
2012-12-19 06:21:56 +08:00
|
|
|
#else
|
2012-12-19 06:23:01 +08:00
|
|
|
#define for_each_memcg_cache_index(_idx) \
|
|
|
|
for (; NULL; )
|
|
|
|
|
2012-12-19 06:22:46 +08:00
|
|
|
static inline bool memcg_kmem_enabled(void)
|
|
|
|
{
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
|
2012-12-19 06:21:56 +08:00
|
|
|
static inline bool
|
|
|
|
memcg_kmem_newpage_charge(gfp_t gfp, struct mem_cgroup **memcg, int order)
|
|
|
|
{
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
|
|
|
|
static inline void memcg_kmem_uncharge_pages(struct page *page, int order)
|
|
|
|
{
|
|
|
|
}
|
|
|
|
|
|
|
|
static inline void
|
|
|
|
memcg_kmem_commit_charge(struct page *page, struct mem_cgroup *memcg, int order)
|
|
|
|
{
|
|
|
|
}
|
2012-12-19 06:22:34 +08:00
|
|
|
|
|
|
|
static inline int memcg_cache_id(struct mem_cgroup *memcg)
|
|
|
|
{
|
|
|
|
return -1;
|
|
|
|
}
|
|
|
|
|
2014-01-24 07:52:56 +08:00
|
|
|
static inline int memcg_alloc_cache_params(struct mem_cgroup *memcg,
|
|
|
|
struct kmem_cache *s, struct kmem_cache *root_cache)
|
2012-12-19 06:22:34 +08:00
|
|
|
{
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2014-01-24 07:52:56 +08:00
|
|
|
static inline void memcg_free_cache_params(struct kmem_cache *s)
|
|
|
|
{
|
|
|
|
}
|
|
|
|
|
2012-12-19 06:22:34 +08:00
|
|
|
static inline void memcg_release_cache(struct kmem_cache *cachep)
|
|
|
|
{
|
|
|
|
}
|
|
|
|
|
|
|
|
static inline void memcg_cache_list_add(struct mem_cgroup *memcg,
|
|
|
|
struct kmem_cache *s)
|
|
|
|
{
|
|
|
|
}
|
2012-12-19 06:22:40 +08:00
|
|
|
|
|
|
|
static inline struct kmem_cache *
|
|
|
|
memcg_kmem_get_cache(struct kmem_cache *cachep, gfp_t gfp)
|
|
|
|
{
|
|
|
|
return cachep;
|
|
|
|
}
|
2012-12-19 06:22:55 +08:00
|
|
|
|
|
|
|
static inline void kmem_cache_destroy_memcg_children(struct kmem_cache *s)
|
|
|
|
{
|
|
|
|
}
|
2012-12-19 06:21:56 +08:00
|
|
|
#endif /* CONFIG_MEMCG_KMEM */
|
2008-02-07 16:13:50 +08:00
|
|
|
#endif /* _LINUX_MEMCONTROL_H */
|
|
|
|
|