From b47f77b5a2243768d5cb1f143ce5e6c28baa271a Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Mon, 23 Sep 2013 16:55:43 +0800 Subject: [PATCH 1/6] memcg: convert to use cgroup_is_descendant() This is a preparation to kill css_id. Signed-off-by: Li Zefan Acked-by: Michal Hocko Signed-off-by: Tejun Heo --- mm/memcontrol.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index d5ff3ce13029..aa8363ef381c 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -1318,7 +1318,7 @@ bool __mem_cgroup_same_or_subtree(const struct mem_cgroup *root_memcg, return true; if (!root_memcg->use_hierarchy || !memcg) return false; - return css_is_ancestor(&memcg->css, &root_memcg->css); + return cgroup_is_descendant(memcg->css.cgroup, root_memcg->css.cgroup); } static bool mem_cgroup_same_or_subtree(const struct mem_cgroup *root_memcg, From 34c00c319ce7eddf79be0cd307acd5630addcbb8 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Mon, 23 Sep 2013 16:56:01 +0800 Subject: [PATCH 2/6] memcg: convert to use cgroup id Use cgroup id instead of css id. This is a preparation to kill css id. Note, as memcg treat 0 as an invalid id, while cgroup id starts with 0, we define memcg_id == cgroup_id + 1. Signed-off-by: Li Zefan Acked-by: Michal Hocko Signed-off-by: Tejun Heo --- mm/memcontrol.c | 32 ++++++++++++++++++++++---------- 1 file changed, 22 insertions(+), 10 deletions(-) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index aa8363ef381c..404a13a09efa 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -488,6 +488,23 @@ static inline bool mem_cgroup_is_root(struct mem_cgroup *memcg) return (memcg == root_mem_cgroup); } +static inline unsigned short mem_cgroup_id(struct mem_cgroup *memcg) +{ + /* + * The ID of the root cgroup is 0, but memcg treat 0 as an + * invalid ID, so we return (cgroup_id + 1). + */ + return memcg->css.cgroup->id + 1; +} + +static inline struct mem_cgroup *mem_cgroup_from_id(unsigned short id) +{ + struct cgroup_subsys_state *css; + + css = css_from_id(id - 1, &mem_cgroup_subsys); + return mem_cgroup_from_css(css); +} + /* Writing them here to avoid exposing memcg's inner layout */ #if defined(CONFIG_INET) && defined(CONFIG_MEMCG_KMEM) @@ -2709,15 +2726,10 @@ static void __mem_cgroup_cancel_local_charge(struct mem_cgroup *memcg, */ static struct mem_cgroup *mem_cgroup_lookup(unsigned short id) { - struct cgroup_subsys_state *css; - /* ID 0 is unused ID */ if (!id) return NULL; - css = css_lookup(&mem_cgroup_subsys, id); - if (!css) - return NULL; - return mem_cgroup_from_css(css); + return mem_cgroup_from_id(id); } struct mem_cgroup *try_get_mem_cgroup_from_page(struct page *page) @@ -4232,7 +4244,7 @@ mem_cgroup_uncharge_swapcache(struct page *page, swp_entry_t ent, bool swapout) * css_get() was called in uncharge(). */ if (do_swap_account && swapout && memcg) - swap_cgroup_record(ent, css_id(&memcg->css)); + swap_cgroup_record(ent, mem_cgroup_id(memcg)); } #endif @@ -4284,8 +4296,8 @@ static int mem_cgroup_move_swap_account(swp_entry_t entry, { unsigned short old_id, new_id; - old_id = css_id(&from->css); - new_id = css_id(&to->css); + old_id = mem_cgroup_id(from); + new_id = mem_cgroup_id(to); if (swap_cgroup_cmpxchg(entry, old_id, new_id) == old_id) { mem_cgroup_swap_statistics(from, false); @@ -6325,7 +6337,7 @@ static enum mc_target_type get_mctgt_type(struct vm_area_struct *vma, } /* There is a swap entry and a page doesn't exist or isn't charged */ if (ent.val && !ret && - css_id(&mc.from->css) == lookup_swap_cgroup_id(ent)) { + mem_cgroup_id(mc.from) == lookup_swap_cgroup_id(ent)) { ret = MC_TARGET_SWAP; if (target) target->ent = ent; From 4219b2da206b3787dea4950ec53ccbc203f50e5e Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Mon, 23 Sep 2013 16:56:29 +0800 Subject: [PATCH 3/6] memcg: fail to create cgroup if the cgroup id is too big memcg requires the cgroup id to be smaller than 65536. This is a preparation to kill css id. Signed-off-by: Li Zefan Acked-by: Michal Hocko Signed-off-by: Tejun Heo --- mm/memcontrol.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 404a13a09efa..c14039575608 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -488,6 +488,12 @@ static inline bool mem_cgroup_is_root(struct mem_cgroup *memcg) return (memcg == root_mem_cgroup); } +/* + * We restrict the id in the range of [1, 65535], so it can fit into + * an unsigned short. + */ +#define MEM_CGROUP_ID_MAX USHRT_MAX + static inline unsigned short mem_cgroup_id(struct mem_cgroup *memcg) { /* @@ -6059,6 +6065,9 @@ mem_cgroup_css_online(struct cgroup_subsys_state *css) struct mem_cgroup *parent = mem_cgroup_from_css(css_parent(css)); int error = 0; + if (css->cgroup->id > MEM_CGROUP_ID_MAX) + return -ENOSPC; + if (!parent) return 0; From b862783594847a1087cd18c8aeac5727608c3c67 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Mon, 23 Sep 2013 16:56:47 +0800 Subject: [PATCH 4/6] memcg: stop using css id Now memcg uses cgroup id instead of css id. Update some comments and set mem_cgroup_subsys->use_id to 0. Signed-off-by: Li Zefan Acked-by: Michal Hocko Signed-off-by: Tejun Heo --- mm/memcontrol.c | 24 ++++++++---------------- 1 file changed, 8 insertions(+), 16 deletions(-) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index c14039575608..65a46eff5c3d 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -582,16 +582,11 @@ static void disarm_sock_keys(struct mem_cgroup *memcg) #ifdef CONFIG_MEMCG_KMEM /* * This will be the memcg's index in each cache's ->memcg_params->memcg_caches. - * There are two main reasons for not using the css_id for this: - * 1) this works better in sparse environments, where we have a lot of memcgs, - * but only a few kmem-limited. Or also, if we have, for instance, 200 - * memcgs, and none but the 200th is kmem-limited, we'd have to have a - * 200 entry array for that. - * - * 2) In order not to violate the cgroup API, we would like to do all memory - * allocation in ->create(). At that point, we haven't yet allocated the - * css_id. Having a separate index prevents us from messing with the cgroup - * core for this + * The main reason for not using cgroup id for this: + * this works better in sparse environments, where we have a lot of memcgs, + * but only a few kmem-limited. Or also, if we have, for instance, 200 + * memcgs, and none but the 200th is kmem-limited, we'd have to have a + * 200 entry array for that. * * The current size of the caches array is stored in * memcg_limited_groups_array_size. It will double each time we have to @@ -606,14 +601,14 @@ int memcg_limited_groups_array_size; * cgroups is a reasonable guess. In the future, it could be a parameter or * tunable, but that is strictly not necessary. * - * MAX_SIZE should be as large as the number of css_ids. Ideally, we could get + * MAX_SIZE should be as large as the number of cgrp_ids. Ideally, we could get * this constant directly from cgroup, but it is understandable that this is * better kept as an internal representation in cgroup.c. In any case, the - * css_id space is not getting any smaller, and we don't have to necessarily + * cgrp_id space is not getting any smaller, and we don't have to necessarily * increase ours as well if it increases. */ #define MEMCG_CACHES_MIN_SIZE 4 -#define MEMCG_CACHES_MAX_SIZE 65535 +#define MEMCG_CACHES_MAX_SIZE MEM_CGROUP_ID_MAX /* * A lot of the calls to the cache allocation functions are expected to be @@ -5984,8 +5979,6 @@ static void __mem_cgroup_free(struct mem_cgroup *memcg) int node; size_t size = memcg_size(); - free_css_id(&mem_cgroup_subsys, &memcg->css); - for_each_node(node) free_mem_cgroup_per_zone_info(memcg, node); @@ -6766,7 +6759,6 @@ struct cgroup_subsys mem_cgroup_subsys = { .bind = mem_cgroup_bind, .base_cftypes = mem_cgroup_files, .early_init = 0, - .use_id = 1, }; #ifdef CONFIG_MEMCG_SWAP From 2ff2a7d03bbe472ed44a8380dbdbea490d81c59d Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Mon, 23 Sep 2013 16:57:03 +0800 Subject: [PATCH 5/6] cgroup: kill css_id The only user of css_id was memcg, and it has been convered to use cgroup->id, so kill css_id. Signed-off-by: Li Zefan Reviewed-by: Michal Hocko Acked-by: Tejun Heo Signed-off-by: Tejun Heo --- include/linux/cgroup.h | 37 ------ kernel/cgroup.c | 248 +---------------------------------------- 2 files changed, 1 insertion(+), 284 deletions(-) diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index 3561d305b1e0..39c1d9469677 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -612,11 +612,6 @@ struct cgroup_subsys { int subsys_id; int disabled; int early_init; - /* - * True if this subsys uses ID. ID is not available before cgroup_init() - * (not available in early_init time.) - */ - bool use_id; /* * If %false, this subsystem is properly hierarchical - @@ -642,9 +637,6 @@ struct cgroup_subsys { */ struct cgroupfs_root *root; struct list_head sibling; - /* used when use_id == true */ - struct idr idr; - spinlock_t id_lock; /* list of cftype_sets */ struct list_head cftsets; @@ -875,35 +867,6 @@ int css_scan_tasks(struct cgroup_subsys_state *css, int cgroup_attach_task_all(struct task_struct *from, struct task_struct *); int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from); -/* - * CSS ID is ID for cgroup_subsys_state structs under subsys. This only works - * if cgroup_subsys.use_id == true. It can be used for looking up and scanning. - * CSS ID is assigned at cgroup allocation (create) automatically - * and removed when subsys calls free_css_id() function. This is because - * the lifetime of cgroup_subsys_state is subsys's matter. - * - * Looking up and scanning function should be called under rcu_read_lock(). - * Taking cgroup_mutex is not necessary for following calls. - * But the css returned by this routine can be "not populated yet" or "being - * destroyed". The caller should check css and cgroup's status. - */ - -/* - * Typically Called at ->destroy(), or somewhere the subsys frees - * cgroup_subsys_state. - */ -void free_css_id(struct cgroup_subsys *ss, struct cgroup_subsys_state *css); - -/* Find a cgroup_subsys_state which has given ID */ - -struct cgroup_subsys_state *css_lookup(struct cgroup_subsys *ss, int id); - -/* Returns true if root is ancestor of cg */ -bool css_is_ancestor(struct cgroup_subsys_state *cg, - const struct cgroup_subsys_state *root); - -/* Get id and depth of css */ -unsigned short css_id(struct cgroup_subsys_state *css); struct cgroup_subsys_state *css_from_dir(struct dentry *dentry, struct cgroup_subsys *ss); diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 2418b6e71a85..a5629f1df13a 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -124,38 +124,6 @@ struct cfent { struct simple_xattrs xattrs; }; -/* - * CSS ID -- ID per subsys's Cgroup Subsys State(CSS). used only when - * cgroup_subsys->use_id != 0. - */ -#define CSS_ID_MAX (65535) -struct css_id { - /* - * The css to which this ID points. This pointer is set to valid value - * after cgroup is populated. If cgroup is removed, this will be NULL. - * This pointer is expected to be RCU-safe because destroy() - * is called after synchronize_rcu(). But for safe use, css_tryget() - * should be used for avoiding race. - */ - struct cgroup_subsys_state __rcu *css; - /* - * ID of this css. - */ - unsigned short id; - /* - * Depth in hierarchy which this ID belongs to. - */ - unsigned short depth; - /* - * ID is freed by RCU. (and lookup routine is RCU safe.) - */ - struct rcu_head rcu_head; - /* - * Hierarchy of CSS ID belongs to. - */ - unsigned short stack[0]; /* Array of Length (depth+1) */ -}; - /* * cgroup_event represents events which userspace want to receive. */ @@ -387,9 +355,6 @@ struct cgrp_cset_link { static struct css_set init_css_set; static struct cgrp_cset_link init_cgrp_cset_link; -static int cgroup_init_idr(struct cgroup_subsys *ss, - struct cgroup_subsys_state *css); - /* * css_set_lock protects the list of css_set objects, and the chain of * tasks off each css_set. Nests outside task->alloc_lock due to @@ -841,8 +806,6 @@ static struct backing_dev_info cgroup_backing_dev_info = { .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK, }; -static int alloc_css_id(struct cgroup_subsys_state *child_css); - static struct inode *cgroup_new_inode(umode_t mode, struct super_block *sb) { struct inode *inode = new_inode(sb); @@ -4242,21 +4205,6 @@ static int cgroup_populate_dir(struct cgroup *cgrp, unsigned long subsys_mask) goto err; } } - - /* This cgroup is ready now */ - for_each_root_subsys(cgrp->root, ss) { - struct cgroup_subsys_state *css = cgroup_css(cgrp, ss); - struct css_id *id = rcu_dereference_protected(css->id, true); - - /* - * Update id->css pointer and make this css visible from - * CSS ID functions. This pointer will be dereferened - * from RCU-read-side without locks. - */ - if (id) - rcu_assign_pointer(id->css, css); - } - return 0; err: cgroup_clear_dir(cgrp, subsys_mask); @@ -4325,7 +4273,6 @@ static void init_css(struct cgroup_subsys_state *css, struct cgroup_subsys *ss, css->cgroup = cgrp; css->ss = ss; css->flags = 0; - css->id = NULL; if (cgrp->parent) css->parent = cgroup_css(cgrp->parent, ss); @@ -4457,12 +4404,6 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry, goto err_free_all; init_css(css, ss, cgrp); - - if (ss->use_id) { - err = alloc_css_id(css); - if (err) - goto err_free_all; - } } /* @@ -4927,12 +4868,6 @@ int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss) /* our new subsystem will be attached to the dummy hierarchy. */ init_css(css, ss, cgroup_dummy_top); - /* init_idr must be after init_css() because it sets css->id. */ - if (ss->use_id) { - ret = cgroup_init_idr(ss, css); - if (ret) - goto err_unload; - } /* * Now we need to entangle the css into the existing css_sets. unlike @@ -4998,9 +4933,6 @@ void cgroup_unload_subsys(struct cgroup_subsys *ss) offline_css(cgroup_css(cgroup_dummy_top, ss)); - if (ss->use_id) - idr_destroy(&ss->idr); - /* deassign the subsys_id */ cgroup_subsys[ss->subsys_id] = NULL; @@ -5027,8 +4959,7 @@ void cgroup_unload_subsys(struct cgroup_subsys *ss) /* * remove subsystem's css from the cgroup_dummy_top and free it - * need to free before marking as null because ss->css_free needs - * the cgrp->subsys pointer to find their state. note that this - * also takes care of freeing the css_id. + * the cgrp->subsys pointer to find their state. */ ss->css_free(cgroup_css(cgroup_dummy_top, ss)); RCU_INIT_POINTER(cgroup_dummy_top->subsys[ss->subsys_id], NULL); @@ -5099,8 +5030,6 @@ int __init cgroup_init(void) for_each_builtin_subsys(ss, i) { if (!ss->early_init) cgroup_init_subsys(ss); - if (ss->use_id) - cgroup_init_idr(ss, init_css_set.subsys[ss->subsys_id]); } /* allocate id for the dummy hierarchy */ @@ -5520,181 +5449,6 @@ static int __init cgroup_disable(char *str) } __setup("cgroup_disable=", cgroup_disable); -/* - * Functons for CSS ID. - */ - -/* to get ID other than 0, this should be called when !cgroup_is_dead() */ -unsigned short css_id(struct cgroup_subsys_state *css) -{ - struct css_id *cssid; - - /* - * This css_id() can return correct value when somone has refcnt - * on this or this is under rcu_read_lock(). Once css->id is allocated, - * it's unchanged until freed. - */ - cssid = rcu_dereference_raw(css->id); - - if (cssid) - return cssid->id; - return 0; -} -EXPORT_SYMBOL_GPL(css_id); - -/** - * css_is_ancestor - test "root" css is an ancestor of "child" - * @child: the css to be tested. - * @root: the css supporsed to be an ancestor of the child. - * - * Returns true if "root" is an ancestor of "child" in its hierarchy. Because - * this function reads css->id, the caller must hold rcu_read_lock(). - * But, considering usual usage, the csses should be valid objects after test. - * Assuming that the caller will do some action to the child if this returns - * returns true, the caller must take "child";s reference count. - * If "child" is valid object and this returns true, "root" is valid, too. - */ - -bool css_is_ancestor(struct cgroup_subsys_state *child, - const struct cgroup_subsys_state *root) -{ - struct css_id *child_id; - struct css_id *root_id; - - child_id = rcu_dereference(child->id); - if (!child_id) - return false; - root_id = rcu_dereference(root->id); - if (!root_id) - return false; - if (child_id->depth < root_id->depth) - return false; - if (child_id->stack[root_id->depth] != root_id->id) - return false; - return true; -} - -void free_css_id(struct cgroup_subsys *ss, struct cgroup_subsys_state *css) -{ - struct css_id *id = rcu_dereference_protected(css->id, true); - - /* When this is called before css_id initialization, id can be NULL */ - if (!id) - return; - - BUG_ON(!ss->use_id); - - rcu_assign_pointer(id->css, NULL); - rcu_assign_pointer(css->id, NULL); - spin_lock(&ss->id_lock); - idr_remove(&ss->idr, id->id); - spin_unlock(&ss->id_lock); - kfree_rcu(id, rcu_head); -} -EXPORT_SYMBOL_GPL(free_css_id); - -/* - * This is called by init or create(). Then, calls to this function are - * always serialized (By cgroup_mutex() at create()). - */ - -static struct css_id *get_new_cssid(struct cgroup_subsys *ss, int depth) -{ - struct css_id *newid; - int ret, size; - - BUG_ON(!ss->use_id); - - size = sizeof(*newid) + sizeof(unsigned short) * (depth + 1); - newid = kzalloc(size, GFP_KERNEL); - if (!newid) - return ERR_PTR(-ENOMEM); - - idr_preload(GFP_KERNEL); - spin_lock(&ss->id_lock); - /* Don't use 0. allocates an ID of 1-65535 */ - ret = idr_alloc(&ss->idr, newid, 1, CSS_ID_MAX + 1, GFP_NOWAIT); - spin_unlock(&ss->id_lock); - idr_preload_end(); - - /* Returns error when there are no free spaces for new ID.*/ - if (ret < 0) - goto err_out; - - newid->id = ret; - newid->depth = depth; - return newid; -err_out: - kfree(newid); - return ERR_PTR(ret); - -} - -static int __init_or_module cgroup_init_idr(struct cgroup_subsys *ss, - struct cgroup_subsys_state *rootcss) -{ - struct css_id *newid; - - spin_lock_init(&ss->id_lock); - idr_init(&ss->idr); - - newid = get_new_cssid(ss, 0); - if (IS_ERR(newid)) - return PTR_ERR(newid); - - newid->stack[0] = newid->id; - RCU_INIT_POINTER(newid->css, rootcss); - RCU_INIT_POINTER(rootcss->id, newid); - return 0; -} - -static int alloc_css_id(struct cgroup_subsys_state *child_css) -{ - struct cgroup_subsys_state *parent_css = css_parent(child_css); - struct css_id *child_id, *parent_id; - int i, depth; - - parent_id = rcu_dereference_protected(parent_css->id, true); - depth = parent_id->depth + 1; - - child_id = get_new_cssid(child_css->ss, depth); - if (IS_ERR(child_id)) - return PTR_ERR(child_id); - - for (i = 0; i < depth; i++) - child_id->stack[i] = parent_id->stack[i]; - child_id->stack[depth] = child_id->id; - /* - * child_id->css pointer will be set after this cgroup is available - * see cgroup_populate_dir() - */ - rcu_assign_pointer(child_css->id, child_id); - - return 0; -} - -/** - * css_lookup - lookup css by id - * @ss: cgroup subsys to be looked into. - * @id: the id - * - * Returns pointer to cgroup_subsys_state if there is valid one with id. - * NULL if not. Should be called under rcu_read_lock() - */ -struct cgroup_subsys_state *css_lookup(struct cgroup_subsys *ss, int id) -{ - struct css_id *cssid = NULL; - - BUG_ON(!ss->use_id); - cssid = idr_find(&ss->idr, id); - - if (unlikely(!cssid)) - return NULL; - - return rcu_dereference(cssid->css); -} -EXPORT_SYMBOL_GPL(css_lookup); - /** * css_from_dir - get corresponding css from the dentry of a cgroup dir * @dentry: directory dentry of interest From 73ba353471e0b692f398f3d63018b7f46ccf1d3e Mon Sep 17 00:00:00 2001 From: Serge Hallyn Date: Wed, 23 Oct 2013 01:34:00 +0200 Subject: [PATCH 6/6] device_cgroup: remove can_attach It is really only wanting to duplicate a check which is already done by the cgroup subsystem. With this patch, user jdoe still cannot move pid 1 into a devices cgroup he owns, but now he can move his own other tasks into devices cgroups. Signed-off-by: Serge Hallyn Signed-off-by: Tejun Heo Cc: Aristeu Rozanski --- security/device_cgroup.c | 11 ----------- 1 file changed, 11 deletions(-) diff --git a/security/device_cgroup.c b/security/device_cgroup.c index c123628d3f84..7c2a0a71049e 100644 --- a/security/device_cgroup.c +++ b/security/device_cgroup.c @@ -63,16 +63,6 @@ static inline struct dev_cgroup *task_devcgroup(struct task_struct *task) struct cgroup_subsys devices_subsys; -static int devcgroup_can_attach(struct cgroup_subsys_state *new_css, - struct cgroup_taskset *set) -{ - struct task_struct *task = cgroup_taskset_first(set); - - if (current != task && !capable(CAP_SYS_ADMIN)) - return -EPERM; - return 0; -} - /* * called under devcgroup_mutex */ @@ -697,7 +687,6 @@ static struct cftype dev_cgroup_files[] = { struct cgroup_subsys devices_subsys = { .name = "devices", - .can_attach = devcgroup_can_attach, .css_alloc = devcgroup_css_alloc, .css_free = devcgroup_css_free, .css_online = devcgroup_online,