Review summary (free text ahead of the first "diff --git" line is ignored
by patch tools).

What the patch does, as shown by the hunks below:

  * Adds a per-subsystem flag, cgroup_subsys.__DEPRECATED_clear_css_refs,
    and mirrors it into each css as the CSS_CLEAR_CSS_REFS flag bit in
    init_cgroup_css().
  * Flag set (only mem_cgroup_subsys sets it here): cgroup removal keeps
    the existing behaviour - cgroup_clear_css_refs() fails unless the css
    refcnt is 1 at deactivation, and ->pre_destroy() may fail.
  * Flag clear: cgroup_create() takes one extra dentry reference per such
    subsystem (dget), cgroup_clear_css_refs() ignores that css's refcnt,
    and __css_put() schedules css->dput_work when the refcnt reaches 0 so
    that css_dput_fn() drops the dentry reference from a work item.
    cgroup_call_pre_destroy() warns once if ->pre_destroy() fails for such
    a subsystem.

Notes for whoever applies this:

  * NOTE(review): the reviewed copy had lost the header names on every
    "#include" line and all line structure (newlines, tabs, blank context
    lines). The added include is restored as <linux/workqueue.h> because
    the patch uses struct work_struct, INIT_WORK() and schedule_work().
    The three context includes are presumed to be prio_heap.h, rwsem.h and
    idr.h - confirm against include/linux/cgroup.h at index be81fafae11f.
    Intra-line spacing inside comments could not be recovered either; if a
    context line fails to match, apply with whitespace ignored.
  * NOTE(review): in __css_put() the decrement (atomic_dec) and the value
    that is switched on (css_refcnt) are two separate operations. It looks
    like two concurrent puts could both observe the same count, skipping
    "case 1" or running "case 0" twice - consider deriving the value from
    atomic_dec_return() instead. Not changed here because the fix needs a
    local declared outside the visible hunk.

diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index be81fafae11f..565c8034e6c8 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -16,6 +16,7 @@
 #include <linux/prio_heap.h>
 #include <linux/rwsem.h>
 #include <linux/idr.h>
+#include <linux/workqueue.h>
 
 #ifdef CONFIG_CGROUPS
 
@@ -76,12 +77,16 @@ struct cgroup_subsys_state {
 	unsigned long flags;
 	/* ID for this css, if possible */
 	struct css_id __rcu *id;
+
+	/* Used to put @cgroup->dentry on the last css_put() */
+	struct work_struct dput_work;
 };
 
 /* bits in struct cgroup_subsys_state flags field */
 enum {
 	CSS_ROOT, /* This CSS is the root of the subsystem */
 	CSS_REMOVED, /* This CSS is dead */
+	CSS_CLEAR_CSS_REFS, /* @ss->__DEPRECATED_clear_css_refs */
 };
 
 /* Caller must verify that the css is not for root cgroup */
@@ -480,6 +485,18 @@ struct cgroup_subsys {
 	 * (not available in early_init time.)
 	 */
 	bool use_id;
+
+	/*
+	 * If %true, cgroup removal will try to clear css refs by retrying
+	 * ss->pre_destroy() until there's no css ref left. This behavior
+	 * is strictly for backward compatibility and will be removed as
+	 * soon as the current user (memcg) is updated.
+	 *
+	 * If %false, ss->pre_destroy() can't fail and cgroup removal won't
+	 * wait for css refs to drop to zero before proceeding.
+	 */
+	bool __DEPRECATED_clear_css_refs;
+
 #define MAX_CGROUP_TYPE_NAMELEN 32
 	const char *name;
 
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 2eade5186604..2905977e0f33 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -854,12 +854,17 @@ static int cgroup_call_pre_destroy(struct cgroup *cgrp)
 	struct cgroup_subsys *ss;
 	int ret = 0;
 
-	for_each_subsys(cgrp->root, ss)
-		if (ss->pre_destroy) {
-			ret = ss->pre_destroy(cgrp);
-			if (ret)
-				break;
+	for_each_subsys(cgrp->root, ss) {
+		if (!ss->pre_destroy)
+			continue;
+
+		ret = ss->pre_destroy(cgrp);
+		if (ret) {
+			/* ->pre_destroy() failure is being deprecated */
+			WARN_ON_ONCE(!ss->__DEPRECATED_clear_css_refs);
+			break;
 		}
+	}
 
 	return ret;
 }
@@ -3859,6 +3864,14 @@ static int cgroup_populate_dir(struct cgroup *cgrp)
 	return 0;
 }
 
+static void css_dput_fn(struct work_struct *work)
+{
+	struct cgroup_subsys_state *css =
+		container_of(work, struct cgroup_subsys_state, dput_work);
+
+	dput(css->cgroup->dentry);
+}
+
 static void init_cgroup_css(struct cgroup_subsys_state *css,
 			    struct cgroup_subsys *ss,
 			    struct cgroup *cgrp)
@@ -3871,6 +3884,16 @@ static void init_cgroup_css(struct cgroup_subsys_state *css,
 		set_bit(CSS_ROOT, &css->flags);
 	BUG_ON(cgrp->subsys[ss->subsys_id]);
 	cgrp->subsys[ss->subsys_id] = css;
+
+	/*
+	 * If !clear_css_refs, css holds an extra ref to @cgrp->dentry
+	 * which is put on the last css_put(). dput() requires process
+	 * context, which css_put() may be called without. @css->dput_work
+	 * will be used to invoke dput() asynchronously from css_put().
+	 */
+	INIT_WORK(&css->dput_work, css_dput_fn);
+	if (ss->__DEPRECATED_clear_css_refs)
+		set_bit(CSS_CLEAR_CSS_REFS, &css->flags);
 }
 
 static void cgroup_lock_hierarchy(struct cgroupfs_root *root)
@@ -3973,6 +3996,11 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
 	if (err < 0)
 		goto err_remove;
 
+	/* If !clear_css_refs, each css holds a ref to the cgroup's dentry */
+	for_each_subsys(root, ss)
+		if (!ss->__DEPRECATED_clear_css_refs)
+			dget(dentry);
+
 	/* The cgroup directory was pre-locked for us */
 	BUG_ON(!mutex_is_locked(&cgrp->dentry->d_inode->i_mutex));
 
@@ -4062,8 +4090,24 @@ static int cgroup_has_css_refs(struct cgroup *cgrp)
  * Atomically mark all (or else none) of the cgroup's CSS objects as
  * CSS_REMOVED. Return true on success, or false if the cgroup has
  * busy subsystems. Call with cgroup_mutex held
+ *
+ * Depending on whether a subsys has __DEPRECATED_clear_css_refs set or
+ * not, cgroup removal behaves differently.
+ *
+ * If clear is set, css refcnt for the subsystem should be zero before
+ * cgroup removal can be committed. This is implemented by
+ * CGRP_WAIT_ON_RMDIR and retry logic around ->pre_destroy(), which may be
+ * called multiple times until all css refcnts reach zero and is allowed to
+ * veto removal on any invocation. This behavior is deprecated and will be
+ * removed as soon as the existing user (memcg) is updated.
+ *
+ * If clear is not set, each css holds an extra reference to the cgroup's
+ * dentry and cgroup removal proceeds regardless of css refs.
+ * ->pre_destroy() will be called at least once and is not allowed to fail.
+ * On the last put of each css, whenever that may be, the extra dentry ref
+ * is put so that dentry destruction happens only after all css's are
+ * released.
  */
-
 static int cgroup_clear_css_refs(struct cgroup *cgrp)
 {
 	struct cgroup_subsys *ss;
@@ -4074,14 +4118,17 @@ static int cgroup_clear_css_refs(struct cgroup *cgrp)
 
 	/*
 	 * Block new css_tryget() by deactivating refcnt. If all refcnts
-	 * were 1 at the moment of deactivation, we succeeded.
+	 * for subsystems w/ clear_css_refs set were 1 at the moment of
+	 * deactivation, we succeeded.
 	 */
 	for_each_subsys(cgrp->root, ss) {
 		struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id];
 
 		WARN_ON(atomic_read(&css->refcnt) < 0);
 		atomic_add(CSS_DEACT_BIAS, &css->refcnt);
-		failed |= css_refcnt(css) != 1;
+
+		if (ss->__DEPRECATED_clear_css_refs)
+			failed |= css_refcnt(css) != 1;
 	}
 
 	/*
@@ -4917,12 +4964,18 @@ void __css_put(struct cgroup_subsys_state *css)
 
 	rcu_read_lock();
 	atomic_dec(&css->refcnt);
-	if (css_refcnt(css) == 1) {
+	switch (css_refcnt(css)) {
+	case 1:
 		if (notify_on_release(cgrp)) {
 			set_bit(CGRP_RELEASABLE, &cgrp->flags);
 			check_for_release(cgrp);
 		}
 		cgroup_wakeup_rmdir_waiter(cgrp);
+		break;
+	case 0:
+		if (!test_bit(CSS_CLEAR_CSS_REFS, &css->flags))
+			schedule_work(&css->dput_work);
+		break;
 	}
 	rcu_read_unlock();
 }
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index bef114258bbd..d28359cd6b55 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -5635,6 +5635,7 @@ struct cgroup_subsys mem_cgroup_subsys = {
 	.base_cftypes = mem_cgroup_files,
 	.early_init = 0,
 	.use_id = 1,
+	.__DEPRECATED_clear_css_refs = true,
 };
 
 #ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP