cgroup: introduce cgroup_tree_mutex

Currently cgroup uses combination of inode->i_mutex'es and
cgroup_mutex for synchronization.  With the scheduled kernfs
conversion, i_mutex'es will be removed.  Unfortunately, just using
cgroup_mutex isn't possible.  All kernfs file and syscall operations,
most of which require grabbing cgroup_mutex, will be called with
kernfs active ref held and, if we try to perform kernfs removals under
cgroup_mutex, it can deadlock as kernfs_remove() tries to drain the
target node.

Let's introduce a new outer mutex, cgroup_tree_mutex, which protects
stuff used during hierarchy changing operations - cftypes and all the
operations which may affect the cgroupfs.  It also covers css
association and iteration.  This allows cgroup_css(), for_each_css()
and other css iterators to be called under cgroup_tree_mutex.  The new
mutex will nest above both kernfs's active ref protection and
cgroup_mutex.  By protecting tree modifications with a separate outer
mutex, we can get rid of the forementioned deadlock condition.

Actual file additions and removals now require cgroup_tree_mutex
instead of cgroup_mutex.  Currently, cgroup_tree_mutex is never used
without cgroup_mutex; however, we'll soon add hierarchy modification
sections which are only protected by cgroup_tree_mutex.  In the
future, we might want to make the locking more granular by better
splitting the coverages of the two mutexes.  For now, this should do.

v2: Rebased on top of 0ab02ca8f8 ("cgroup: protect modifications to
    cgroup_idr with cgroup_mutex").

Signed-off-by: Tejun Heo <tj@kernel.org>
Acked-by: Li Zefan <lizefan@huawei.com>
This commit is contained in:
Tejun Heo 2014-02-11 11:52:47 -05:00
parent 5a17f543ed
commit ace2bee813
1 changed files with 53 additions and 13 deletions

View File

@ -67,6 +67,15 @@
*/ */
#define CGROUP_PIDLIST_DESTROY_DELAY HZ #define CGROUP_PIDLIST_DESTROY_DELAY HZ
/*
* cgroup_tree_mutex nests above cgroup_mutex and protects cftypes, file
* creation/removal and hierarchy changing operations including cgroup
* creation, removal, css association and controller rebinding. This outer
* lock is needed mainly to resolve the circular dependency between kernfs
* active ref and cgroup_mutex. cgroup_tree_mutex nests above both.
*/
static DEFINE_MUTEX(cgroup_tree_mutex);
/* /*
* cgroup_mutex is the master lock. Any modification to cgroup or its * cgroup_mutex is the master lock. Any modification to cgroup or its
* hierarchy must be performed while holding it. * hierarchy must be performed while holding it.
@ -84,10 +93,11 @@ static DEFINE_MUTEX(cgroup_mutex);
*/ */
static DEFINE_SPINLOCK(release_agent_path_lock); static DEFINE_SPINLOCK(release_agent_path_lock);
#define cgroup_assert_mutex_or_rcu_locked() \ #define cgroup_assert_mutexes_or_rcu_locked() \
rcu_lockdep_assert(rcu_read_lock_held() || \ rcu_lockdep_assert(rcu_read_lock_held() || \
lockdep_is_held(&cgroup_tree_mutex) || \
lockdep_is_held(&cgroup_mutex), \ lockdep_is_held(&cgroup_mutex), \
"cgroup_mutex or RCU read lock required"); "cgroup_[tree_]mutex or RCU read lock required");
/* /*
* cgroup destruction makes heavy use of work items and there can be a lot * cgroup destruction makes heavy use of work items and there can be a lot
@ -179,7 +189,8 @@ static struct cgroup_subsys_state *cgroup_css(struct cgroup *cgrp,
{ {
if (ss) if (ss)
return rcu_dereference_check(cgrp->subsys[ss->id], return rcu_dereference_check(cgrp->subsys[ss->id],
lockdep_is_held(&cgroup_mutex)); lockdep_is_held(&cgroup_tree_mutex) ||
lockdep_is_held(&cgroup_mutex));
else else
return &cgrp->dummy_css; return &cgrp->dummy_css;
} }
@ -235,6 +246,7 @@ static int notify_on_release(const struct cgroup *cgrp)
for ((ssid) = 0; (ssid) < CGROUP_SUBSYS_COUNT; (ssid)++) \ for ((ssid) = 0; (ssid) < CGROUP_SUBSYS_COUNT; (ssid)++) \
if (!((css) = rcu_dereference_check( \ if (!((css) = rcu_dereference_check( \
(cgrp)->subsys[(ssid)], \ (cgrp)->subsys[(ssid)], \
lockdep_is_held(&cgroup_tree_mutex) || \
lockdep_is_held(&cgroup_mutex)))) { } \ lockdep_is_held(&cgroup_mutex)))) { } \
else else
@ -883,7 +895,7 @@ static void cgroup_rm_file(struct cgroup *cgrp, const struct cftype *cft)
struct cfent *cfe; struct cfent *cfe;
lockdep_assert_held(&cgrp->dentry->d_inode->i_mutex); lockdep_assert_held(&cgrp->dentry->d_inode->i_mutex);
lockdep_assert_held(&cgroup_mutex); lockdep_assert_held(&cgroup_tree_mutex);
/* /*
* If we're doing cleanup due to failure of cgroup_create(), * If we're doing cleanup due to failure of cgroup_create(),
@ -948,7 +960,8 @@ static int rebind_subsystems(struct cgroupfs_root *root,
struct cgroup_subsys *ss; struct cgroup_subsys *ss;
int i, ret; int i, ret;
BUG_ON(!mutex_is_locked(&cgroup_mutex)); lockdep_assert_held(&cgroup_tree_mutex);
lockdep_assert_held(&cgroup_mutex);
/* Check that any added subsystems are currently free */ /* Check that any added subsystems are currently free */
for_each_subsys(ss, i) for_each_subsys(ss, i)
@ -1220,6 +1233,7 @@ static int cgroup_remount(struct super_block *sb, int *flags, char *data)
} }
mutex_lock(&cgrp->dentry->d_inode->i_mutex); mutex_lock(&cgrp->dentry->d_inode->i_mutex);
mutex_lock(&cgroup_tree_mutex);
mutex_lock(&cgroup_mutex); mutex_lock(&cgroup_mutex);
/* See what subsystems are wanted */ /* See what subsystems are wanted */
@ -1263,6 +1277,7 @@ static int cgroup_remount(struct super_block *sb, int *flags, char *data)
kfree(opts.release_agent); kfree(opts.release_agent);
kfree(opts.name); kfree(opts.name);
mutex_unlock(&cgroup_mutex); mutex_unlock(&cgroup_mutex);
mutex_unlock(&cgroup_tree_mutex);
mutex_unlock(&cgrp->dentry->d_inode->i_mutex); mutex_unlock(&cgrp->dentry->d_inode->i_mutex);
return ret; return ret;
} }
@ -1494,6 +1509,7 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
inode = sb->s_root->d_inode; inode = sb->s_root->d_inode;
mutex_lock(&inode->i_mutex); mutex_lock(&inode->i_mutex);
mutex_lock(&cgroup_tree_mutex);
mutex_lock(&cgroup_mutex); mutex_lock(&cgroup_mutex);
ret = idr_alloc(&root->cgroup_idr, root_cgrp, 0, 1, GFP_KERNEL); ret = idr_alloc(&root->cgroup_idr, root_cgrp, 0, 1, GFP_KERNEL);
@ -1568,6 +1584,7 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
BUG_ON(root->number_of_cgroups != 1); BUG_ON(root->number_of_cgroups != 1);
mutex_unlock(&cgroup_mutex); mutex_unlock(&cgroup_mutex);
mutex_unlock(&cgroup_tree_mutex);
mutex_unlock(&inode->i_mutex); mutex_unlock(&inode->i_mutex);
} else { } else {
/* /*
@ -1598,6 +1615,7 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
unlock_drop: unlock_drop:
cgroup_exit_root_id(root); cgroup_exit_root_id(root);
mutex_unlock(&cgroup_mutex); mutex_unlock(&cgroup_mutex);
mutex_unlock(&cgroup_tree_mutex);
mutex_unlock(&inode->i_mutex); mutex_unlock(&inode->i_mutex);
drop_new_super: drop_new_super:
deactivate_locked_super(sb); deactivate_locked_super(sb);
@ -1620,6 +1638,7 @@ static void cgroup_kill_sb(struct super_block *sb)
BUG_ON(!list_empty(&cgrp->children)); BUG_ON(!list_empty(&cgrp->children));
mutex_lock(&cgrp->dentry->d_inode->i_mutex); mutex_lock(&cgrp->dentry->d_inode->i_mutex);
mutex_lock(&cgroup_tree_mutex);
mutex_lock(&cgroup_mutex); mutex_lock(&cgroup_mutex);
/* Rebind all subsystems back to the default hierarchy */ /* Rebind all subsystems back to the default hierarchy */
@ -1650,6 +1669,7 @@ static void cgroup_kill_sb(struct super_block *sb)
cgroup_exit_root_id(root); cgroup_exit_root_id(root);
mutex_unlock(&cgroup_mutex); mutex_unlock(&cgroup_mutex);
mutex_unlock(&cgroup_tree_mutex);
mutex_unlock(&cgrp->dentry->d_inode->i_mutex); mutex_unlock(&cgrp->dentry->d_inode->i_mutex);
simple_xattrs_free(&cgrp->xattrs); simple_xattrs_free(&cgrp->xattrs);
@ -2625,7 +2645,7 @@ static int cgroup_addrm_files(struct cgroup *cgrp, struct cftype cfts[],
int ret; int ret;
lockdep_assert_held(&cgrp->dentry->d_inode->i_mutex); lockdep_assert_held(&cgrp->dentry->d_inode->i_mutex);
lockdep_assert_held(&cgroup_mutex); lockdep_assert_held(&cgroup_tree_mutex);
for (cft = cfts; cft->name[0] != '\0'; cft++) { for (cft = cfts; cft->name[0] != '\0'; cft++) {
/* does cft->flags tell us to skip this file on @cgrp? */ /* does cft->flags tell us to skip this file on @cgrp? */
@ -2659,6 +2679,7 @@ static void cgroup_cfts_prepare(void)
* Instead, we use css_for_each_descendant_pre() and drop RCU read * Instead, we use css_for_each_descendant_pre() and drop RCU read
* lock before calling cgroup_addrm_files(). * lock before calling cgroup_addrm_files().
*/ */
mutex_lock(&cgroup_tree_mutex);
mutex_lock(&cgroup_mutex); mutex_lock(&cgroup_mutex);
} }
@ -2679,6 +2700,7 @@ static int cgroup_cfts_commit(struct cftype *cfts, bool is_add)
if (!cfts || ss->root == &cgroup_dummy_root || if (!cfts || ss->root == &cgroup_dummy_root ||
!atomic_inc_not_zero(&sb->s_active)) { !atomic_inc_not_zero(&sb->s_active)) {
mutex_unlock(&cgroup_mutex); mutex_unlock(&cgroup_mutex);
mutex_unlock(&cgroup_tree_mutex);
return 0; return 0;
} }
@ -2702,7 +2724,9 @@ static int cgroup_cfts_commit(struct cftype *cfts, bool is_add)
prev = cgrp->dentry; prev = cgrp->dentry;
mutex_unlock(&cgroup_mutex); mutex_unlock(&cgroup_mutex);
mutex_unlock(&cgroup_tree_mutex);
mutex_lock(&inode->i_mutex); mutex_lock(&inode->i_mutex);
mutex_lock(&cgroup_tree_mutex);
mutex_lock(&cgroup_mutex); mutex_lock(&cgroup_mutex);
if (cgrp->serial_nr < update_before && !cgroup_is_dead(cgrp)) if (cgrp->serial_nr < update_before && !cgroup_is_dead(cgrp))
ret = cgroup_addrm_files(cgrp, cfts, is_add); ret = cgroup_addrm_files(cgrp, cfts, is_add);
@ -2711,6 +2735,7 @@ static int cgroup_cfts_commit(struct cftype *cfts, bool is_add)
break; break;
} }
mutex_unlock(&cgroup_mutex); mutex_unlock(&cgroup_mutex);
mutex_unlock(&cgroup_tree_mutex);
dput(prev); dput(prev);
deactivate_super(sb); deactivate_super(sb);
return ret; return ret;
@ -2856,7 +2881,7 @@ css_next_child(struct cgroup_subsys_state *pos_css,
struct cgroup *cgrp = parent_css->cgroup; struct cgroup *cgrp = parent_css->cgroup;
struct cgroup *next; struct cgroup *next;
cgroup_assert_mutex_or_rcu_locked(); cgroup_assert_mutexes_or_rcu_locked();
/* /*
* @pos could already have been removed. Once a cgroup is removed, * @pos could already have been removed. Once a cgroup is removed,
@ -2914,7 +2939,7 @@ css_next_descendant_pre(struct cgroup_subsys_state *pos,
{ {
struct cgroup_subsys_state *next; struct cgroup_subsys_state *next;
cgroup_assert_mutex_or_rcu_locked(); cgroup_assert_mutexes_or_rcu_locked();
/* if first iteration, visit @root */ /* if first iteration, visit @root */
if (!pos) if (!pos)
@ -2955,7 +2980,7 @@ css_rightmost_descendant(struct cgroup_subsys_state *pos)
{ {
struct cgroup_subsys_state *last, *tmp; struct cgroup_subsys_state *last, *tmp;
cgroup_assert_mutex_or_rcu_locked(); cgroup_assert_mutexes_or_rcu_locked();
do { do {
last = pos; last = pos;
@ -3003,7 +3028,7 @@ css_next_descendant_post(struct cgroup_subsys_state *pos,
{ {
struct cgroup_subsys_state *next; struct cgroup_subsys_state *next;
cgroup_assert_mutex_or_rcu_locked(); cgroup_assert_mutexes_or_rcu_locked();
/* if first iteration, visit leftmost descendant which may be @root */ /* if first iteration, visit leftmost descendant which may be @root */
if (!pos) if (!pos)
@ -3977,6 +4002,7 @@ static int online_css(struct cgroup_subsys_state *css)
struct cgroup_subsys *ss = css->ss; struct cgroup_subsys *ss = css->ss;
int ret = 0; int ret = 0;
lockdep_assert_held(&cgroup_tree_mutex);
lockdep_assert_held(&cgroup_mutex); lockdep_assert_held(&cgroup_mutex);
if (ss->css_online) if (ss->css_online)
@ -3994,6 +4020,7 @@ static void offline_css(struct cgroup_subsys_state *css)
{ {
struct cgroup_subsys *ss = css->ss; struct cgroup_subsys *ss = css->ss;
lockdep_assert_held(&cgroup_tree_mutex);
lockdep_assert_held(&cgroup_mutex); lockdep_assert_held(&cgroup_mutex);
if (!(css->flags & CSS_ONLINE)) if (!(css->flags & CSS_ONLINE))
@ -4093,6 +4120,8 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
} }
rcu_assign_pointer(cgrp->name, name); rcu_assign_pointer(cgrp->name, name);
mutex_lock(&cgroup_tree_mutex);
/* /*
* Only live parents can have children. Note that the liveliness * Only live parents can have children. Note that the liveliness
* check isn't strictly necessary because cgroup_mkdir() and * check isn't strictly necessary because cgroup_mkdir() and
@ -4102,7 +4131,7 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
*/ */
if (!cgroup_lock_live_group(parent)) { if (!cgroup_lock_live_group(parent)) {
err = -ENODEV; err = -ENODEV;
goto err_free_name; goto err_unlock_tree;
} }
/* /*
@ -4176,6 +4205,7 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
} }
mutex_unlock(&cgroup_mutex); mutex_unlock(&cgroup_mutex);
mutex_unlock(&cgroup_tree_mutex);
mutex_unlock(&cgrp->dentry->d_inode->i_mutex); mutex_unlock(&cgrp->dentry->d_inode->i_mutex);
return 0; return 0;
@ -4186,7 +4216,8 @@ err_free_id:
deactivate_super(sb); deactivate_super(sb);
err_unlock: err_unlock:
mutex_unlock(&cgroup_mutex); mutex_unlock(&cgroup_mutex);
err_free_name: err_unlock_tree:
mutex_unlock(&cgroup_tree_mutex);
kfree(rcu_dereference_raw(cgrp->name)); kfree(rcu_dereference_raw(cgrp->name));
err_free_cgrp: err_free_cgrp:
kfree(cgrp); kfree(cgrp);
@ -4195,6 +4226,7 @@ err_free_cgrp:
err_destroy: err_destroy:
cgroup_destroy_locked(cgrp); cgroup_destroy_locked(cgrp);
mutex_unlock(&cgroup_mutex); mutex_unlock(&cgroup_mutex);
mutex_unlock(&cgroup_tree_mutex);
mutex_unlock(&dentry->d_inode->i_mutex); mutex_unlock(&dentry->d_inode->i_mutex);
return err; return err;
} }
@ -4217,6 +4249,7 @@ static void css_killed_work_fn(struct work_struct *work)
container_of(work, struct cgroup_subsys_state, destroy_work); container_of(work, struct cgroup_subsys_state, destroy_work);
struct cgroup *cgrp = css->cgroup; struct cgroup *cgrp = css->cgroup;
mutex_lock(&cgroup_tree_mutex);
mutex_lock(&cgroup_mutex); mutex_lock(&cgroup_mutex);
/* /*
@ -4234,6 +4267,7 @@ static void css_killed_work_fn(struct work_struct *work)
cgroup_destroy_css_killed(cgrp); cgroup_destroy_css_killed(cgrp);
mutex_unlock(&cgroup_mutex); mutex_unlock(&cgroup_mutex);
mutex_unlock(&cgroup_tree_mutex);
/* /*
* Put the css refs from kill_css(). Each css holds an extra * Put the css refs from kill_css(). Each css holds an extra
@ -4321,6 +4355,7 @@ static int cgroup_destroy_locked(struct cgroup *cgrp)
int ssid; int ssid;
lockdep_assert_held(&d->d_inode->i_mutex); lockdep_assert_held(&d->d_inode->i_mutex);
lockdep_assert_held(&cgroup_tree_mutex);
lockdep_assert_held(&cgroup_mutex); lockdep_assert_held(&cgroup_mutex);
/* /*
@ -4407,6 +4442,7 @@ static void cgroup_destroy_css_killed(struct cgroup *cgrp)
struct cgroup *parent = cgrp->parent; struct cgroup *parent = cgrp->parent;
struct dentry *d = cgrp->dentry; struct dentry *d = cgrp->dentry;
lockdep_assert_held(&cgroup_tree_mutex);
lockdep_assert_held(&cgroup_mutex); lockdep_assert_held(&cgroup_mutex);
/* delete this cgroup from parent->children */ /* delete this cgroup from parent->children */
@ -4422,9 +4458,11 @@ static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry)
{ {
int ret; int ret;
mutex_lock(&cgroup_tree_mutex);
mutex_lock(&cgroup_mutex); mutex_lock(&cgroup_mutex);
ret = cgroup_destroy_locked(dentry->d_fsdata); ret = cgroup_destroy_locked(dentry->d_fsdata);
mutex_unlock(&cgroup_mutex); mutex_unlock(&cgroup_mutex);
mutex_unlock(&cgroup_tree_mutex);
return ret; return ret;
} }
@ -4454,6 +4492,7 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss)
printk(KERN_INFO "Initializing cgroup subsys %s\n", ss->name); printk(KERN_INFO "Initializing cgroup subsys %s\n", ss->name);
mutex_lock(&cgroup_tree_mutex);
mutex_lock(&cgroup_mutex); mutex_lock(&cgroup_mutex);
/* init base cftset */ /* init base cftset */
@ -4482,6 +4521,7 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss)
BUG_ON(online_css(css)); BUG_ON(online_css(css));
mutex_unlock(&cgroup_mutex); mutex_unlock(&cgroup_mutex);
mutex_unlock(&cgroup_tree_mutex);
} }
/** /**
@ -5021,7 +5061,7 @@ struct cgroup_subsys_state *css_from_id(int id, struct cgroup_subsys *ss)
{ {
struct cgroup *cgrp; struct cgroup *cgrp;
cgroup_assert_mutex_or_rcu_locked(); cgroup_assert_mutexes_or_rcu_locked();
cgrp = idr_find(&ss->root->cgroup_idr, id); cgrp = idr_find(&ss->root->cgroup_idr, id);
if (cgrp) if (cgrp)