cgroups: subsystem module unloading

Provides support for unloading modular subsystems.

This patch adds a new function, cgroup_unload_subsys, which is to be used
for removing a loaded subsystem during module deletion.  Reference
counting of the subsystems' modules changes from a single reference taken
once at load time to one reference per attached hierarchy (managed in
parse_cgroupfs_options and rebind_subsystems), i.e., the count is 0 or 1.

Signed-off-by: Ben Blum <bblum@andrew.cmu.edu>
Acked-by: Li Zefan <lizf@cn.fujitsu.com>
Cc: Paul Menage <menage@google.com>
Cc: "David S. Miller" <davem@davemloft.net>
Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Lai Jiangshan <laijs@cn.fujitsu.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
This commit is contained in:
Ben Blum 2010-03-10 15:22:09 -08:00 committed by Linus Torvalds
parent e6a1105ba0
commit cf5d5941fd
3 changed files with 148 additions and 28 deletions

View File

@ -489,8 +489,9 @@ Each subsystem should:
- define a cgroup_subsys object called <name>_subsys - define a cgroup_subsys object called <name>_subsys
If a subsystem can be compiled as a module, it should also have in its If a subsystem can be compiled as a module, it should also have in its
module initcall a call to cgroup_load_subsys(&its_subsys_struct). It module initcall a call to cgroup_load_subsys(), and in its exitcall a
should also set its_subsys.module = THIS_MODULE in its .c file. call to cgroup_unload_subsys(). It should also set its_subsys.module =
THIS_MODULE in its .c file.
Each subsystem may export the following methods. The only mandatory Each subsystem may export the following methods. The only mandatory
methods are create/destroy. Any others that are null are presumed to methods are create/destroy. Any others that are null are presumed to

View File

@ -38,6 +38,7 @@ extern void cgroup_exit(struct task_struct *p, int run_callbacks);
extern int cgroupstats_build(struct cgroupstats *stats, extern int cgroupstats_build(struct cgroupstats *stats,
struct dentry *dentry); struct dentry *dentry);
extern int cgroup_load_subsys(struct cgroup_subsys *ss); extern int cgroup_load_subsys(struct cgroup_subsys *ss);
extern void cgroup_unload_subsys(struct cgroup_subsys *ss);
extern const struct file_operations proc_cgroup_operations; extern const struct file_operations proc_cgroup_operations;
@ -271,7 +272,8 @@ struct css_set {
/* /*
* Set of subsystem states, one for each subsystem. This array * Set of subsystem states, one for each subsystem. This array
* is immutable after creation apart from the init_css_set * is immutable after creation apart from the init_css_set
* during subsystem registration (at boot time). * during subsystem registration (at boot time) and modular subsystem
* loading/unloading.
*/ */
struct cgroup_subsys_state *subsys[CGROUP_SUBSYS_COUNT]; struct cgroup_subsys_state *subsys[CGROUP_SUBSYS_COUNT];

View File

@ -894,7 +894,9 @@ void cgroup_release_and_wakeup_rmdir(struct cgroup_subsys_state *css)
} }
/* /*
* Call with cgroup_mutex held. * Call with cgroup_mutex held. Drops reference counts on modules, including
* any duplicate ones that parse_cgroupfs_options took. If this function
* returns an error, no reference counts are touched.
*/ */
static int rebind_subsystems(struct cgroupfs_root *root, static int rebind_subsystems(struct cgroupfs_root *root,
unsigned long final_bits) unsigned long final_bits)
@ -950,6 +952,7 @@ static int rebind_subsystems(struct cgroupfs_root *root,
if (ss->bind) if (ss->bind)
ss->bind(ss, cgrp); ss->bind(ss, cgrp);
mutex_unlock(&ss->hierarchy_mutex); mutex_unlock(&ss->hierarchy_mutex);
/* refcount was already taken, and we're keeping it */
} else if (bit & removed_bits) { } else if (bit & removed_bits) {
/* We're removing this subsystem */ /* We're removing this subsystem */
BUG_ON(ss == NULL); BUG_ON(ss == NULL);
@ -963,10 +966,20 @@ static int rebind_subsystems(struct cgroupfs_root *root,
subsys[i]->root = &rootnode; subsys[i]->root = &rootnode;
list_move(&ss->sibling, &rootnode.subsys_list); list_move(&ss->sibling, &rootnode.subsys_list);
mutex_unlock(&ss->hierarchy_mutex); mutex_unlock(&ss->hierarchy_mutex);
/* subsystem is now free - drop reference on module */
module_put(ss->module);
} else if (bit & final_bits) { } else if (bit & final_bits) {
/* Subsystem state should already exist */ /* Subsystem state should already exist */
BUG_ON(ss == NULL); BUG_ON(ss == NULL);
BUG_ON(!cgrp->subsys[i]); BUG_ON(!cgrp->subsys[i]);
/*
* a refcount was taken, but we already had one, so
* drop the extra reference.
*/
module_put(ss->module);
#ifdef CONFIG_MODULE_UNLOAD
BUG_ON(ss->module && !module_refcount(ss->module));
#endif
} else { } else {
/* Subsystem state shouldn't exist */ /* Subsystem state shouldn't exist */
BUG_ON(cgrp->subsys[i]); BUG_ON(cgrp->subsys[i]);
@ -1010,13 +1023,16 @@ struct cgroup_sb_opts {
/* /*
* Convert a hierarchy specifier into a bitmask of subsystems and flags. Call * Convert a hierarchy specifier into a bitmask of subsystems and flags. Call
* with cgroup_mutex held to protect the subsys[] array. * with cgroup_mutex held to protect the subsys[] array. This function takes
* refcounts on subsystems to be used, unless it returns error, in which case
* no refcounts are taken.
*/ */
static int parse_cgroupfs_options(char *data, static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
struct cgroup_sb_opts *opts)
{ {
char *token, *o = data ?: "all"; char *token, *o = data ?: "all";
unsigned long mask = (unsigned long)-1; unsigned long mask = (unsigned long)-1;
int i;
bool module_pin_failed = false;
BUG_ON(!mutex_is_locked(&cgroup_mutex)); BUG_ON(!mutex_is_locked(&cgroup_mutex));
@ -1031,7 +1047,6 @@ static int parse_cgroupfs_options(char *data,
return -EINVAL; return -EINVAL;
if (!strcmp(token, "all")) { if (!strcmp(token, "all")) {
/* Add all non-disabled subsystems */ /* Add all non-disabled subsystems */
int i;
opts->subsys_bits = 0; opts->subsys_bits = 0;
for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
struct cgroup_subsys *ss = subsys[i]; struct cgroup_subsys *ss = subsys[i];
@ -1054,7 +1069,6 @@ static int parse_cgroupfs_options(char *data,
if (!opts->release_agent) if (!opts->release_agent)
return -ENOMEM; return -ENOMEM;
} else if (!strncmp(token, "name=", 5)) { } else if (!strncmp(token, "name=", 5)) {
int i;
const char *name = token + 5; const char *name = token + 5;
/* Can't specify an empty name */ /* Can't specify an empty name */
if (!strlen(name)) if (!strlen(name))
@ -1078,7 +1092,6 @@ static int parse_cgroupfs_options(char *data,
return -ENOMEM; return -ENOMEM;
} else { } else {
struct cgroup_subsys *ss; struct cgroup_subsys *ss;
int i;
for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
ss = subsys[i]; ss = subsys[i];
if (ss == NULL) if (ss == NULL)
@ -1117,9 +1130,54 @@ static int parse_cgroupfs_options(char *data,
if (!opts->subsys_bits && !opts->name) if (!opts->subsys_bits && !opts->name)
return -EINVAL; return -EINVAL;
/*
* Grab references on all the modules we'll need, so the subsystems
* don't dance around before rebind_subsystems attaches them. This may
* take duplicate reference counts on a subsystem that's already used,
* but rebind_subsystems handles this case.
*/
for (i = CGROUP_BUILTIN_SUBSYS_COUNT; i < CGROUP_SUBSYS_COUNT; i++) {
unsigned long bit = 1UL << i;
if (!(bit & opts->subsys_bits))
continue;
if (!try_module_get(subsys[i]->module)) {
module_pin_failed = true;
break;
}
}
if (module_pin_failed) {
/*
* oops, one of the modules was going away. this means that we
* raced with a module_delete call, and to the user this is
* essentially a "subsystem doesn't exist" case.
*/
for (i--; i >= CGROUP_BUILTIN_SUBSYS_COUNT; i--) {
/* drop refcounts only on the ones we took */
unsigned long bit = 1UL << i;
if (!(bit & opts->subsys_bits))
continue;
module_put(subsys[i]->module);
}
return -ENOENT;
}
return 0; return 0;
} }
/*
 * Undo the module pins taken for a parsed option set: for every modular
 * subsystem slot (ids from CGROUP_BUILTIN_SUBSYS_COUNT upward) whose bit is
 * set in @subsys_bits, drop one reference on that subsystem's module.
 * Slots below CGROUP_BUILTIN_SUBSYS_COUNT are never touched.
 */
static void drop_parsed_module_refcounts(unsigned long subsys_bits)
{
	int id = CGROUP_BUILTIN_SUBSYS_COUNT;

	while (id < CGROUP_SUBSYS_COUNT) {
		/* only release references for subsystems that were selected */
		if (subsys_bits & (1UL << id))
			module_put(subsys[id]->module);
		id++;
	}
}
static int cgroup_remount(struct super_block *sb, int *flags, char *data) static int cgroup_remount(struct super_block *sb, int *flags, char *data)
{ {
int ret = 0; int ret = 0;
@ -1136,21 +1194,19 @@ static int cgroup_remount(struct super_block *sb, int *flags, char *data)
if (ret) if (ret)
goto out_unlock; goto out_unlock;
/* Don't allow flags to change at remount */ /* Don't allow flags or name to change at remount */
if (opts.flags != root->flags) { if (opts.flags != root->flags ||
ret = -EINVAL; (opts.name && strcmp(opts.name, root->name))) {
goto out_unlock;
}
/* Don't allow name to change at remount */
if (opts.name && strcmp(opts.name, root->name)) {
ret = -EINVAL; ret = -EINVAL;
drop_parsed_module_refcounts(opts.subsys_bits);
goto out_unlock; goto out_unlock;
} }
ret = rebind_subsystems(root, opts.subsys_bits); ret = rebind_subsystems(root, opts.subsys_bits);
if (ret) if (ret) {
drop_parsed_module_refcounts(opts.subsys_bits);
goto out_unlock; goto out_unlock;
}
/* (re)populate subsystem files */ /* (re)populate subsystem files */
cgroup_populate_dir(cgrp); cgroup_populate_dir(cgrp);
@ -1349,7 +1405,7 @@ static int cgroup_get_sb(struct file_system_type *fs_type,
new_root = cgroup_root_from_opts(&opts); new_root = cgroup_root_from_opts(&opts);
if (IS_ERR(new_root)) { if (IS_ERR(new_root)) {
ret = PTR_ERR(new_root); ret = PTR_ERR(new_root);
goto out_err; goto drop_modules;
} }
opts.new_root = new_root; opts.new_root = new_root;
@ -1358,7 +1414,7 @@ static int cgroup_get_sb(struct file_system_type *fs_type,
if (IS_ERR(sb)) { if (IS_ERR(sb)) {
ret = PTR_ERR(sb); ret = PTR_ERR(sb);
cgroup_drop_root(opts.new_root); cgroup_drop_root(opts.new_root);
goto out_err; goto drop_modules;
} }
root = sb->s_fs_info; root = sb->s_fs_info;
@ -1414,6 +1470,11 @@ static int cgroup_get_sb(struct file_system_type *fs_type,
free_cg_links(&tmp_cg_links); free_cg_links(&tmp_cg_links);
goto drop_new_super; goto drop_new_super;
} }
/*
* There must be no failure case after here, since rebinding
* takes care of subsystems' refcounts, which are explicitly
* dropped in the failure exit path.
*/
/* EBUSY should be the only error here */ /* EBUSY should be the only error here */
BUG_ON(ret); BUG_ON(ret);
@ -1452,6 +1513,8 @@ static int cgroup_get_sb(struct file_system_type *fs_type,
* any) is not needed * any) is not needed
*/ */
cgroup_drop_root(opts.new_root); cgroup_drop_root(opts.new_root);
/* no subsys rebinding, so refcounts don't change */
drop_parsed_module_refcounts(opts.subsys_bits);
} }
simple_set_mnt(mnt, sb); simple_set_mnt(mnt, sb);
@ -1461,6 +1524,8 @@ static int cgroup_get_sb(struct file_system_type *fs_type,
drop_new_super: drop_new_super:
deactivate_locked_super(sb); deactivate_locked_super(sb);
drop_modules:
drop_parsed_module_refcounts(opts.subsys_bits);
out_err: out_err:
kfree(opts.release_agent); kfree(opts.release_agent);
kfree(opts.name); kfree(opts.name);
@ -3422,19 +3487,71 @@ int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss)
lockdep_set_class(&ss->hierarchy_mutex, &ss->subsys_key); lockdep_set_class(&ss->hierarchy_mutex, &ss->subsys_key);
ss->active = 1; ss->active = 1;
/*
* pin the subsystem's module so it doesn't go away. this shouldn't
* fail, since the module's initcall calls us.
* TODO: with module unloading, move this elsewhere
*/
BUG_ON(!try_module_get(ss->module));
/* success! */ /* success! */
mutex_unlock(&cgroup_mutex); mutex_unlock(&cgroup_mutex);
return 0; return 0;
} }
EXPORT_SYMBOL_GPL(cgroup_load_subsys); EXPORT_SYMBOL_GPL(cgroup_load_subsys);
/**
* cgroup_unload_subsys - unload a modular subsystem
* @ss: the subsystem to unload
*
* This function should be called in a modular subsystem's exitcall. When this
* function is invoked, the refcount on the subsystem's module will be 0, so
* the subsystem will not be attached to any hierarchy.
*
* Takes and releases cgroup_mutex itself, and takes css_set_lock for writing
* while the css_sets linked to dummytop are rehashed. Any violated
* precondition (no module set, subsystem still bound to a root other than
* rootnode, subsys_id in the builtin range) triggers BUG_ON().
*/
void cgroup_unload_subsys(struct cgroup_subsys *ss)
{
struct cg_cgroup_link *link;
struct hlist_head *hhead;
/* only subsystems that declared a module can be unloaded */
BUG_ON(ss->module == NULL);
/*
* we shouldn't be called if the subsystem is in use, and the use of
* try_module_get in parse_cgroupfs_options should ensure that it
* doesn't start being used while we're killing it off.
*
* NOTE(review): ss->root is read here before cgroup_mutex is taken;
* this presumably relies on the module refcount already being 0 -
* confirm against the callers.
*/
BUG_ON(ss->root != &rootnode);
mutex_lock(&cgroup_mutex);
/*
* deassign the subsys_id; ids below CGROUP_BUILTIN_SUBSYS_COUNT are
* rejected, so only a slot in the modular range is ever cleared
*/
BUG_ON(ss->subsys_id < CGROUP_BUILTIN_SUBSYS_COUNT);
subsys[ss->subsys_id] = NULL;
/* remove subsystem from rootnode's list of subsystems */
list_del(&ss->sibling);
/*
* disentangle the css from all css_sets attached to the dummytop. as
* in loading, we need to pay our respects to the hashtable gods:
* css_set_hash() is computed from cg->subsys, so each css_set is
* unhashed, has this subsystem's slot cleared, and is then re-added
* under the bucket for its updated subsys array.
*/
write_lock(&css_set_lock);
list_for_each_entry(link, &dummytop->css_sets, cgrp_link_list) {
struct css_set *cg = link->cg;
hlist_del(&cg->hlist);
/* every css_set visited here is expected to have this slot set */
BUG_ON(!cg->subsys[ss->subsys_id]);
cg->subsys[ss->subsys_id] = NULL;
hhead = css_set_hash(cg->subsys);
hlist_add_head(&cg->hlist, hhead);
}
write_unlock(&css_set_lock);
/*
* remove subsystem's css from the dummytop and free it - need to free
* before marking as null because ss->destroy needs the cgrp->subsys
* pointer to find their state. note that this also takes care of
* freeing the css_id.
*/
ss->destroy(ss, dummytop);
dummytop->subsys[ss->subsys_id] = NULL;
mutex_unlock(&cgroup_mutex);
}
EXPORT_SYMBOL_GPL(cgroup_unload_subsys);
/** /**
* cgroup_init_early - cgroup initialization at system boot * cgroup_init_early - cgroup initialization at system boot
* *