Merge branch 'for-4.5-fixes' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/cgroup
Pull cgroup fixes from Tejun Heo: - The destruction path of cgroup objects is asynchronous and multi-staged, and in some cases parents ended up being destroyed before their children, leading to failures in the cpu and memory controllers. Ensure that parents are always destroyed after children. - cpuset mm node migration was performed synchronously while holding the threadgroup and cgroup mutexes, and the recent threadgroup locking update resulted in a possible deadlock. The migration is best effort and shouldn't have been performed under those locks to begin with. It is now performed asynchronously. - Minor documentation fix. * 'for-4.5-fixes' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/cgroup: Documentation: cgroup: Fix 'cgroup-legacy' -> 'cgroup-v1' cgroup: make sure a parent css isn't freed before its children cgroup: make sure a parent css isn't offlined before its children cpuset: make mm migration asynchronous
This commit is contained in:
commit
fb0dc5f129
|
@ -7,7 +7,7 @@ This is the authoritative documentation on the design, interface and
|
|||
conventions of cgroup v2. It describes all userland-visible aspects
|
||||
of cgroup including core and specific controller behaviors. All
|
||||
future changes must be reflected in this document. Documentation for
|
||||
v1 is available under Documentation/cgroup-legacy/.
|
||||
v1 is available under Documentation/cgroup-v1/.
|
||||
|
||||
CONTENTS
|
||||
|
||||
|
|
|
@ -127,6 +127,12 @@ struct cgroup_subsys_state {
|
|||
*/
|
||||
u64 serial_nr;
|
||||
|
||||
/*
|
||||
* Incremented by online self and children. Used to guarantee that
|
||||
* parents are not offlined before their children.
|
||||
*/
|
||||
atomic_t online_cnt;
|
||||
|
||||
/* percpu_ref killing and RCU release */
|
||||
struct rcu_head rcu_head;
|
||||
struct work_struct destroy_work;
|
||||
|
|
|
@ -137,6 +137,8 @@ static inline void set_mems_allowed(nodemask_t nodemask)
|
|||
task_unlock(current);
|
||||
}
|
||||
|
||||
extern void cpuset_post_attach_flush(void);
|
||||
|
||||
#else /* !CONFIG_CPUSETS */
|
||||
|
||||
static inline bool cpusets_enabled(void) { return false; }
|
||||
|
@ -243,6 +245,10 @@ static inline bool read_mems_allowed_retry(unsigned int seq)
|
|||
return false;
|
||||
}
|
||||
|
||||
static inline void cpuset_post_attach_flush(void)
|
||||
{
|
||||
}
|
||||
|
||||
#endif /* !CONFIG_CPUSETS */
|
||||
|
||||
#endif /* _LINUX_CPUSET_H */
|
||||
|
|
|
@ -58,6 +58,7 @@
|
|||
#include <linux/kthread.h>
|
||||
#include <linux/delay.h>
|
||||
#include <linux/atomic.h>
|
||||
#include <linux/cpuset.h>
|
||||
#include <net/sock.h>
|
||||
|
||||
/*
|
||||
|
@ -2739,6 +2740,7 @@ out_unlock_rcu:
|
|||
out_unlock_threadgroup:
|
||||
percpu_up_write(&cgroup_threadgroup_rwsem);
|
||||
cgroup_kn_unlock(of->kn);
|
||||
cpuset_post_attach_flush();
|
||||
return ret ?: nbytes;
|
||||
}
|
||||
|
||||
|
@ -4655,14 +4657,15 @@ static void css_free_work_fn(struct work_struct *work)
|
|||
|
||||
if (ss) {
|
||||
/* css free path */
|
||||
struct cgroup_subsys_state *parent = css->parent;
|
||||
int id = css->id;
|
||||
|
||||
if (css->parent)
|
||||
css_put(css->parent);
|
||||
|
||||
ss->css_free(css);
|
||||
cgroup_idr_remove(&ss->css_idr, id);
|
||||
cgroup_put(cgrp);
|
||||
|
||||
if (parent)
|
||||
css_put(parent);
|
||||
} else {
|
||||
/* cgroup free path */
|
||||
atomic_dec(&cgrp->root->nr_cgrps);
|
||||
|
@ -4758,6 +4761,7 @@ static void init_and_link_css(struct cgroup_subsys_state *css,
|
|||
INIT_LIST_HEAD(&css->sibling);
|
||||
INIT_LIST_HEAD(&css->children);
|
||||
css->serial_nr = css_serial_nr_next++;
|
||||
atomic_set(&css->online_cnt, 0);
|
||||
|
||||
if (cgroup_parent(cgrp)) {
|
||||
css->parent = cgroup_css(cgroup_parent(cgrp), ss);
|
||||
|
@ -4780,6 +4784,10 @@ static int online_css(struct cgroup_subsys_state *css)
|
|||
if (!ret) {
|
||||
css->flags |= CSS_ONLINE;
|
||||
rcu_assign_pointer(css->cgroup->subsys[ss->id], css);
|
||||
|
||||
atomic_inc(&css->online_cnt);
|
||||
if (css->parent)
|
||||
atomic_inc(&css->parent->online_cnt);
|
||||
}
|
||||
return ret;
|
||||
}
|
||||
|
@ -5017,10 +5025,15 @@ static void css_killed_work_fn(struct work_struct *work)
|
|||
container_of(work, struct cgroup_subsys_state, destroy_work);
|
||||
|
||||
mutex_lock(&cgroup_mutex);
|
||||
offline_css(css);
|
||||
mutex_unlock(&cgroup_mutex);
|
||||
|
||||
css_put(css);
|
||||
do {
|
||||
offline_css(css);
|
||||
css_put(css);
|
||||
/* @css can't go away while we're holding cgroup_mutex */
|
||||
css = css->parent;
|
||||
} while (css && atomic_dec_and_test(&css->online_cnt));
|
||||
|
||||
mutex_unlock(&cgroup_mutex);
|
||||
}
|
||||
|
||||
/* css kill confirmation processing requires process context, bounce */
|
||||
|
@ -5029,8 +5042,10 @@ static void css_killed_ref_fn(struct percpu_ref *ref)
|
|||
struct cgroup_subsys_state *css =
|
||||
container_of(ref, struct cgroup_subsys_state, refcnt);
|
||||
|
||||
INIT_WORK(&css->destroy_work, css_killed_work_fn);
|
||||
queue_work(cgroup_destroy_wq, &css->destroy_work);
|
||||
if (atomic_dec_and_test(&css->online_cnt)) {
|
||||
INIT_WORK(&css->destroy_work, css_killed_work_fn);
|
||||
queue_work(cgroup_destroy_wq, &css->destroy_work);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
|
@ -287,6 +287,8 @@ static struct cpuset top_cpuset = {
|
|||
static DEFINE_MUTEX(cpuset_mutex);
|
||||
static DEFINE_SPINLOCK(callback_lock);
|
||||
|
||||
static struct workqueue_struct *cpuset_migrate_mm_wq;
|
||||
|
||||
/*
|
||||
* CPU / memory hotplug is handled asynchronously.
|
||||
*/
|
||||
|
@ -972,31 +974,51 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs,
|
|||
}
|
||||
|
||||
/*
|
||||
* cpuset_migrate_mm
|
||||
*
|
||||
* Migrate memory region from one set of nodes to another.
|
||||
*
|
||||
* Temporarily set the task's mems_allowed to the target nodes of migration,
|
||||
* so that the migration code can allocate pages on these nodes.
|
||||
*
|
||||
* While the mm_struct we are migrating is typically from some
|
||||
* other task, the task_struct mems_allowed that we are hacking
|
||||
* is for our current task, which must allocate new pages for that
|
||||
* migrating memory region.
|
||||
* Migrate memory region from one set of nodes to another. This is
|
||||
* performed asynchronously as it can be called from process migration path
|
||||
* holding locks involved in process management. All mm migrations are
|
||||
* performed in the queued order and can be waited for by flushing
|
||||
* cpuset_migrate_mm_wq.
|
||||
*/
|
||||
|
||||
struct cpuset_migrate_mm_work {
|
||||
struct work_struct work;
|
||||
struct mm_struct *mm;
|
||||
nodemask_t from;
|
||||
nodemask_t to;
|
||||
};
|
||||
|
||||
static void cpuset_migrate_mm_workfn(struct work_struct *work)
|
||||
{
|
||||
struct cpuset_migrate_mm_work *mwork =
|
||||
container_of(work, struct cpuset_migrate_mm_work, work);
|
||||
|
||||
/* on a wq worker, no need to worry about %current's mems_allowed */
|
||||
do_migrate_pages(mwork->mm, &mwork->from, &mwork->to, MPOL_MF_MOVE_ALL);
|
||||
mmput(mwork->mm);
|
||||
kfree(mwork);
|
||||
}
|
||||
|
||||
static void cpuset_migrate_mm(struct mm_struct *mm, const nodemask_t *from,
|
||||
const nodemask_t *to)
|
||||
{
|
||||
struct task_struct *tsk = current;
|
||||
struct cpuset_migrate_mm_work *mwork;
|
||||
|
||||
tsk->mems_allowed = *to;
|
||||
mwork = kzalloc(sizeof(*mwork), GFP_KERNEL);
|
||||
if (mwork) {
|
||||
mwork->mm = mm;
|
||||
mwork->from = *from;
|
||||
mwork->to = *to;
|
||||
INIT_WORK(&mwork->work, cpuset_migrate_mm_workfn);
|
||||
queue_work(cpuset_migrate_mm_wq, &mwork->work);
|
||||
} else {
|
||||
mmput(mm);
|
||||
}
|
||||
}
|
||||
|
||||
do_migrate_pages(mm, from, to, MPOL_MF_MOVE_ALL);
|
||||
|
||||
rcu_read_lock();
|
||||
guarantee_online_mems(task_cs(tsk), &tsk->mems_allowed);
|
||||
rcu_read_unlock();
|
||||
void cpuset_post_attach_flush(void)
|
||||
{
|
||||
flush_workqueue(cpuset_migrate_mm_wq);
|
||||
}
|
||||
|
||||
/*
|
||||
|
@ -1097,7 +1119,8 @@ static void update_tasks_nodemask(struct cpuset *cs)
|
|||
mpol_rebind_mm(mm, &cs->mems_allowed);
|
||||
if (migrate)
|
||||
cpuset_migrate_mm(mm, &cs->old_mems_allowed, &newmems);
|
||||
mmput(mm);
|
||||
else
|
||||
mmput(mm);
|
||||
}
|
||||
css_task_iter_end(&it);
|
||||
|
||||
|
@ -1545,11 +1568,11 @@ static void cpuset_attach(struct cgroup_taskset *tset)
|
|||
* @old_mems_allowed is the right nodesets that we
|
||||
* migrate mm from.
|
||||
*/
|
||||
if (is_memory_migrate(cs)) {
|
||||
if (is_memory_migrate(cs))
|
||||
cpuset_migrate_mm(mm, &oldcs->old_mems_allowed,
|
||||
&cpuset_attach_nodemask_to);
|
||||
}
|
||||
mmput(mm);
|
||||
else
|
||||
mmput(mm);
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -1714,6 +1737,7 @@ out_unlock:
|
|||
mutex_unlock(&cpuset_mutex);
|
||||
kernfs_unbreak_active_protection(of->kn);
|
||||
css_put(&cs->css);
|
||||
flush_workqueue(cpuset_migrate_mm_wq);
|
||||
return retval ?: nbytes;
|
||||
}
|
||||
|
||||
|
@ -2359,6 +2383,9 @@ void __init cpuset_init_smp(void)
|
|||
top_cpuset.effective_mems = node_states[N_MEMORY];
|
||||
|
||||
register_hotmemory_notifier(&cpuset_track_online_nodes_nb);
|
||||
|
||||
cpuset_migrate_mm_wq = alloc_ordered_workqueue("cpuset_migrate_mm", 0);
|
||||
BUG_ON(!cpuset_migrate_mm_wq);
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
Loading…
Reference in New Issue