Merge branch 'sched/cpuset' into sched/urgent

This commit is contained in:
Ingo Molnar 2008-09-06 21:03:16 +02:00
commit 291c54ff76
3 changed files with 199 additions and 140 deletions

View File

@ -160,7 +160,7 @@ static inline int current_cpuset_is_being_rebound(void)
static inline void rebuild_sched_domains(void) static inline void rebuild_sched_domains(void)
{ {
partition_sched_domains(0, NULL, NULL); partition_sched_domains(1, NULL, NULL);
} }
#endif /* !CONFIG_CPUSETS */ #endif /* !CONFIG_CPUSETS */

View File

@ -14,6 +14,8 @@
* 2003-10-22 Updates by Stephen Hemminger. * 2003-10-22 Updates by Stephen Hemminger.
* 2004 May-July Rework by Paul Jackson. * 2004 May-July Rework by Paul Jackson.
* 2006 Rework by Paul Menage to use generic cgroups * 2006 Rework by Paul Menage to use generic cgroups
* 2008 Rework of the scheduler domains and CPU hotplug handling
* by Max Krasnyansky
* *
* This file is subject to the terms and conditions of the GNU General Public * This file is subject to the terms and conditions of the GNU General Public
* License. See the file COPYING in the main directory of the Linux * License. See the file COPYING in the main directory of the Linux
@ -236,9 +238,11 @@ static struct cpuset top_cpuset = {
static DEFINE_MUTEX(callback_mutex); static DEFINE_MUTEX(callback_mutex);
/* This is ugly, but preserves the userspace API for existing cpuset /*
* This is ugly, but preserves the userspace API for existing cpuset
* users. If someone tries to mount the "cpuset" filesystem, we * users. If someone tries to mount the "cpuset" filesystem, we
* silently switch it to mount "cgroup" instead */ * silently switch it to mount "cgroup" instead
*/
static int cpuset_get_sb(struct file_system_type *fs_type, static int cpuset_get_sb(struct file_system_type *fs_type,
int flags, const char *unused_dev_name, int flags, const char *unused_dev_name,
void *data, struct vfsmount *mnt) void *data, struct vfsmount *mnt)
@ -473,10 +477,9 @@ static int validate_change(const struct cpuset *cur, const struct cpuset *trial)
} }
/* /*
* Helper routine for rebuild_sched_domains(). * Helper routine for generate_sched_domains().
* Do cpusets a, b have overlapping cpus_allowed masks? * Do cpusets a, b have overlapping cpus_allowed masks?
*/ */
static int cpusets_overlap(struct cpuset *a, struct cpuset *b) static int cpusets_overlap(struct cpuset *a, struct cpuset *b)
{ {
return cpus_intersects(a->cpus_allowed, b->cpus_allowed); return cpus_intersects(a->cpus_allowed, b->cpus_allowed);
@ -518,26 +521,15 @@ update_domain_attr_tree(struct sched_domain_attr *dattr, struct cpuset *c)
} }
/* /*
* rebuild_sched_domains() * generate_sched_domains()
* *
* This routine will be called to rebuild the scheduler's dynamic * This function builds a partial partition of the systems CPUs
* sched domains: * A 'partial partition' is a set of non-overlapping subsets whose
* - if the flag 'sched_load_balance' of any cpuset with non-empty * union is a subset of that set.
* 'cpus' changes, * The output of this function needs to be passed to kernel/sched.c
* - or if the 'cpus' allowed changes in any cpuset which has that * partition_sched_domains() routine, which will rebuild the scheduler's
* flag enabled, * load balancing domains (sched domains) as specified by that partial
* - or if the 'sched_relax_domain_level' of any cpuset which has * partition.
* that flag enabled and with non-empty 'cpus' changes,
* - or if any cpuset with non-empty 'cpus' is removed,
* - or if a cpu gets offlined.
*
* This routine builds a partial partition of the systems CPUs
* (the set of non-overlappping cpumask_t's in the array 'part'
* below), and passes that partial partition to the kernel/sched.c
* partition_sched_domains() routine, which will rebuild the
* schedulers load balancing domains (sched domains) as specified
* by that partial partition. A 'partial partition' is a set of
* non-overlapping subsets whose union is a subset of that set.
* *
* See "What is sched_load_balance" in Documentation/cpusets.txt * See "What is sched_load_balance" in Documentation/cpusets.txt
* for a background explanation of this. * for a background explanation of this.
@ -547,13 +539,7 @@ update_domain_attr_tree(struct sched_domain_attr *dattr, struct cpuset *c)
* domains when operating in the severe memory shortage situations * domains when operating in the severe memory shortage situations
* that could cause allocation failures below. * that could cause allocation failures below.
* *
* Call with cgroup_mutex held. May take callback_mutex during * Must be called with cgroup_lock held.
* call due to the kfifo_alloc() and kmalloc() calls. May nest
* a call to the get_online_cpus()/put_online_cpus() pair.
* Must not be called holding callback_mutex, because we must not
* call get_online_cpus() while holding callback_mutex. Elsewhere
* the kernel nests callback_mutex inside get_online_cpus() calls.
* So the reverse nesting would risk an ABBA deadlock.
* *
* The three key local variables below are: * The three key local variables below are:
* q - a linked-list queue of cpuset pointers, used to implement a * q - a linked-list queue of cpuset pointers, used to implement a
@ -588,10 +574,10 @@ update_domain_attr_tree(struct sched_domain_attr *dattr, struct cpuset *c)
* element of the partition (one sched domain) to be passed to * element of the partition (one sched domain) to be passed to
* partition_sched_domains(). * partition_sched_domains().
*/ */
static int generate_sched_domains(cpumask_t **domains,
void rebuild_sched_domains(void) struct sched_domain_attr **attributes)
{ {
LIST_HEAD(q); /* queue of cpusets to be scanned*/ LIST_HEAD(q); /* queue of cpusets to be scanned */
struct cpuset *cp; /* scans q */ struct cpuset *cp; /* scans q */
struct cpuset **csa; /* array of all cpuset ptrs */ struct cpuset **csa; /* array of all cpuset ptrs */
int csn; /* how many cpuset ptrs in csa so far */ int csn; /* how many cpuset ptrs in csa so far */
@ -601,23 +587,26 @@ void rebuild_sched_domains(void)
int ndoms; /* number of sched domains in result */ int ndoms; /* number of sched domains in result */
int nslot; /* next empty doms[] cpumask_t slot */ int nslot; /* next empty doms[] cpumask_t slot */
csa = NULL; ndoms = 0;
doms = NULL; doms = NULL;
dattr = NULL; dattr = NULL;
csa = NULL;
/* Special case for the 99% of systems with one, full, sched domain */ /* Special case for the 99% of systems with one, full, sched domain */
if (is_sched_load_balance(&top_cpuset)) { if (is_sched_load_balance(&top_cpuset)) {
ndoms = 1;
doms = kmalloc(sizeof(cpumask_t), GFP_KERNEL); doms = kmalloc(sizeof(cpumask_t), GFP_KERNEL);
if (!doms) if (!doms)
goto rebuild; goto done;
dattr = kmalloc(sizeof(struct sched_domain_attr), GFP_KERNEL); dattr = kmalloc(sizeof(struct sched_domain_attr), GFP_KERNEL);
if (dattr) { if (dattr) {
*dattr = SD_ATTR_INIT; *dattr = SD_ATTR_INIT;
update_domain_attr_tree(dattr, &top_cpuset); update_domain_attr_tree(dattr, &top_cpuset);
} }
*doms = top_cpuset.cpus_allowed; *doms = top_cpuset.cpus_allowed;
goto rebuild;
ndoms = 1;
goto done;
} }
csa = kmalloc(number_of_cpusets * sizeof(cp), GFP_KERNEL); csa = kmalloc(number_of_cpusets * sizeof(cp), GFP_KERNEL);
@ -680,61 +669,141 @@ restart:
} }
} }
/* Convert <csn, csa> to <ndoms, doms> */ /*
* Now we know how many domains to create.
* Convert <csn, csa> to <ndoms, doms> and populate cpu masks.
*/
doms = kmalloc(ndoms * sizeof(cpumask_t), GFP_KERNEL); doms = kmalloc(ndoms * sizeof(cpumask_t), GFP_KERNEL);
if (!doms) if (!doms) {
goto rebuild; ndoms = 0;
goto done;
}
/*
* The rest of the code, including the scheduler, can deal with
* dattr==NULL case. No need to abort if alloc fails.
*/
dattr = kmalloc(ndoms * sizeof(struct sched_domain_attr), GFP_KERNEL); dattr = kmalloc(ndoms * sizeof(struct sched_domain_attr), GFP_KERNEL);
for (nslot = 0, i = 0; i < csn; i++) { for (nslot = 0, i = 0; i < csn; i++) {
struct cpuset *a = csa[i]; struct cpuset *a = csa[i];
cpumask_t *dp;
int apn = a->pn; int apn = a->pn;
if (apn >= 0) { if (apn < 0) {
cpumask_t *dp = doms + nslot; /* Skip completed partitions */
continue;
if (nslot == ndoms) {
static int warnings = 10;
if (warnings) {
printk(KERN_WARNING
"rebuild_sched_domains confused:"
" nslot %d, ndoms %d, csn %d, i %d,"
" apn %d\n",
nslot, ndoms, csn, i, apn);
warnings--;
}
continue;
}
cpus_clear(*dp);
if (dattr)
*(dattr + nslot) = SD_ATTR_INIT;
for (j = i; j < csn; j++) {
struct cpuset *b = csa[j];
if (apn == b->pn) {
cpus_or(*dp, *dp, b->cpus_allowed);
b->pn = -1;
if (dattr)
update_domain_attr_tree(dattr
+ nslot, b);
}
}
nslot++;
} }
dp = doms + nslot;
if (nslot == ndoms) {
static int warnings = 10;
if (warnings) {
printk(KERN_WARNING
"rebuild_sched_domains confused:"
" nslot %d, ndoms %d, csn %d, i %d,"
" apn %d\n",
nslot, ndoms, csn, i, apn);
warnings--;
}
continue;
}
cpus_clear(*dp);
if (dattr)
*(dattr + nslot) = SD_ATTR_INIT;
for (j = i; j < csn; j++) {
struct cpuset *b = csa[j];
if (apn == b->pn) {
cpus_or(*dp, *dp, b->cpus_allowed);
if (dattr)
update_domain_attr_tree(dattr + nslot, b);
/* Done with this partition */
b->pn = -1;
}
}
nslot++;
} }
BUG_ON(nslot != ndoms); BUG_ON(nslot != ndoms);
rebuild:
/* Have scheduler rebuild sched domains */
get_online_cpus();
partition_sched_domains(ndoms, doms, dattr);
put_online_cpus();
done: done:
kfree(csa); kfree(csa);
/* Don't kfree(doms) -- partition_sched_domains() does that. */
/* Don't kfree(dattr) -- partition_sched_domains() does that. */ *domains = doms;
*attributes = dattr;
return ndoms;
}
/*
* Rebuild scheduler domains.
*
* Call with neither cgroup_mutex held nor within get_online_cpus().
* Takes both cgroup_mutex and get_online_cpus().
*
* Cannot be directly called from cpuset code handling changes
* to the cpuset pseudo-filesystem, because it cannot be called
* from code that already holds cgroup_mutex.
*/
static void do_rebuild_sched_domains(struct work_struct *unused)
{
struct sched_domain_attr *attr;
cpumask_t *doms;
int ndoms;
get_online_cpus();
/* Generate domain masks and attrs */
cgroup_lock();
ndoms = generate_sched_domains(&doms, &attr);
cgroup_unlock();
/* Have scheduler rebuild the domains */
partition_sched_domains(ndoms, doms, attr);
put_online_cpus();
}
static DECLARE_WORK(rebuild_sched_domains_work, do_rebuild_sched_domains);
/*
* Rebuild scheduler domains, asynchronously via workqueue.
*
* If the flag 'sched_load_balance' of any cpuset with non-empty
* 'cpus' changes, or if the 'cpus' allowed changes in any cpuset
* which has that flag enabled, or if any cpuset with a non-empty
* 'cpus' is removed, then call this routine to rebuild the
* scheduler's dynamic sched domains.
*
* The rebuild_sched_domains() and partition_sched_domains()
* routines must nest cgroup_lock() inside get_online_cpus(),
* but such cpuset changes as these must nest that locking the
* other way, holding cgroup_lock() for much of the code.
*
* So in order to avoid an ABBA deadlock, the cpuset code handling
* these user changes delegates the actual sched domain rebuilding
* to a separate workqueue thread, which ends up processing the
* above do_rebuild_sched_domains() function.
*/
static void async_rebuild_sched_domains(void)
{
schedule_work(&rebuild_sched_domains_work);
}
/*
* Accomplishes the same scheduler domain rebuild as the above
* async_rebuild_sched_domains(), however it directly calls the
* rebuild routine synchronously rather than calling it via an
* asynchronous work thread.
*
* This can only be called from code that is not holding
* cgroup_mutex (not nested in a cgroup_lock() call.)
*/
void rebuild_sched_domains(void)
{
do_rebuild_sched_domains(NULL);
} }
/** /**
@ -863,7 +932,7 @@ static int update_cpumask(struct cpuset *cs, const char *buf)
return retval; return retval;
if (is_load_balanced) if (is_load_balanced)
rebuild_sched_domains(); async_rebuild_sched_domains();
return 0; return 0;
} }
@ -1090,7 +1159,7 @@ static int update_relax_domain_level(struct cpuset *cs, s64 val)
if (val != cs->relax_domain_level) { if (val != cs->relax_domain_level) {
cs->relax_domain_level = val; cs->relax_domain_level = val;
if (!cpus_empty(cs->cpus_allowed) && is_sched_load_balance(cs)) if (!cpus_empty(cs->cpus_allowed) && is_sched_load_balance(cs))
rebuild_sched_domains(); async_rebuild_sched_domains();
} }
return 0; return 0;
@ -1131,7 +1200,7 @@ static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs,
mutex_unlock(&callback_mutex); mutex_unlock(&callback_mutex);
if (cpus_nonempty && balance_flag_changed) if (cpus_nonempty && balance_flag_changed)
rebuild_sched_domains(); async_rebuild_sched_domains();
return 0; return 0;
} }
@ -1492,6 +1561,9 @@ static u64 cpuset_read_u64(struct cgroup *cont, struct cftype *cft)
default: default:
BUG(); BUG();
} }
/* Unreachable but makes gcc happy */
return 0;
} }
static s64 cpuset_read_s64(struct cgroup *cont, struct cftype *cft) static s64 cpuset_read_s64(struct cgroup *cont, struct cftype *cft)
@ -1504,6 +1576,9 @@ static s64 cpuset_read_s64(struct cgroup *cont, struct cftype *cft)
default: default:
BUG(); BUG();
} }
/* Unrechable but makes gcc happy */
return 0;
} }
@ -1692,15 +1767,9 @@ static struct cgroup_subsys_state *cpuset_create(
} }
/* /*
* Locking note on the strange update_flag() call below:
*
* If the cpuset being removed has its flag 'sched_load_balance' * If the cpuset being removed has its flag 'sched_load_balance'
* enabled, then simulate turning sched_load_balance off, which * enabled, then simulate turning sched_load_balance off, which
* will call rebuild_sched_domains(). The get_online_cpus() * will call async_rebuild_sched_domains().
* call in rebuild_sched_domains() must not be made while holding
* callback_mutex. Elsewhere the kernel nests callback_mutex inside
* get_online_cpus() calls. So the reverse nesting would risk an
* ABBA deadlock.
*/ */
static void cpuset_destroy(struct cgroup_subsys *ss, struct cgroup *cont) static void cpuset_destroy(struct cgroup_subsys *ss, struct cgroup *cont)
@ -1719,7 +1788,7 @@ static void cpuset_destroy(struct cgroup_subsys *ss, struct cgroup *cont)
struct cgroup_subsys cpuset_subsys = { struct cgroup_subsys cpuset_subsys = {
.name = "cpuset", .name = "cpuset",
.create = cpuset_create, .create = cpuset_create,
.destroy = cpuset_destroy, .destroy = cpuset_destroy,
.can_attach = cpuset_can_attach, .can_attach = cpuset_can_attach,
.attach = cpuset_attach, .attach = cpuset_attach,
.populate = cpuset_populate, .populate = cpuset_populate,
@ -1811,7 +1880,7 @@ static void move_member_tasks_to_cpuset(struct cpuset *from, struct cpuset *to)
} }
/* /*
* If common_cpu_mem_hotplug_unplug(), below, unplugs any CPUs * If CPU and/or memory hotplug handlers, below, unplug any CPUs
* or memory nodes, we need to walk over the cpuset hierarchy, * or memory nodes, we need to walk over the cpuset hierarchy,
* removing that CPU or node from all cpusets. If this removes the * removing that CPU or node from all cpusets. If this removes the
* last CPU or node from a cpuset, then move the tasks in the empty * last CPU or node from a cpuset, then move the tasks in the empty
@ -1902,35 +1971,6 @@ static void scan_for_empty_cpusets(const struct cpuset *root)
} }
} }
/*
* The cpus_allowed and mems_allowed nodemasks in the top_cpuset track
* cpu_online_map and node_states[N_HIGH_MEMORY]. Force the top cpuset to
* track what's online after any CPU or memory node hotplug or unplug event.
*
* Since there are two callers of this routine, one for CPU hotplug
* events and one for memory node hotplug events, we could have coded
* two separate routines here. We code it as a single common routine
* in order to minimize text size.
*/
static void common_cpu_mem_hotplug_unplug(int rebuild_sd)
{
cgroup_lock();
top_cpuset.cpus_allowed = cpu_online_map;
top_cpuset.mems_allowed = node_states[N_HIGH_MEMORY];
scan_for_empty_cpusets(&top_cpuset);
/*
* Scheduler destroys domains on hotplug events.
* Rebuild them based on the current settings.
*/
if (rebuild_sd)
rebuild_sched_domains();
cgroup_unlock();
}
/* /*
* The top_cpuset tracks what CPUs and Memory Nodes are online, * The top_cpuset tracks what CPUs and Memory Nodes are online,
* period. This is necessary in order to make cpusets transparent * period. This is necessary in order to make cpusets transparent
@ -1939,40 +1979,52 @@ static void common_cpu_mem_hotplug_unplug(int rebuild_sd)
* *
* This routine ensures that top_cpuset.cpus_allowed tracks * This routine ensures that top_cpuset.cpus_allowed tracks
* cpu_online_map on each CPU hotplug (cpuhp) event. * cpu_online_map on each CPU hotplug (cpuhp) event.
*
* Called within get_online_cpus(). Needs to call cgroup_lock()
* before calling generate_sched_domains().
*/ */
static int cpuset_track_online_cpus(struct notifier_block *unused_nb,
static int cpuset_handle_cpuhp(struct notifier_block *unused_nb,
unsigned long phase, void *unused_cpu) unsigned long phase, void *unused_cpu)
{ {
struct sched_domain_attr *attr;
cpumask_t *doms;
int ndoms;
switch (phase) { switch (phase) {
case CPU_UP_CANCELED:
case CPU_UP_CANCELED_FROZEN:
case CPU_DOWN_FAILED:
case CPU_DOWN_FAILED_FROZEN:
case CPU_ONLINE: case CPU_ONLINE:
case CPU_ONLINE_FROZEN: case CPU_ONLINE_FROZEN:
case CPU_DEAD: case CPU_DEAD:
case CPU_DEAD_FROZEN: case CPU_DEAD_FROZEN:
common_cpu_mem_hotplug_unplug(1);
break; break;
default: default:
return NOTIFY_DONE; return NOTIFY_DONE;
} }
cgroup_lock();
top_cpuset.cpus_allowed = cpu_online_map;
scan_for_empty_cpusets(&top_cpuset);
ndoms = generate_sched_domains(&doms, &attr);
cgroup_unlock();
/* Have scheduler rebuild the domains */
partition_sched_domains(ndoms, doms, attr);
return NOTIFY_OK; return NOTIFY_OK;
} }
#ifdef CONFIG_MEMORY_HOTPLUG #ifdef CONFIG_MEMORY_HOTPLUG
/* /*
* Keep top_cpuset.mems_allowed tracking node_states[N_HIGH_MEMORY]. * Keep top_cpuset.mems_allowed tracking node_states[N_HIGH_MEMORY].
* Call this routine anytime after you change * Call this routine anytime after node_states[N_HIGH_MEMORY] changes.
* node_states[N_HIGH_MEMORY]. * See also the previous routine cpuset_track_online_cpus().
* See also the previous routine cpuset_handle_cpuhp().
*/ */
void cpuset_track_online_nodes(void) void cpuset_track_online_nodes(void)
{ {
common_cpu_mem_hotplug_unplug(0); cgroup_lock();
top_cpuset.mems_allowed = node_states[N_HIGH_MEMORY];
scan_for_empty_cpusets(&top_cpuset);
cgroup_unlock();
} }
#endif #endif
@ -1987,7 +2039,7 @@ void __init cpuset_init_smp(void)
top_cpuset.cpus_allowed = cpu_online_map; top_cpuset.cpus_allowed = cpu_online_map;
top_cpuset.mems_allowed = node_states[N_HIGH_MEMORY]; top_cpuset.mems_allowed = node_states[N_HIGH_MEMORY];
hotcpu_notifier(cpuset_handle_cpuhp, 0); hotcpu_notifier(cpuset_track_online_cpus, 0);
} }
/** /**

View File

@ -7696,24 +7696,27 @@ static int dattrs_equal(struct sched_domain_attr *cur, int idx_cur,
* and partition_sched_domains() will fallback to the single partition * and partition_sched_domains() will fallback to the single partition
* 'fallback_doms', it also forces the domains to be rebuilt. * 'fallback_doms', it also forces the domains to be rebuilt.
* *
* If doms_new==NULL it will be replaced with cpu_online_map.
* ndoms_new==0 is a special case for destroying existing domains.
* It will not create the default domain.
*
* Call with hotplug lock held * Call with hotplug lock held
*/ */
void partition_sched_domains(int ndoms_new, cpumask_t *doms_new, void partition_sched_domains(int ndoms_new, cpumask_t *doms_new,
struct sched_domain_attr *dattr_new) struct sched_domain_attr *dattr_new)
{ {
int i, j; int i, j, n;
mutex_lock(&sched_domains_mutex); mutex_lock(&sched_domains_mutex);
/* always unregister in case we don't destroy any domains */ /* always unregister in case we don't destroy any domains */
unregister_sched_domain_sysctl(); unregister_sched_domain_sysctl();
if (doms_new == NULL) n = doms_new ? ndoms_new : 0;
ndoms_new = 0;
/* Destroy deleted domains */ /* Destroy deleted domains */
for (i = 0; i < ndoms_cur; i++) { for (i = 0; i < ndoms_cur; i++) {
for (j = 0; j < ndoms_new; j++) { for (j = 0; j < n; j++) {
if (cpus_equal(doms_cur[i], doms_new[j]) if (cpus_equal(doms_cur[i], doms_new[j])
&& dattrs_equal(dattr_cur, i, dattr_new, j)) && dattrs_equal(dattr_cur, i, dattr_new, j))
goto match1; goto match1;
@ -7726,7 +7729,6 @@ match1:
if (doms_new == NULL) { if (doms_new == NULL) {
ndoms_cur = 0; ndoms_cur = 0;
ndoms_new = 1;
doms_new = &fallback_doms; doms_new = &fallback_doms;
cpus_andnot(doms_new[0], cpu_online_map, cpu_isolated_map); cpus_andnot(doms_new[0], cpu_online_map, cpu_isolated_map);
dattr_new = NULL; dattr_new = NULL;
@ -7763,8 +7765,13 @@ match2:
int arch_reinit_sched_domains(void) int arch_reinit_sched_domains(void)
{ {
get_online_cpus(); get_online_cpus();
/* Destroy domains first to force the rebuild */
partition_sched_domains(0, NULL, NULL);
rebuild_sched_domains(); rebuild_sched_domains();
put_online_cpus(); put_online_cpus();
return 0; return 0;
} }
@ -7848,7 +7855,7 @@ static int update_sched_domains(struct notifier_block *nfb,
case CPU_ONLINE_FROZEN: case CPU_ONLINE_FROZEN:
case CPU_DEAD: case CPU_DEAD:
case CPU_DEAD_FROZEN: case CPU_DEAD_FROZEN:
partition_sched_domains(0, NULL, NULL); partition_sched_domains(1, NULL, NULL);
return NOTIFY_OK; return NOTIFY_OK;
default: default: