Merge branch 'x86-ras-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip into next

Pull x86 RAS changes from Ingo Molnar:
 "Improve mcheck device initialization and bootstrap robustness"

* 'x86-ras-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip:
  mce: Panic when a core has reached a timeout
  x86/mce: Improve mcheck_init_device() error handling
This commit is contained in:
Linus Torvalds 2014-06-03 15:47:40 -07:00
commit 06b77b9733
1 changed file with 41 additions and 9 deletions

View File

@@ -704,8 +704,7 @@ static int mce_timed_out(u64 *t)
if (!mca_cfg.monarch_timeout)
goto out;
if ((s64)*t < SPINUNIT) {
/* CHECKME: Make panic default for 1 too? */
if (mca_cfg.tolerant < 1)
if (mca_cfg.tolerant <= 1)
mce_panic("Timeout synchronizing machine check over CPUs",
NULL, NULL);
cpu_missing = 1;
@@ -2437,32 +2436,65 @@ static __init int mcheck_init_device(void)
int err;
int i = 0;
if (!mce_available(&boot_cpu_data))
return -EIO;
if (!mce_available(&boot_cpu_data)) {
err = -EIO;
goto err_out;
}
zalloc_cpumask_var(&mce_device_initialized, GFP_KERNEL);
if (!zalloc_cpumask_var(&mce_device_initialized, GFP_KERNEL)) {
err = -ENOMEM;
goto err_out;
}
mce_init_banks();
err = subsys_system_register(&mce_subsys, NULL);
if (err)
return err;
goto err_out_mem;
cpu_notifier_register_begin();
for_each_online_cpu(i) {
err = mce_device_create(i);
if (err) {
cpu_notifier_register_done();
return err;
goto err_device_create;
}
}
register_syscore_ops(&mce_syscore_ops);
__register_hotcpu_notifier(&mce_cpu_notifier);
cpu_notifier_register_done();
register_syscore_ops(&mce_syscore_ops);
/* register character device /dev/mcelog */
misc_register(&mce_chrdev_device);
err = misc_register(&mce_chrdev_device);
if (err)
goto err_register;
return 0;
err_register:
unregister_syscore_ops(&mce_syscore_ops);
cpu_notifier_register_begin();
__unregister_hotcpu_notifier(&mce_cpu_notifier);
cpu_notifier_register_done();
err_device_create:
/*
* We didn't keep track of which devices were created above, but
* even if we had, the set of online cpus might have changed.
* Play safe and remove for every possible cpu, since
* mce_device_remove() will do the right thing.
*/
for_each_possible_cpu(i)
mce_device_remove(i);
err_out_mem:
free_cpumask_var(mce_device_initialized);
err_out:
pr_err("Unable to init device /dev/mcelog (rc: %d)\n", err);
return err;
}