// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * x86 SMP booting functions
 *
 * (c) 1995 Alan Cox, Building #3 <alan@lxorguk.ukuu.org.uk>
 * (c) 1998, 1999, 2000, 2009 Ingo Molnar <mingo@redhat.com>
 * Copyright 2001 Andi Kleen, SuSE Labs.
 *
 * Much of the core SMP work is based on previous work by Thomas Radke, to
 * whom a great many thanks are extended.
 *
 * Thanks to Intel for making available several different Pentium,
 * Pentium Pro and Pentium-II/Xeon MP machines.
 * Original development of Linux SMP code supported by Caldera.
 *
 * Fixes
 *	Felix Koop		:	NR_CPUS used properly
 *	Jose Renau		:	Handle single CPU case.
 *	Alan Cox		:	By repeated request 8) - Total BogoMIPS report.
 *	Greg Wright		:	Fix for kernel stacks panic.
 *	Erich Boleyn		:	MP v1.4 and additional changes.
 *	Matthias Sattler	:	Changes for 2.1 kernel map.
 *	Michel Lespinasse	:	Changes for 2.1 kernel map.
 *	Michael Chastain	:	Change trampoline.S to gnu as.
 *	Alan Cox		:	Dumb bug: 'B' step PPro's are fine
 *	Ingo Molnar		:	Added APIC timers, based on code
 *					from Jose Renau
 *	Ingo Molnar		:	various cleanups and rewrites
 *	Tigran Aivazian		:	fixed "0.00 in /proc/uptime on SMP" bug.
 *	Maciej W. Rozycki	:	Bits for genuine 82489DX APICs
 *	Andi Kleen		:	Changed for SMP boot into long mode.
 *	Martin J. Bligh		:	Added support for multi-quad systems
 *	Dave Jones		:	Report invalid combinations of Athlon CPUs.
 *	Rusty Russell		:	Hacked into shape for new "hotplug" boot process.
 *	Andi Kleen		:	Converted to new state machine.
 *	Ashok Raj		:	CPU hotplug support
 *	Glauber Costa		:	i386 and x86_64 integration
 */

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/init.h>
#include <linux/smp.h>
#include <linux/export.h>
#include <linux/sched.h>
#include <linux/sched/topology.h>
#include <linux/sched/hotplug.h>
#include <linux/sched/task_stack.h>
#include <linux/percpu.h>
#include <linux/memblock.h>
#include <linux/err.h>
#include <linux/nmi.h>
#include <linux/tboot.h>
#include <linux/gfp.h>
#include <linux/cpuidle.h>
#include <linux/kexec.h>
#include <linux/numa.h>
#include <linux/pgtable.h>
#include <linux/overflow.h>
#include <linux/stackprotector.h>
#include <linux/cpuhotplug.h>
#include <linux/mc146818rtc.h>
#include <linux/acpi.h>

#include <asm/acpi.h>
#include <asm/cacheinfo.h>
#include <asm/desc.h>
#include <asm/nmi.h>
#include <asm/irq.h>
#include <asm/realmode.h>
#include <asm/cpu.h>
#include <asm/numa.h>
#include <asm/tlbflush.h>
#include <asm/mtrr.h>
#include <asm/mwait.h>
#include <asm/apic.h>
#include <asm/io_apic.h>
#include <asm/fpu/api.h>
#include <asm/setup.h>
#include <asm/uv/uv.h>
#include <asm/microcode.h>
#include <asm/i8259.h>
#include <asm/misc.h>
#include <asm/qspinlock.h>
#include <asm/intel-family.h>
#include <asm/cpu_device_id.h>
#include <asm/spec-ctrl.h>
#include <asm/hw_irq.h>
#include <asm/stackprotector.h>
#include <asm/sev.h>

/* representing HT siblings of each logical CPU */
DEFINE_PER_CPU_READ_MOSTLY(cpumask_var_t, cpu_sibling_map);
EXPORT_PER_CPU_SYMBOL(cpu_sibling_map);

/* representing HT and core siblings of each logical CPU */
DEFINE_PER_CPU_READ_MOSTLY(cpumask_var_t, cpu_core_map);
EXPORT_PER_CPU_SYMBOL(cpu_core_map);

/* representing HT, core, and die siblings of each logical CPU */
DEFINE_PER_CPU_READ_MOSTLY(cpumask_var_t, cpu_die_map);
EXPORT_PER_CPU_SYMBOL(cpu_die_map);

/* Per CPU bogomips and other parameters */
DEFINE_PER_CPU_READ_MOSTLY(struct cpuinfo_x86, cpu_info);
EXPORT_PER_CPU_SYMBOL(cpu_info);

/* CPUs which are the primary SMT threads */
struct cpumask __cpu_primary_thread_mask __read_mostly;

/* Representing CPUs for which sibling maps can be computed */
static cpumask_var_t cpu_sibling_setup_mask;

struct mwait_cpu_dead {
	unsigned int	control;
	unsigned int	status;
};

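/*
 * Handshake words for the kexec() vs. mwait_play_dead() rendezvous: a CPU
 * parked in mwait_play_dead() monitors its mwait_cpu_dead cache line with
 * control set to CPUDEAD_MWAIT_WAIT. The kexec path writes
 * CPUDEAD_MWAIT_KEXEC_HLT into control, which brings the CPU out of MWAIT;
 * the CPU acknowledges through status and parks in HLT instead, so it can
 * no longer wake up on soon-to-be-overwritten kernel text and data.
 */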
#define CPUDEAD_MWAIT_WAIT		0xDEADBEEF
#define CPUDEAD_MWAIT_KEXEC_HLT		0x4A17DEAD

/*
 * Cache line aligned data for mwait_play_dead(). Separate on purpose so
 * that it's unlikely to be touched by other CPUs.
 */
static DEFINE_PER_CPU_ALIGNED(struct mwait_cpu_dead, mwait_cpu_dead);

/* Logical package management. We might want to allocate that dynamically */
unsigned int __max_logical_packages __read_mostly;
EXPORT_SYMBOL(__max_logical_packages);
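/*
 * Running count of enumerated logical packages/dies, checked against the
 * precomputed __max_logical_packages as CPUs are brought up.
 */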
static unsigned int logical_packages __read_mostly;
static unsigned int logical_die __read_mostly;

/* Maximum number of SMT threads on any online core */
int __read_mostly __max_smt_threads = 1;

/* Flag to indicate if a complete sched domain rebuild is required */
bool x86_topology_update;

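/*
 * Scheduler hook: report (and consume) whether the topology changed since
 * the last call, so the caller knows a sched domain rebuild is needed.
 */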
int arch_update_cpu_topology(void)
{
	int retval = x86_topology_update;

	x86_topology_update = false;
	return retval;
}

static unsigned int smpboot_warm_reset_vector_count;

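/*
 * CMOS register 0x0f is the BIOS shutdown status byte; writing 0xa there
 * selects "warm reset via the 40:67 vector", and the real-mode
 * segment:offset stored at that vector (TRAMPOLINE_PHYS_HIGH/LOW) is
 * pointed at the trampoline entry passed in start_eip. The refcount lets
 * several CPUs be brought up before the vector is restored.
 */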
static inline void smpboot_setup_warm_reset_vector(unsigned long start_eip)
{
	unsigned long flags;

	spin_lock_irqsave(&rtc_lock, flags);
	if (!smpboot_warm_reset_vector_count++) {
		CMOS_WRITE(0xa, 0xf);
		*((volatile unsigned short *)phys_to_virt(TRAMPOLINE_PHYS_HIGH)) = start_eip >> 4;
		*((volatile unsigned short *)phys_to_virt(TRAMPOLINE_PHYS_LOW)) = start_eip & 0xf;
	}
	spin_unlock_irqrestore(&rtc_lock, flags);
}

static inline void smpboot_restore_warm_reset_vector(void)
{
	unsigned long flags;

	/*
	 * Paranoid: Set warm reset code and vector here back
	 * to default values.
	 */
	spin_lock_irqsave(&rtc_lock, flags);
	if (!--smpboot_warm_reset_vector_count) {
		CMOS_WRITE(0, 0xf);
		*((volatile u32 *)phys_to_virt(TRAMPOLINE_PHYS_LOW)) = 0;
	}
	spin_unlock_irqrestore(&rtc_lock, flags);
}

/* Run the next set of setup steps for the upcoming CPU */
static void ap_starting(void)
{
	int cpuid = smp_processor_id();

	/* Mop up eventual mwait_play_dead() wreckage */
	this_cpu_write(mwait_cpu_dead.status, 0);
	this_cpu_write(mwait_cpu_dead.control, 0);

	/*
	 * If woken up by an INIT in an 82489DX configuration the alive
	 * synchronization guarantees that the CPU does not reach this
	 * point before an INIT_deassert IPI reaches the local APIC, so it
	 * is now safe to touch the local APIC.
	 *
	 * Set up this CPU, first the APIC, which is probably redundant on
	 * most boards.
	 */
	apic_ap_setup();

	/* Save the processor parameters. */
	smp_store_cpu_info(cpuid);

	/*
	 * The topology information must be up to date before
	 * notify_cpu_starting().
	 */
	set_cpu_sibling_map(cpuid);

	ap_init_aperfmperf();
|
x86, sched: Add support for frequency invariance
Implement arch_scale_freq_capacity() for 'modern' x86. This function
is used by the scheduler to correctly account usage in the face of
DVFS.
The present patch addresses Intel processors specifically and has positive
performance and performance-per-watt implications for the schedutil cpufreq
governor, bringing it closer to, if not on-par with, the powersave governor
from the intel_pstate driver/framework.
Large performance gains are obtained when the machine is lightly loaded and
no regression are observed at saturation. The benchmarks with the largest
gains are kernel compilation, tbench (the networking version of dbench) and
shell-intensive workloads.
1. FREQUENCY INVARIANCE: MOTIVATION
* Without it, a task looks larger if the CPU runs slower
2. PECULIARITIES OF X86
* freq invariance accounting requires knowing the ratio freq_curr/freq_max
2.1 CURRENT FREQUENCY
* Use delta_APERF / delta_MPERF * freq_base (a.k.a "BusyMHz")
2.2 MAX FREQUENCY
* It varies with time (turbo). As an approximation, we set it to a
constant, i.e. 4-cores turbo frequency.
3. EFFECTS ON THE SCHEDUTIL FREQUENCY GOVERNOR
* The invariant schedutil's formula has no feedback loop and reacts faster
to utilization changes
4. KNOWN LIMITATIONS
* In some cases tasks can't reach max util despite how hard they try
5. PERFORMANCE TESTING
5.1 MACHINES
* Skylake, Broadwell, Haswell
5.2 SETUP
* baseline Linux v5.2 w/ non-invariant schedutil. Tested freq_max = 1-2-3-4-8-12
active cores turbo w/ invariant schedutil, and intel_pstate/powersave
5.3 BENCHMARK RESULTS
5.3.1 NEUTRAL BENCHMARKS
* NAS Parallel Benchmark (HPC), hackbench
5.3.2 NON-NEUTRAL BENCHMARKS
* tbench (10-30% better), kernbench (10-15% better),
shell-intensive-scripts (30-50% better)
* no regressions
5.3.3 SELECTION OF DETAILED RESULTS
5.3.4 POWER CONSUMPTION, PERFORMANCE-PER-WATT
* dbench (5% worse on one machine), kernbench (3% worse),
tbench (5-10% better), shell-intensive-scripts (10-40% better)
6. MICROARCH'ES ADDRESSED HERE
* Xeon Core before Scalable Performance processors line (Xeon Gold/Platinum
etc have different MSRs semantic for querying turbo levels)
7. REFERENCES
* MMTests performance testing framework, github.com/gormanm/mmtests
+-------------------------------------------------------------------------+
| 1. FREQUENCY INVARIANCE: MOTIVATION
+-------------------------------------------------------------------------+
For example; suppose a CPU has two frequencies: 500 and 1000 Mhz. When
running a task that would consume 1/3rd of a CPU at 1000 MHz, it would
appear to consume 2/3rd (or 66.6%) when running at 500 MHz, giving the
false impression this CPU is almost at capacity, even though it can go
faster [*]. In a nutshell, without frequency scale-invariance tasks look
larger just because the CPU is running slower.
[*] (footnote: this assumes a linear frequency/performance relation; which
everybody knows to be false, but given realities its the best approximation
we can make.)
+-------------------------------------------------------------------------+
| 2. PECULIARITIES OF X86
+-------------------------------------------------------------------------+
Accounting for frequency changes in PELT signals requires the computation of
the ratio freq_curr / freq_max. On x86 neither of those terms is readily
available.
2.1 CURRENT FREQUENCY
====================
Since modern x86 has hardware control over the actual frequency we run
at (because amongst other things, Turbo-Mode), we cannot simply use
the frequency as requested through cpufreq.
Instead we use the APERF/MPERF MSRs to compute the effective frequency
over the recent past. Also, because reading MSRs is expensive, don't
do so every time we need the value, but amortize the cost by doing it
every tick.
2.2 MAX FREQUENCY
=================
Obtaining freq_max is also non-trivial because at any time the hardware can
provide a frequency boost to a selected subset of cores if the package has
enough power to spare (eg: Turbo Boost). This means that the maximum frequency
available to a given core changes with time.
The approach taken in this change is to arbitrarily set freq_max to a constant
value at boot. The value chosen is the "4-cores (4C) turbo frequency" on most
microarchitectures, after evaluating the following candidates:
* 1-core (1C) turbo frequency (the fastest turbo state available)
* around base frequency (a.k.a. max P-state)
* something in between, such as 4C turbo
To interpret these options, consider that this is the denominator in
freq_curr/freq_max, and that ratio will be used to scale PELT signals such as
util_avg and load_avg. A large denominator will undershoot (util_avg looks a
bit smaller than it really is), viceversa with a smaller denominator PELT
signals will tend to overshoot. Given that PELT drives frequency selection
in the schedutil governor, we will have:
freq_max set to | effect on DVFS
--------------------+------------------
1C turbo | power efficiency (lower freq choices)
base freq | performance (higher util_avg, higher freq requests)
4C turbo | a bit of both
4C turbo proves to be a good compromise in a number of benchmarks (see below).
+-------------------------------------------------------------------------+
| 3. EFFECTS ON THE SCHEDUTIL FREQUENCY GOVERNOR
+-------------------------------------------------------------------------+
Once an architecture implements a frequency scale-invariant utilization (the
PELT signal util_avg), schedutil switches its frequency selection formula from
freq_next = 1.25 * freq_curr * util [non-invariant util signal]
to
freq_next = 1.25 * freq_max * util [invariant util signal]
where, in the second formula, freq_max is set to the 1C turbo frequency (max
turbo). The advantage of the second formula, whose usage we unlock with this
patch, is that freq_next doesn't depend on the current frequency in an
iterative fashion, but can jump to any frequency in a single update. This
absence of feedback in the formula makes it quicker to react to utilization
changes and more robust against pathological instabilities.
Compare it to the update formula of intel_pstate/powersave:
freq_next = 1.25 * freq_max * Busy%
where again freq_max is 1C turbo and Busy% is the percentage of time not spent
idling (calculated with delta_MPERF / delta_TSC); essentially the same as
invariant schedutil, and largely responsible for intel_pstate/powersave good
reputation. The non-invariant schedutil formula is derived from the invariant
one by approximating util_inv with util_raw * freq_curr / freq_max, but this
has limitations.
Testing shows improved performances due to better frequency selections when
the machine is lightly loaded, and essentially no change in behaviour at
saturation / overutilization.
+-------------------------------------------------------------------------+
| 4. KNOWN LIMITATIONS
+-------------------------------------------------------------------------+
It's been shown that it is possible to create pathological scenarios where a
CPU-bound task cannot reach max utilization, if the normalizing factor
freq_max is fixed to a constant value (see [Lelli-2018]).
If freq_max is set to 4C turbo as we do here, one needs to peg at least 5
cores in a package doing some busywork, and observe that none of those task
will ever reach max util (1024) because they're all running at less than the
4C turbo frequency.
While this concern still applies, we believe the performance benefit of
frequency scale-invariant PELT signals outweights the cost of this limitation.
[Lelli-2018]
https://lore.kernel.org/lkml/20180517150418.GF22493@localhost.localdomain/
+-------------------------------------------------------------------------+
| 5. PERFORMANCE TESTING
+-------------------------------------------------------------------------+
5.1 MACHINES
============
We tested the patch on three machines, with Skylake, Broadwell and Haswell
CPUs. The details are below, together with the available turbo ratios as
reported by the appropriate MSRs.
* 8x-SKYLAKE-UMA:
Single socket E3-1240 v5, Skylake 4 cores/8 threads
Max EFFiciency, BASE frequency and available turbo levels (MHz):
EFFIC 800 |********
BASE 3500 |***********************************
4C 3700 |*************************************
3C 3800 |**************************************
2C 3900 |***************************************
1C 3900 |***************************************
* 80x-BROADWELL-NUMA:
Two sockets E5-2698 v4, 2x Broadwell 20 cores/40 threads
Max EFFiciency, BASE frequency and available turbo levels (MHz):
EFFIC 1200 |************
BASE 2200 |**********************
8C 2900 |*****************************
7C 3000 |******************************
6C 3100 |*******************************
5C 3200 |********************************
4C 3300 |*********************************
3C 3400 |**********************************
2C 3600 |************************************
1C 3600 |************************************
* 48x-HASWELL-NUMA
Two sockets E5-2670 v3, 2x Haswell 12 cores/24 threads
Max EFFiciency, BASE frequency and available turbo levels (MHz):
EFFIC 1200 |************
BASE 2300 |***********************
12C 2600 |**************************
11C 2600 |**************************
10C 2600 |**************************
9C 2600 |**************************
8C 2600 |**************************
7C 2600 |**************************
6C 2600 |**************************
5C 2700 |***************************
4C 2800 |****************************
3C 2900 |*****************************
2C 3100 |*******************************
1C 3100 |*******************************
5.2 SETUP
=========
* The baseline is Linux v5.2 with schedutil (non-invariant) and the intel_pstate
driver in passive mode.
* The rationale for choosing the various freq_max values to test have been to
try all the 1-2-3-4C turbo levels (note that 1C and 2C turbo are identical
on all machines), plus one more value closer to base_freq but still in the
turbo range (8C turbo for both 80x-BROADWELL-NUMA and 48x-HASWELL-NUMA).
* In addition we've run all tests with intel_pstate/powersave for comparison.
* The filesystem is always XFS, the userspace is openSUSE Leap 15.1.
* 8x-SKYLAKE-UMA is capable of HWP (Hardware-Managed P-States), so the runs
with active intel_pstate on this machine use that.
This gives, in terms of combinations tested on each machine:
* 8x-SKYLAKE-UMA
* Baseline: Linux v5.2, non-invariant schedutil, intel_pstate passive
* intel_pstate active + powersave + HWP
* invariant schedutil, freq_max = 1C turbo
* invariant schedutil, freq_max = 3C turbo
* invariant schedutil, freq_max = 4C turbo
* both 80x-BROADWELL-NUMA and 48x-HASWELL-NUMA
* [same as 8x-SKYLAKE-UMA, but no HWP capable]
* invariant schedutil, freq_max = 8C turbo
(which on 48x-HASWELL-NUMA is the same as 12C turbo, or "all cores turbo")
5.3 BENCHMARK RESULTS
=====================
5.3.1 NEUTRAL BENCHMARKS
------------------------
Tests that didn't show any measurable difference in performance on any of the
test machines between non-invariant schedutil and our patch are:
* NAS Parallel Benchmarks (NPB) using either MPI or openMP for IPC, any
computational kernel
* flexible I/O (FIO)
* hackbench (using threads or processes, and using pipes or sockets)
5.3.2 NON-NEUTRAL BENCHMARKS
----------------------------
What follow are summary tables where each benchmark result is given a score.
* A tilde (~) means a neutral result, i.e. no difference from baseline.
* Scores are computed with the ratio result_new / result_baseline, so a tilde
means a score of 1.00.
* The results in the score ratio are the geometric means of results running
the benchmark with different parameters (eg: for kernbench: using 1, 2, 4,
... number of processes; for pgbench: varying the number of clients, and so
on).
* The first three tables show higher-is-better kind of tests (i.e. measured in
operations/second), the subsequent three show lower-is-better kind of tests
(i.e. the workload is fixed and we measure elapsed time, think kernbench).
* "gitsource" is a name we made up for the test consisting in running the
entire unit tests suite of the Git SCM and measuring how long it takes. We
take it as a typical example of shell-intensive serialized workload.
* In the "I_PSTATE" column we have the results for intel_pstate/powersave. Other
columns show invariant schedutil for different values of freq_max. 4C turbo
is circled as it's the value we've chosen for the final implementation.
80x-BROADWELL-NUMA (comparison ratio; higher is better)
+------+
I_PSTATE 1C 3C | 4C | 8C
pgbench-ro 1.14 ~ ~ | 1.11 | 1.14
pgbench-rw ~ ~ ~ | ~ | ~
netperf-udp 1.06 ~ 1.06 | 1.05 | 1.07
netperf-tcp ~ 1.03 ~ | 1.01 | 1.02
tbench4 1.57 1.18 1.22 | 1.30 | 1.56
+------+
8x-SKYLAKE-UMA (comparison ratio; higher is better)
+------+
I_PSTATE/HWP 1C 3C | 4C |
pgbench-ro ~ ~ ~ | ~ |
pgbench-rw ~ ~ ~ | ~ |
netperf-udp ~ ~ ~ | ~ |
netperf-tcp ~ ~ ~ | ~ |
tbench4 1.30 1.14 1.14 | 1.16 |
+------+
48x-HASWELL-NUMA (comparison ratio; higher is better)
+------+
I_PSTATE 1C 3C | 4C | 12C
pgbench-ro 1.15 ~ ~ | 1.06 | 1.16
pgbench-rw ~ ~ ~ | ~ | ~
netperf-udp 1.05 0.97 1.04 | 1.04 | 1.02
netperf-tcp 0.96 1.01 1.01 | 1.01 | 1.01
tbench4 1.50 1.05 1.13 | 1.13 | 1.25
+------+
In the table above we see that active intel_pstate is slightly better than our
4C-turbo patch (both in reference to the baseline non-invariant schedutil) on
read-only pgbench and much better on tbench. Both cases are notable in which
it shows that lowering our freq_max (to 8C-turbo and 12C-turbo on
80x-BROADWELL-NUMA and 48x-HASWELL-NUMA respectively) helps invariant
schedutil to get closer.
If we ignore active intel_pstate and focus on the comparison with baseline
alone, there are several instances of double-digit performance improvement.
80x-BROADWELL-NUMA (comparison ratio; lower is better)
+------+
I_PSTATE 1C 3C | 4C | 8C
dbench4 1.23 0.95 0.95 | 0.95 | 0.95
kernbench 0.93 0.83 0.83 | 0.83 | 0.82
gitsource 0.98 0.49 0.49 | 0.49 | 0.48
+------+
8x-SKYLAKE-UMA (comparison ratio; lower is better)
+------+
I_PSTATE/HWP 1C 3C | 4C |
dbench4 ~ ~ ~ | ~ |
kernbench ~ ~ ~ | ~ |
gitsource 0.92 0.55 0.55 | 0.55 |
+------+
48x-HASWELL-NUMA (comparison ratio; lower is better)
+------+
I_PSTATE 1C 3C | 4C | 8C
dbench4 ~ ~ ~ | ~ | ~
kernbench 0.94 0.90 0.89 | 0.90 | 0.90
gitsource 0.97 0.69 0.69 | 0.69 | 0.69
+------+
dbench is not very remarkable here, unless we notice how poorly active
intel_pstate is performing on 80x-BROADWELL-NUMA: 23% regression versus
non-invariant schedutil. We repeated that run getting consistent results. Out
of scope for the patch at hand, but deserving future investigation. Other than
that, we previously ran this campaign with Linux v5.0 and saw the patch doing
better on dbench a the time. We haven't checked closely and can only speculate
at this point.
On the NUMA boxes kernbench gets 10-15% improvements on average; we'll see in
the detailed tables that the gains concentrate on low process counts (lightly
loaded machines).
The test we call "gitsource" (running the git unit test suite, a long-running
single-threaded shell script) appears rather spectacular in this table (gains
of 30-50% depending on the machine). It is to be noted, however, that
gitsource has no adjustable parameters (such as the number of jobs in
kernbench, which we average over in order to get a single-number summary
score) and is exactly the kind of low-parallelism workload that benefits the
most from this patch. When looking at the detailed tables of kernbench or
tbench4, at low process or client counts one can see similar numbers.
5.3.3 SELECTION OF DETAILED RESULTS
-----------------------------------
Machine : 48x-HASWELL-NUMA
Benchmark : tbench4 (i.e. dbench4 over the network, actually loopback)
Varying parameter : number of clients
Unit : MB/sec (higher is better)
5.2.0 vanilla (BASELINE) 5.2.0 intel_pstate 5.2.0 1C-turbo
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
Hmean 1 126.73 +- 0.31% ( ) 315.91 +- 0.66% ( 149.28%) 125.03 +- 0.76% ( -1.34%)
Hmean 2 258.04 +- 0.62% ( ) 614.16 +- 0.51% ( 138.01%) 269.58 +- 1.45% ( 4.47%)
Hmean 4 514.30 +- 0.67% ( ) 1146.58 +- 0.54% ( 122.94%) 533.84 +- 1.99% ( 3.80%)
Hmean 8 1111.38 +- 2.52% ( ) 2159.78 +- 0.38% ( 94.33%) 1359.92 +- 1.56% ( 22.36%)
Hmean 16 2286.47 +- 1.36% ( ) 3338.29 +- 0.21% ( 46.00%) 2720.20 +- 0.52% ( 18.97%)
Hmean 32 4704.84 +- 0.35% ( ) 4759.03 +- 0.43% ( 1.15%) 4774.48 +- 0.30% ( 1.48%)
Hmean 64 7578.04 +- 0.27% ( ) 7533.70 +- 0.43% ( -0.59%) 7462.17 +- 0.65% ( -1.53%)
Hmean 128 6998.52 +- 0.16% ( ) 6987.59 +- 0.12% ( -0.16%) 6909.17 +- 0.14% ( -1.28%)
Hmean 192 6901.35 +- 0.25% ( ) 6913.16 +- 0.10% ( 0.17%) 6855.47 +- 0.21% ( -0.66%)
5.2.0 3C-turbo 5.2.0 4C-turbo 5.2.0 12C-turbo
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
Hmean 1 128.43 +- 0.28% ( 1.34%) 130.64 +- 3.81% ( 3.09%) 153.71 +- 5.89% ( 21.30%)
Hmean 2 311.70 +- 6.15% ( 20.79%) 281.66 +- 3.40% ( 9.15%) 305.08 +- 5.70% ( 18.23%)
Hmean 4 641.98 +- 2.32% ( 24.83%) 623.88 +- 5.28% ( 21.31%) 906.84 +- 4.65% ( 76.32%)
Hmean 8 1633.31 +- 1.56% ( 46.96%) 1714.16 +- 0.93% ( 54.24%) 2095.74 +- 0.47% ( 88.57%)
Hmean 16 3047.24 +- 0.42% ( 33.27%) 3155.02 +- 0.30% ( 37.99%) 3634.58 +- 0.15% ( 58.96%)
Hmean 32 4734.31 +- 0.60% ( 0.63%) 4804.38 +- 0.23% ( 2.12%) 4674.62 +- 0.27% ( -0.64%)
Hmean 64 7699.74 +- 0.35% ( 1.61%) 7499.72 +- 0.34% ( -1.03%) 7659.03 +- 0.25% ( 1.07%)
Hmean 128 6935.18 +- 0.15% ( -0.91%) 6942.54 +- 0.10% ( -0.80%) 7004.85 +- 0.12% ( 0.09%)
Hmean 192 6901.62 +- 0.12% ( 0.00%) 6856.93 +- 0.10% ( -0.64%) 6978.74 +- 0.10% ( 1.12%)
This is one of the cases where the patch still can't surpass active
intel_pstate, not even when freq_max is as low as 12C-turbo. Otherwise, gains are
visible up to 16 clients and the saturated scenario is the same as baseline.
The scores in the summary table from the previous sections are ratios of
geometric means of the results over different clients, as seen in this table.
Machine : 80x-BROADWELL-NUMA
Benchmark : kernbench (kernel compilation)
Varying parameter : number of jobs
Unit : seconds (lower is better)
5.2.0 vanilla (BASELINE) 5.2.0 intel_pstate 5.2.0 1C-turbo
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
Amean 2 379.68 +- 0.06% ( ) 330.20 +- 0.43% ( 13.03%) 285.93 +- 0.07% ( 24.69%)
Amean 4 200.15 +- 0.24% ( ) 175.89 +- 0.22% ( 12.12%) 153.78 +- 0.25% ( 23.17%)
Amean 8 106.20 +- 0.31% ( ) 95.54 +- 0.23% ( 10.03%) 86.74 +- 0.10% ( 18.32%)
Amean 16 56.96 +- 1.31% ( ) 53.25 +- 1.22% ( 6.50%) 48.34 +- 1.73% ( 15.13%)
Amean 32 34.80 +- 2.46% ( ) 33.81 +- 0.77% ( 2.83%) 30.28 +- 1.59% ( 12.99%)
Amean 64 26.11 +- 1.63% ( ) 25.04 +- 1.07% ( 4.10%) 22.41 +- 2.37% ( 14.16%)
Amean 128 24.80 +- 1.36% ( ) 23.57 +- 1.23% ( 4.93%) 21.44 +- 1.37% ( 13.55%)
Amean 160 24.85 +- 0.56% ( ) 23.85 +- 1.17% ( 4.06%) 21.25 +- 1.12% ( 14.49%)
5.2.0 3C-turbo 5.2.0 4C-turbo 5.2.0 8C-turbo
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
Amean 2 284.08 +- 0.13% ( 25.18%) 283.96 +- 0.51% ( 25.21%) 285.05 +- 0.21% ( 24.92%)
Amean 4 153.18 +- 0.22% ( 23.47%) 154.70 +- 1.64% ( 22.71%) 153.64 +- 0.30% ( 23.24%)
Amean 8 87.06 +- 0.28% ( 18.02%) 86.77 +- 0.46% ( 18.29%) 86.78 +- 0.22% ( 18.28%)
Amean 16 48.03 +- 0.93% ( 15.68%) 47.75 +- 1.99% ( 16.17%) 47.52 +- 1.61% ( 16.57%)
Amean 32 30.23 +- 1.20% ( 13.14%) 30.08 +- 1.67% ( 13.57%) 30.07 +- 1.67% ( 13.60%)
Amean 64 22.59 +- 2.02% ( 13.50%) 22.63 +- 0.81% ( 13.32%) 22.42 +- 0.76% ( 14.12%)
Amean 128 21.37 +- 0.67% ( 13.82%) 21.31 +- 1.15% ( 14.07%) 21.17 +- 1.93% ( 14.63%)
Amean 160 21.68 +- 0.57% ( 12.76%) 21.18 +- 1.74% ( 14.77%) 21.22 +- 1.00% ( 14.61%)
The patch outperform active intel_pstate (and baseline) by a considerable
margin; the summary table from the previous section says 4C turbo and active
intel_pstate are 0.83 and 0.93 against baseline respectively, so 4C turbo is
0.83/0.93=0.89 against intel_pstate (~10% better on average). There is no
noticeable difference with regard to the value of freq_max.
Machine : 8x-SKYLAKE-UMA
Benchmark : gitsource (time to run the git unit test suite)
Varying parameter : none
Unit : seconds (lower is better)
5.2.0 vanilla 5.2.0 intel_pstate/hwp 5.2.0 1C-turbo
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
Amean 858.85 +- 1.16% ( ) 791.94 +- 0.21% ( 7.79%) 474.95 ( 44.70%)
5.2.0 3C-turbo 5.2.0 4C-turbo
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
Amean 475.26 +- 0.20% ( 44.66%) 474.34 +- 0.13% ( 44.77%)
In this test, which is of interest as representing shell-intensive
(i.e. fork-intensive) serialized workloads, invariant schedutil outperforms
intel_pstate/powersave by a whopping 40% margin.
5.3.4 POWER CONSUMPTION, PERFORMANCE-PER-WATT
---------------------------------------------
The following table shows average power consumption in watt for each
benchmark. Data comes from turbostat (package average), which in turn is read
from the RAPL interface on CPUs. We know the patch affects CPU frequencies so
it's reasonable to ignore other power consumers (such as memory or I/O). Also,
we don't have a power meter available in the lab so RAPL is the best we have.
turbostat sampled average power every 10 seconds for the entire duration of
each benchmark. We took all those values and averaged them (i.e. with don't
have detail on a per-parameter granularity, only on whole benchmarks).
80x-BROADWELL-NUMA (power consumption, watts)
+--------+
BASELINE I_PSTATE 1C 3C | 4C | 8C
pgbench-ro 130.01 142.77 131.11 132.45 | 134.65 | 136.84
pgbench-rw 68.30 60.83 71.45 71.70 | 71.65 | 72.54
dbench4 90.25 59.06 101.43 99.89 | 101.10 | 102.94
netperf-udp 65.70 69.81 66.02 68.03 | 68.27 | 68.95
netperf-tcp 88.08 87.96 88.97 88.89 | 88.85 | 88.20
tbench4 142.32 176.73 153.02 163.91 | 165.58 | 176.07
kernbench 92.94 101.95 114.91 115.47 | 115.52 | 115.10
gitsource 40.92 41.87 75.14 75.20 | 75.40 | 75.70
+--------+
8x-SKYLAKE-UMA (power consumption, watts)
+--------+
BASELINE I_PSTATE/HWP 1C 3C | 4C |
pgbench-ro 46.49 46.68 46.56 46.59 | 46.52 |
pgbench-rw 29.34 31.38 30.98 31.00 | 31.00 |
dbench4 27.28 27.37 27.49 27.41 | 27.38 |
netperf-udp 22.33 22.41 22.36 22.35 | 22.36 |
netperf-tcp 27.29 27.29 27.30 27.31 | 27.33 |
tbench4 41.13 45.61 43.10 43.33 | 43.56 |
kernbench 42.56 42.63 43.01 43.01 | 43.01 |
gitsource 13.32 13.69 17.33 17.30 | 17.35 |
+--------+
48x-HASWELL-NUMA (power consumption, watts)
+--------+
BASELINE I_PSTATE 1C 3C | 4C | 12C
pgbench-ro 128.84 136.04 129.87 132.43 | 132.30 | 134.86
pgbench-rw 37.68 37.92 37.17 37.74 | 37.73 | 37.31
dbench4 28.56 28.73 28.60 28.73 | 28.70 | 28.79
netperf-udp 56.70 60.44 56.79 57.42 | 57.54 | 57.52
netperf-tcp 75.49 75.27 75.87 76.02 | 76.01 | 75.95
tbench4 115.44 139.51 119.53 123.07 | 123.97 | 130.22
kernbench 83.23 91.55 95.58 95.69 | 95.72 | 96.04
gitsource 36.79 36.99 39.99 40.34 | 40.35 | 40.23
+--------+
Lower power consumption isn't necessarily better; it depends on what is done
with that energy. Here are tables with the ratio of performance-per-watt on
each machine and benchmark. Higher is always better; a tilde (~) means a
neutral ratio (i.e. 1.00).
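As a worked example of how these ratios are derived, take gitsource with the
4C-turbo configuration on 8x-SKYLAKE-UMA: runtime improves from 858.85s
(baseline) to 474.34s, a performance ratio of 858.85/474.34 ~= 1.81, while
package power rises from 13.32W to 17.35W, a power ratio of 17.35/13.32 ~= 1.30.
Performance-per-watt is therefore 1.81/1.30 ~= 1.39, matching (up to rounding)
the 1.40 reported for gitsource/4C in the 8x-SKYLAKE-UMA table below.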
80x-BROADWELL-NUMA (performance-per-watt ratios; higher is better)
+------+
I_PSTATE 1C 3C | 4C | 8C
pgbench-ro 1.04 1.06 0.94 | 1.07 | 1.08
pgbench-rw 1.10 0.97 0.96 | 0.96 | 0.97
dbench4 1.24 0.94 0.95 | 0.94 | 0.92
netperf-udp ~ 1.02 1.02 | ~ | 1.02
netperf-tcp ~ 1.02 ~ | ~ | 1.02
tbench4 1.26 1.10 1.06 | 1.12 | 1.26
kernbench 0.98 0.97 0.97 | 0.97 | 0.98
gitsource ~ 1.11 1.11 | 1.11 | 1.13
+------+
8x-SKYLAKE-UMA (performance-per-watt ratios; higher is better)
+------+
I_PSTATE/HWP 1C 3C | 4C |
pgbench-ro ~ ~ ~ | ~ |
pgbench-rw 0.95 0.97 0.96 | 0.96 |
dbench4 ~ ~ ~ | ~ |
netperf-udp ~ ~ ~ | ~ |
netperf-tcp ~ ~ ~ | ~ |
tbench4 1.17 1.09 1.08 | 1.10 |
kernbench ~ ~ ~ | ~ |
gitsource 1.06 1.40 1.40 | 1.40 |
+------+
48x-HASWELL-NUMA (performance-per-watt ratios; higher is better)
+------+
I_PSTATE 1C 3C | 4C | 12C
pgbench-ro 1.09 ~ 1.09 | 1.03 | 1.11
pgbench-rw ~ 0.86 ~ | ~ | 0.86
dbench4 ~ 1.02 1.02 | 1.02 | ~
netperf-udp ~ 0.97 1.03 | 1.02 | ~
netperf-tcp 0.96 ~ ~ | ~ | ~
tbench4 1.24 ~ 1.06 | 1.05 | 1.11
kernbench 0.97 0.97 0.98 | 0.97 | 0.96
gitsource 1.03 1.33 1.32 | 1.32 | 1.33
+------+
These results are overall pleasing: in plenty of cases we observe
performance-per-watt improvements. The few regressions (read/write pgbench and
dbench on the Broadwell machine) are of small magnitude. kernbench loses a few
percentage points (it has a 10-15% performance improvement, but apparently the
increase in power consumption is larger than that). tbench4 and gitsource, which
benefit the most from the patch, keep a positive score in this table, which is
a welcome surprise; that suggests that in those particular workloads the
non-invariant schedutil (and active intel_pstate, too) makes some rather
suboptimal frequency selections.
+-------------------------------------------------------------------------+
| 6. MICROARCH'ES ADDRESSED HERE
+-------------------------------------------------------------------------+
The patch addresses Xeon Core processors that use MSR_PLATFORM_INFO and
MSR_TURBO_RATIO_LIMIT to advertise their base frequency and turbo frequencies
respectively. This excludes the recent Xeon Scalable Performance processor
line (Xeon Gold, Platinum, etc.) whose MSRs have to be parsed differently.
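As an illustration only (this is not the patch's code; the helper name is made
up, the bit layout is the one documented in the Intel SDM, and the 100 MHz bus
clock is an assumption that holds for these parts), the two MSRs can be parsed
roughly as follows:
	/* Illustrative sketch; relies on <asm/msr.h> and <asm/msr-index.h>. */
	static void example_read_freqs(unsigned int *base_khz,
				       unsigned int *turbo_1c_khz)
	{
		u64 plat, turbo;

		rdmsrl(MSR_PLATFORM_INFO, plat);	/* 0xCE  */
		rdmsrl(MSR_TURBO_RATIO_LIMIT, turbo);	/* 0x1AD */

		/* PLATFORM_INFO bits 15:8: maximum non-turbo (base) ratio */
		*base_khz = ((plat >> 8) & 0xff) * 100000;
		/* TURBO_RATIO_LIMIT bits 7:0: max turbo ratio, 1 core active */
		*turbo_1c_khz = (turbo & 0xff) * 100000;
	}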
Subsequent patches will address:
* Xeon Scalable Performance processors and Atom Goldmont/Goldmont Plus
* Xeon Phi (Knights Landing, Knights Mill)
* Atom Silvermont
+-------------------------------------------------------------------------+
| 7. REFERENCES
+-------------------------------------------------------------------------+
Tests have been run with the help of the MMTests performance testing
framework, see github.com/gormanm/mmtests. The configuration file names for
the benchmark used are:
db-pgbench-timed-ro-small-xfs
db-pgbench-timed-rw-small-xfs
io-dbench4-async-xfs
network-netperf-unbound
network-tbench
scheduler-unbound
workload-kerndevel-xfs
workload-shellscripts-xfs
hpc-nas-c-class-mpi-full-xfs
hpc-nas-c-class-omp-full
All those benchmarks are generally available on the web:
pgbench: https://www.postgresql.org/docs/10/pgbench.html
netperf: https://hewlettpackard.github.io/netperf/
dbench/tbench: https://dbench.samba.org/
gitsource: git unit test suite, github.com/git/git
NAS Parallel Benchmarks: https://www.nas.nasa.gov/publications/npb.html
hackbench: https://people.redhat.com/mingo/cfs-scheduler/tools/hackbench.c
Suggested-by: Peter Zijlstra <peterz@infradead.org>
Signed-off-by: Giovanni Gherdovich <ggherdovich@suse.cz>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Acked-by: Doug Smythies <dsmythies@telus.net>
Acked-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
Link: https://lkml.kernel.org/r/20200122151617.531-2-ggherdovich@suse.cz
2020-01-22 23:16:12 +08:00
|
|
|
|
2008-07-22 03:35:38 +08:00
|
|
|
pr_debug("Stack at about %p\n", &cpuid);
|
2008-03-20 01:25:59 +08:00
|
|
|
|
2010-11-18 18:47:31 +08:00
|
|
|
wmb();
|
|
|
|
|
2023-05-13 05:07:11 +08:00
|
|
|
/*
|
|
|
|
* This runs the AP through all the cpuhp states to its target
|
|
|
|
* state CPUHP_ONLINE.
|
|
|
|
*/
|
2010-03-24 02:30:52 +08:00
|
|
|
notify_cpu_starting(cpuid);
|
2008-03-20 01:25:59 +08:00
|
|
|
}
|
2010-03-24 02:30:52 +08:00
|
|
|
|
2023-05-13 05:07:01 +08:00
|
|
|
static void ap_calibrate_delay(void)
|
|
|
|
{
|
2008-03-20 01:25:59 +08:00
|
|
|
/*
|
2023-05-13 05:07:01 +08:00
|
|
|
* Calibrate the delay loop and update loops_per_jiffy in cpu_data.
|
|
|
|
* smp_store_cpu_info() stored a value that is close but not as
|
|
|
|
* accurate as the value just calculated.
|
|
|
|
*
|
|
|
|
* As this is invoked after the TSC synchronization check,
|
|
|
|
* calibrate_delay_is_known() will skip the calibration routine
|
|
|
|
* when TSC is synchronized across sockets.
|
2008-03-20 01:25:59 +08:00
|
|
|
*/
|
2023-05-13 05:07:01 +08:00
|
|
|
calibrate_delay();
|
|
|
|
cpu_data(smp_processor_id()).loops_per_jiffy = loops_per_jiffy;
|
2008-03-20 01:25:59 +08:00
|
|
|
}
|
|
|
|
|
2008-03-20 01:26:00 +08:00
|
|
|
/*
|
|
|
|
* Activate a secondary processor.
|
|
|
|
*/
|
x86: delete __cpuinit usage from all x86 files
The __cpuinit type of throwaway sections might have made sense
some time ago when RAM was more constrained, but now the savings
do not offset the cost and complications. For example, the fix in
commit 5e427ec2d0 ("x86: Fix bit corruption at CPU resume time")
is a good example of the nasty type of bugs that can be created
with improper use of the various __init prefixes.
After a discussion on LKML[1] it was decided that cpuinit should go
the way of devinit and be phased out. Once all the users are gone,
we can then finally remove the macros themselves from linux/init.h.
Note that some harmless section mismatch warnings may result, since
notify_cpu_starting() and cpu_up() are arch independent (kernel/cpu.c) and
are flagged as __cpuinit -- so if we remove the __cpuinit from
arch specific callers, we will also get section mismatch warnings.
As an intermediate step, we intend to turn the linux/init.h cpuinit
content into no-ops as early as possible, since that will get rid
of these warnings. In any case, they are temporary and harmless.
This removes all the arch/x86 uses of the __cpuinit macros from
all C files. x86 only had the one __CPUINIT used in assembly files,
and it wasn't paired off with a .previous or a __FINIT, so we can
delete it directly w/o any corresponding additional change there.
[1] https://lkml.org/lkml/2013/5/20/589
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: x86@kernel.org
Acked-by: Ingo Molnar <mingo@kernel.org>
Acked-by: Thomas Gleixner <tglx@linutronix.de>
Acked-by: H. Peter Anvin <hpa@linux.intel.com>
Signed-off-by: Paul Gortmaker <paul.gortmaker@windriver.com>
2013-06-19 06:23:59 +08:00
|
|
|
static void notrace start_secondary(void *unused)
|
2008-03-20 01:26:00 +08:00
|
|
|
{
|
|
|
|
/*
|
2017-09-11 08:48:27 +08:00
|
|
|
* Don't put *anything* except direct CPU state initialization
|
|
|
|
* before cpu_init(). SMP booting is so fragile that we want to
|
|
|
|
* limit the things done here to the most necessary things.
|
2008-03-20 01:26:00 +08:00
|
|
|
*/
|
2019-07-11 03:42:46 +08:00
|
|
|
cr4_init();
|
2012-11-14 20:36:53 +08:00
|
|
|
|
2023-05-13 05:07:56 +08:00
|
|
|
/*
|
|
|
|
* 32-bit specific. 64-bit reaches this code with the correct page
|
|
|
|
* table established. Yet another historical divergence.
|
|
|
|
*/
|
|
|
|
if (IS_ENABLED(CONFIG_X86_32)) {
|
|
|
|
/* switch away from the initial page table */
|
|
|
|
load_cr3(swapper_pg_dir);
|
|
|
|
__flush_tlb_all();
|
|
|
|
}
|
|
|
|
|
2023-05-13 05:07:12 +08:00
|
|
|
cpu_init_exception_handling();
|
|
|
|
|
2023-05-13 05:07:11 +08:00
|
|
|
/*
|
x86/microcode/32: Move early loading after paging enable
Upstream commit: 0b62f6cb07738d7211d926c39f6946b87f72e792
Conflict: none
Backport-reason: Microcode restructuring backport.
32-bit loads microcode before paging is enabled. The commit which
introduced that has zero justification in the changelog. The cover
letter has slightly more content, but it does not give any technical
justification either:
"The problem in current microcode loading method is that we load a
microcode way, way too late; ideally we should load it before turning
paging on. This may only be practical on 32 bits since we can't get
to 64-bit mode without paging on, but we should still do it as early
as at all possible."
Handwaving word salad with zero technical content.
Someone claimed in an offlist conversation that this is required for
curing the ATOM erratum AAE44/AAF40/AAG38/AAH41. That erratum requires
a microcode update in order to make the usage of PSE safe. But during
early boot, PSE is completely irrelevant and it is evaluated way later.
Neither is it relevant for the AP on single core HT enabled CPUs as the
microcode loading on the AP is not doing anything.
On dual core CPUs there is a theoretical problem if a split of an
executable large page between enabling paging including PSE and loading
the microcode happens. But that's only theoretical, it's practically
irrelevant because the affected dual core CPUs are 64bit enabled and
therefore have paging and PSE enabled before loading the microcode on
the second core. So why would it work on 64-bit but not on 32-bit?
The erratum:
"AAG38 Code Fetch May Occur to Incorrect Address After a Large Page is
Split Into 4-Kbyte Pages
Problem: If software clears the PS (page size) bit in a present PDE
(page directory entry), that will cause linear addresses mapped through
this PDE to use 4-KByte pages instead of using a large page after old
TLB entries are invalidated. Due to this erratum, if a code fetch uses
this PDE before the TLB entry for the large page is invalidated then it
may fetch from a different physical address than specified by either the
old large page translation or the new 4-KByte page translation. This
erratum may also cause speculative code fetches from incorrect addresses."
The practical relevance for this is exactly zero because there is no
splitting of large text pages during early boot-time, i.e. between paging
enable and microcode loading, and neither during CPU hotplug.
IOW, this load microcode before paging enable is yet another voodoo
programming solution in search of a problem. What's worse is that it causes
at least two serious problems:
1) When stackprotector is enabled, the microcode loader code has the
stackprotector mechanics enabled. The read from the per CPU variable
__stack_chk_guard is always accessing the virtual address either
directly on UP or via %fs on SMP. In physical address mode this
results in an access to memory above 3GB. So this works by chance as
the hardware returns the same value when there is no RAM at this
physical address. When there is RAM populated above 3G then the read
is by chance the same as nothing changes that memory during the very
early boot stage. That's not necessarily true during runtime CPU
hotplug.
2) When function tracing is enabled, the relevant microcode loader
functions and the functions invoked from there will call into the
tracing code and evaluate global and per CPU variables in physical
address mode. What could potentially go wrong?
Cure this and move the microcode loading after the early paging enable, use
the new temporary initrd mapping and remove the gunk in the microcode
loader which is required to handle physical address mode.
Intel-SIG: commit 0b62f6cb0773 x86/microcode/32: Move early loading after paging enable.
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Borislav Petkov (AMD) <bp@alien8.de>
Link: https://lore.kernel.org/r/20231017211722.348298216@linutronix.de
[ Aubrey Li: amend commit log ]
Signed-off-by: Aubrey Li <aubrey.li@linux.intel.com>
2023-10-18 05:23:32 +08:00
|
|
|
* Load the microcode before reaching the AP alive synchronization
|
|
|
|
* point below so it is not part of the full per CPU serialized
|
|
|
|
* bringup part when "parallel" bringup is enabled.
|
2023-05-13 05:07:56 +08:00
|
|
|
*
|
|
|
|
* That's even safe when hyperthreading is enabled in the CPU as
|
|
|
|
* the core code starts the primary threads first and leaves the
|
|
|
|
* secondary threads waiting for SIPI. Loading microcode on
|
|
|
|
* physical cores concurrently is a safe operation.
|
|
|
|
*
|
|
|
|
* This covers both the Intel specific issue that concurrent
|
|
|
|
* microcode loading on SMT siblings must be prohibited and the
|
|
|
|
* vendor independent issue that microcode loading which changes
|
|
|
|
* CPUID, MSRs etc. must be strictly serialized to maintain
|
|
|
|
* software state correctness.
|
|
|
|
*/
|
2023-10-18 05:23:32 +08:00
|
|
|
load_ucode_ap();
|
2023-05-13 05:07:56 +08:00
|
|
|
|
|
|
|
/*
|
|
|
|
* Synchronization point with the hotplug core. Sets this CPU's
|
|
|
|
* synchronization state to ALIVE and spin-waits for the control CPU to
|
2023-05-13 05:07:29 +08:00
|
|
|
* release this CPU for further bringup.
|
2023-05-13 05:07:11 +08:00
|
|
|
*/
|
2023-05-13 05:07:29 +08:00
|
|
|
cpuhp_ap_sync_alive();
|
2023-05-13 05:07:12 +08:00
|
|
|
|
|
|
|
cpu_init();
|
A large update for SMP management:
- Parallel CPU bringup
The reason why people are interested in parallel bringup is to shorten
the (kexec) reboot time of cloud servers to reduce the downtime of the
VM tenants.
The current fully serialized bringup does the following per AP:
1) Prepare callbacks (allocate, initialize, create threads)
2) Kick the AP alive (e.g. INIT/SIPI on x86)
3) Wait for the AP to report alive state
4) Let the AP continue through the atomic bringup
5) Let the AP run the threaded bringup to full online state
There are two significant delays:
#3 The time for an AP to report alive state in start_secondary() on
x86 has been measured in the range between 350us and 3.5ms
depending on vendor and CPU type, BIOS microcode size etc.
#4 The atomic bringup does the microcode update. This has been
measured to take up to ~8ms on the primary threads depending on
the microcode patch size to apply.
On a two socket SKL server with 56 cores (112 threads) the boot CPU
spends on current mainline about 800ms busy waiting for the APs to come
up and apply microcode. That's more than 80% of the actual onlining
procedure.
This can be reduced significantly by splitting the bringup mechanism
into two parts:
1) Run the prepare callbacks and kick the AP alive for each AP which
needs to be brought up.
The APs wake up, do their firmware initialization and run the low
level kernel startup code including microcode loading in parallel
up to the first synchronization point. (#1 and #2 above)
2) Run the rest of the bringup code strictly serialized per CPU
(#3 - #5 above) as it's done today.
Parallelizing that stage of the CPU bringup might be possible in
theory, but it's questionable whether required surgery would be
justified for a pretty small gain.
If the system is large enough the first AP is already waiting at the
first synchronization point when the boot CPU finished the wake-up of
the last AP. That reduces the AP bringup time on that SKL from ~800ms
to ~80ms, i.e. by a factor ~10x.
The actual gain varies wildly depending on the system, CPU, microcode
patch size and other factors. There are some opportunities to reduce
the overhead further, but that needs some deep surgery in the x86 CPU
bringup code.
For now this is only enabled on x86, but the core functionality
obviously works for all SMP capable architectures.
- Enhancements for SMP function call tracing so it is possible to locate
the scheduling and the actual execution points. That allows to measure
IPI delivery time precisely.
Merge tag 'smp-core-2023-06-26' of ssh://gitolite.kernel.org/pub/scm/linux/kernel/git/tip/tip
Pull SMP updates from Thomas Gleixner:
"A large update for SMP management:
- Parallel CPU bringup
The reason why people are interested in parallel bringup is to
shorten the (kexec) reboot time of cloud servers to reduce the
downtime of the VM tenants.
The current fully serialized bringup does the following per AP:
1) Prepare callbacks (allocate, initialize, create threads)
2) Kick the AP alive (e.g. INIT/SIPI on x86)
3) Wait for the AP to report alive state
4) Let the AP continue through the atomic bringup
5) Let the AP run the threaded bringup to full online state
There are two significant delays:
#3 The time for an AP to report alive state in start_secondary()
on x86 has been measured in the range between 350us and 3.5ms
depending on vendor and CPU type, BIOS microcode size etc.
#4 The atomic bringup does the microcode update. This has been
measured to take up to ~8ms on the primary threads depending
on the microcode patch size to apply.
On a two socket SKL server with 56 cores (112 threads) the boot CPU
spends on current mainline about 800ms busy waiting for the APs to
come up and apply microcode. That's more than 80% of the actual
onlining procedure.
This can be reduced significantly by splitting the bringup
mechanism into two parts:
1) Run the prepare callbacks and kick the AP alive for each AP
which needs to be brought up.
The APs wake up, do their firmware initialization and run the
low level kernel startup code including microcode loading in
parallel up to the first synchronization point. (#1 and #2
above)
2) Run the rest of the bringup code strictly serialized per CPU
(#3 - #5 above) as it's done today.
Parallelizing that stage of the CPU bringup might be possible
in theory, but it's questionable whether required surgery
would be justified for a pretty small gain.
If the system is large enough the first AP is already waiting at
the first synchronization point when the boot CPU finished the
wake-up of the last AP. That reduces the AP bringup time on that
SKL from ~800ms to ~80ms, i.e. by a factor ~10x.
The actual gain varies wildly depending on the system, CPU,
microcode patch size and other factors. There are some
opportunities to reduce the overhead further, but that needs some
deep surgery in the x86 CPU bringup code.
For now this is only enabled on x86, but the core functionality
obviously works for all SMP capable architectures.
- Enhancements for SMP function call tracing so it is possible to
locate the scheduling and the actual execution points. That allows
to measure IPI delivery time precisely"
* tag 'smp-core-2023-06-26' of ssh://gitolite.kernel.org/pub/scm/linux/kernel/git/tip/tip: (45 commits)
trace,smp: Add tracepoints for scheduling remotelly called functions
trace,smp: Add tracepoints around remotelly called functions
MAINTAINERS: Add CPU HOTPLUG entry
x86/smpboot: Fix the parallel bringup decision
x86/realmode: Make stack lock work in trampoline_compat()
x86/smp: Initialize cpu_primary_thread_mask late
cpu/hotplug: Fix off by one in cpuhp_bringup_mask()
x86/apic: Fix use of X{,2}APIC_ENABLE in asm with older binutils
x86/smpboot/64: Implement arch_cpuhp_init_parallel_bringup() and enable it
x86/smpboot: Support parallel startup of secondary CPUs
x86/smpboot: Implement a bit spinlock to protect the realmode stack
x86/apic: Save the APIC virtual base address
cpu/hotplug: Allow "parallel" bringup up to CPUHP_BP_KICK_AP_STATE
x86/apic: Provide cpu_primary_thread mask
x86/smpboot: Enable split CPU startup
cpu/hotplug: Provide a split up CPUHP_BRINGUP mechanism
cpu/hotplug: Reset task stack state in _cpu_up()
cpu/hotplug: Remove unused state functions
riscv: Switch to hotplug core state synchronization
parisc: Switch to hotplug core state synchronization
...
2023-06-27 04:59:56 +08:00
|
|
|
fpu__init_cpu();
|
2020-10-21 12:13:55 +08:00
|
|
|
rcu_cpu_starting(raw_smp_processor_id());
|
2017-09-18 00:03:51 +08:00
|
|
|
x86_cpuinit.early_percpu_clock_init();
|
|
|
|
|
2023-05-13 05:07:19 +08:00
|
|
|
ap_starting();
|
2017-09-18 00:03:51 +08:00
|
|
|
|
2023-05-13 05:07:17 +08:00
|
|
|
/* Check TSC synchronization with the control CPU. */
|
2023-05-13 05:07:01 +08:00
|
|
|
check_tsc_sync_target();
|
2017-09-18 00:03:51 +08:00
|
|
|
|
2008-03-20 01:26:00 +08:00
|
|
|
/*
|
2023-05-13 05:07:01 +08:00
|
|
|
* Calibrate the delay loop after the TSC synchronization check.
|
|
|
|
* This allows to skip the calibration when TSC is synchronized
|
|
|
|
* across sockets.
|
2008-03-20 01:26:00 +08:00
|
|
|
*/
|
2023-05-13 05:07:01 +08:00
|
|
|
ap_calibrate_delay();
|
2008-03-20 01:26:00 +08:00
|
|
|
|
2018-05-10 03:53:09 +08:00
|
|
|
speculative_store_bypass_ht_init();
|
|
|
|
|
2008-03-20 01:26:00 +08:00
|
|
|
/*
|
2017-09-14 05:29:40 +08:00
|
|
|
* Lock vector_lock, set CPU online and bring the vector
|
|
|
|
* allocator online. Online must be set with vector_lock held
|
|
|
|
* to prevent a concurrent irq setup/teardown from seeing a
|
|
|
|
* half valid vector space.
|
2008-03-20 01:26:00 +08:00
|
|
|
*/
|
2008-08-10 06:09:02 +08:00
|
|
|
lock_vector_lock();
|
2009-01-04 21:18:03 +08:00
|
|
|
set_cpu_online(smp_processor_id(), true);
|
2017-09-14 05:29:40 +08:00
|
|
|
lapic_online();
|
2008-08-10 06:09:02 +08:00
|
|
|
unlock_vector_lock();
|
2010-02-27 00:49:12 +08:00
|
|
|
x86_platform.nmi_init();
|
2008-03-20 01:26:00 +08:00
|
|
|
|
2008-09-07 17:29:58 +08:00
|
|
|
/* enable local interrupts */
|
|
|
|
local_irq_enable();
|
|
|
|
|
2009-08-19 18:35:53 +08:00
|
|
|
x86_cpuinit.setup_percpu_clockev();
|
2008-03-20 01:26:00 +08:00
|
|
|
|
|
|
|
wmb();
|
2016-02-27 02:43:40 +08:00
|
|
|
cpu_startup_entry(CPUHP_AP_ONLINE_IDLE);
|
2008-03-20 01:26:00 +08:00
|
|
|
}
|
|
|
|
|
2017-11-14 20:42:56 +08:00
|
|
|
/**
|
|
|
|
* topology_phys_to_logical_pkg - Map a physical package id to a logical package id
|
2023-05-13 05:06:58 +08:00
|
|
|
* @phys_pkg: The physical package id to map
|
2017-11-14 20:42:56 +08:00
|
|
|
*
|
|
|
|
* Returns logical package id or -1 if not found
|
|
|
|
*/
|
|
|
|
int topology_phys_to_logical_pkg(unsigned int phys_pkg)
|
|
|
|
{
|
|
|
|
int cpu;
|
|
|
|
|
|
|
|
for_each_possible_cpu(cpu) {
|
|
|
|
struct cpuinfo_x86 *c = &cpu_data(cpu);
|
|
|
|
|
|
|
|
if (c->initialized && c->phys_proc_id == phys_pkg)
|
|
|
|
return c->logical_proc_id;
|
|
|
|
}
|
|
|
|
return -1;
|
|
|
|
}
|
|
|
|
EXPORT_SYMBOL(topology_phys_to_logical_pkg);
|
2023-05-13 05:06:58 +08:00
|
|
|
|
2019-05-14 01:58:49 +08:00
|
|
|
/**
|
|
|
|
* topology_phys_to_logical_die - Map a physical die id to a logical die id
|
2023-05-13 05:06:58 +08:00
|
|
|
* @die_id: The physical die id to map
|
|
|
|
* @cur_cpu: The CPU for which the mapping is done
|
2019-05-14 01:58:49 +08:00
|
|
|
*
|
|
|
|
* Returns logical die id or -1 if not found
|
|
|
|
*/
|
2023-05-13 05:06:58 +08:00
|
|
|
static int topology_phys_to_logical_die(unsigned int die_id, unsigned int cur_cpu)
|
2019-05-14 01:58:49 +08:00
|
|
|
{
|
2023-05-13 05:06:58 +08:00
|
|
|
int cpu, proc_id = cpu_data(cur_cpu).phys_proc_id;
|
2019-05-14 01:58:49 +08:00
|
|
|
|
|
|
|
for_each_possible_cpu(cpu) {
|
|
|
|
struct cpuinfo_x86 *c = &cpu_data(cpu);
|
|
|
|
|
|
|
|
if (c->initialized && c->cpu_die_id == die_id &&
|
|
|
|
c->phys_proc_id == proc_id)
|
|
|
|
return c->logical_die_id;
|
|
|
|
}
|
|
|
|
return -1;
|
|
|
|
}
|
2017-11-14 20:42:56 +08:00
|
|
|
|
2016-12-12 18:04:53 +08:00
|
|
|
/**
|
|
|
|
* topology_update_package_map - Update the physical to logical package map
|
|
|
|
* @pkg: The physical package id as retrieved via CPUID
|
|
|
|
* @cpu: The cpu for which this is updated
|
|
|
|
*/
|
|
|
|
int topology_update_package_map(unsigned int pkg, unsigned int cpu)
|
2016-02-23 06:19:15 +08:00
|
|
|
{
|
2017-11-14 20:42:56 +08:00
|
|
|
int new;
|
2016-02-23 06:19:15 +08:00
|
|
|
|
2017-11-14 20:42:56 +08:00
|
|
|
/* Already available somewhere? */
|
|
|
|
new = topology_phys_to_logical_pkg(pkg);
|
|
|
|
if (new >= 0)
|
2016-02-23 06:19:15 +08:00
|
|
|
goto found;
|
|
|
|
|
x86/smp: Fix __max_logical_packages value setup
Frank reported kernel panic when he disabled several cores in BIOS
via following option:
Core Disable Bitmap(Hex) [0]
with number 0xFFE, which leaves 16 CPUs in system (out of 48).
The kernel panic below goes along with following messages:
smpboot: Max logical packages: 2
smpboot: APIC(0) Converting physical 0 to logical package 0
smpboot: APIC(20) Converting physical 1 to logical package 1
smpboot: APIC(40) Package 2 exceeds logical package map
smpboot: CPU 8 APICId 40 disabled
smpboot: APIC(60) Package 3 exceeds logical package map
smpboot: CPU 12 APICId 60 disabled
...
general protection fault: 0000 [#1] SMP
Modules linked in:
CPU: 15 PID: 1 Comm: swapper/0 Not tainted 4.7.0-rc5+ #1
Hardware name: SGI UV300/UV300, BIOS SGI UV 300 series BIOS 05/25/2016
task: ffff8801673e0000 ti: ffff8801673ac000 task.ti: ffff8801673ac000
RIP: 0010:[<ffffffff81014d54>] [<ffffffff81014d54>] uncore_change_context+0xd4/0x180
...
[<ffffffff810158ac>] uncore_event_init_cpu+0x6c/0x70
[<ffffffff81d8c91c>] intel_uncore_init+0x1c2/0x2dd
[<ffffffff81d8c75a>] ? uncore_cpu_setup+0x17/0x17
[<ffffffff81002190>] do_one_initcall+0x50/0x190
[<ffffffff810ab193>] ? parse_args+0x293/0x480
[<ffffffff81d87365>] kernel_init_freeable+0x1a5/0x249
[<ffffffff81d86a35>] ? set_debug_rodata+0x12/0x12
[<ffffffff816dc19e>] kernel_init+0xe/0x110
[<ffffffff816e93bf>] ret_from_fork+0x1f/0x40
[<ffffffff816dc190>] ? rest_init+0x80/0x80
The reason for the panic is wrong value of __max_logical_packages,
which lets logical_package_map uninitialized and the uncore code
relying on this map being properly initialized (maybe we should
add some safety checks there as well).
The __max_logical_packages is computed as:
DIV_ROUND_UP(total_cpus, ncpus);
- ncpus being number of cores
With the above BIOS setup we get total_cpus == 16, which sets
__max_logical_packages to 2 (ncpus is 12).
Once topology_update_package_map processes CPU with logical
pkg over 2 we display above messages and fail to initialize
the physical_to_logical_pkg map, which makes the uncore code
crash.
The fix is to remove logical_package_map bitmap completely
and keep and update the logical_packages number instead.
After we enumerate all the present CPUs, we check if the
enumerated logical packages count is within its computed
maximum from BIOS data.
If it's not the case, we set this maximum to the new enumerated
value and freeze any new addition of logical packages.
The freeze is because lot of init code like uncore/rapl/cqm
depends on having maximum logical package value set to allocate
their data, so we can't change it later on.
Prarit Bhargava tested the patch and confirms that it solves
the problem:
From dmidecode:
Core Count: 24
Core Enabled: 24
Thread Count: 48
Orig kernel boot log:
[ 0.464981] smpboot: Max logical packages: 19
[ 0.469861] smpboot: APIC(0) Converting physical 0 to logical package 0
[ 0.477261] smpboot: APIC(40) Converting physical 1 to logical package 1
[ 0.484760] smpboot: APIC(80) Converting physical 2 to logical package 2
[ 0.492258] smpboot: APIC(c0) Converting physical 3 to logical package 3
1. nr_cpus=8, should stop enumerating in package 0:
[ 0.533664] smpboot: APIC(0) Converting physical 0 to logical package 0
[ 0.539596] smpboot: Max logical packages: 19
2. max_cpus=8, should still enumerate all packages:
[ 0.526494] smpboot: APIC(0) Converting physical 0 to logical package 0
[ 0.532428] smpboot: APIC(40) Converting physical 1 to logical package 1
[ 0.538456] smpboot: APIC(80) Converting physical 2 to logical package 2
[ 0.544486] smpboot: APIC(c0) Converting physical 3 to logical package 3
[ 0.550524] smpboot: Max logical packages: 19
3. nr_cpus=49 ( 2 socket + 1 core on 3rd socket), should stop enumerating in
package 2:
[ 0.521378] smpboot: APIC(0) Converting physical 0 to logical package 0
[ 0.527314] smpboot: APIC(40) Converting physical 1 to logical package 1
[ 0.533345] smpboot: APIC(80) Converting physical 2 to logical package 2
[ 0.539368] smpboot: Max logical packages: 19
4. maxcpus=49, should still enumerate all packages:
[ 0.525591] smpboot: APIC(0) Converting physical 0 to logical package 0
[ 0.531525] smpboot: APIC(40) Converting physical 1 to logical package 1
[ 0.537547] smpboot: APIC(80) Converting physical 2 to logical package 2
[ 0.543579] smpboot: APIC(c0) Converting physical 3 to logical package 3
[ 0.549624] smpboot: Max logical packages: 19
5. kdump (nr_cpus=1) works as well.
Reported-by: Frank Ramsay <framsay@redhat.com>
Tested-by: Prarit Bhargava <prarit@redhat.com>
Signed-off-by: Jiri Olsa <jolsa@kernel.org>
Reviewed-by: Prarit Bhargava <prarit@redhat.com>
Acked-by: Peter Zijlstra <peterz@infradead.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/20160815101700.GA30090@krava
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2016-08-15 18:17:00 +08:00
|
|
|
new = logical_packages++;
|
2016-12-12 18:04:53 +08:00
|
|
|
if (new != pkg) {
|
|
|
|
pr_info("CPU %u Converting physical %u to logical package %u\n",
|
|
|
|
cpu, pkg, new);
|
|
|
|
}
|
2016-02-23 06:19:15 +08:00
|
|
|
found:
|
2017-11-14 20:42:56 +08:00
|
|
|
cpu_data(cpu).logical_proc_id = new;
|
2016-02-23 06:19:15 +08:00
|
|
|
return 0;
|
|
|
|
}
|
2019-05-14 01:58:49 +08:00
|
|
|
/**
|
|
|
|
* topology_update_die_map - Update the physical to logical die map
|
|
|
|
* @die: The die id as retrieved via CPUID
|
|
|
|
* @cpu: The cpu for which this is updated
|
|
|
|
*/
|
|
|
|
int topology_update_die_map(unsigned int die, unsigned int cpu)
|
|
|
|
{
|
|
|
|
int new;
|
|
|
|
|
|
|
|
/* Already available somewhere? */
|
|
|
|
new = topology_phys_to_logical_die(die, cpu);
|
|
|
|
if (new >= 0)
|
|
|
|
goto found;
|
|
|
|
|
|
|
|
new = logical_die++;
|
|
|
|
if (new != die) {
|
|
|
|
pr_info("CPU %u Converting physical %u to logical die %u\n",
|
|
|
|
cpu, die, new);
|
|
|
|
}
|
|
|
|
found:
|
|
|
|
cpu_data(cpu).logical_die_id = new;
|
|
|
|
return 0;
|
|
|
|
}
|
2016-02-23 06:19:15 +08:00
|
|
|
|
2023-07-28 02:05:33 +08:00
|
|
|
static void __init smp_store_boot_cpu_info(void)
|
2012-11-14 03:32:41 +08:00
|
|
|
{
|
|
|
|
int id = 0; /* CPU 0 */
|
|
|
|
struct cpuinfo_x86 *c = &cpu_data(id);
|
|
|
|
|
|
|
|
*c = boot_cpu_data;
|
|
|
|
c->cpu_index = id;
|
2017-11-14 20:42:57 +08:00
|
|
|
topology_update_package_map(c->phys_proc_id, id);
|
2019-05-14 01:58:49 +08:00
|
|
|
topology_update_die_map(c->cpu_die_id, id);
|
2017-11-14 20:42:56 +08:00
|
|
|
c->initialized = true;
|
2012-11-14 03:32:41 +08:00
|
|
|
}
|
|
|
|
|
2008-03-20 01:25:05 +08:00
|
|
|
/*
|
|
|
|
* The bootstrap kernel entry code has set these up. Save them for
|
|
|
|
* a given CPU
|
|
|
|
*/
|
2013-06-19 06:23:59 +08:00
|
|
|
void smp_store_cpu_info(int id)
|
2008-03-20 01:25:05 +08:00
|
|
|
{
|
|
|
|
struct cpuinfo_x86 *c = &cpu_data(id);
|
|
|
|
|
2017-11-14 20:42:56 +08:00
|
|
|
/* Copy boot_cpu_data only on the first bringup */
|
|
|
|
if (!c->initialized)
|
|
|
|
*c = boot_cpu_data;
|
2008-03-20 01:25:05 +08:00
|
|
|
c->cpu_index = id;
|
2012-11-14 03:32:41 +08:00
|
|
|
/*
|
|
|
|
* During boot time, CPU0 has this setup already. Save the info when
|
2023-05-13 05:07:06 +08:00
|
|
|
* bringing up an AP.
|
2012-11-14 03:32:41 +08:00
|
|
|
*/
|
|
|
|
identify_secondary_cpu(c);
|
2017-11-14 20:42:56 +08:00
|
|
|
c->initialized = true;
|
2008-03-20 01:25:05 +08:00
|
|
|
}
|
|
|
|
|
x86, sched: Add new topology for multi-NUMA-node CPUs
I'm getting the spew below when booting with Haswell (Xeon
E5-2699 v3) CPUs and the "Cluster-on-Die" (CoD) feature enabled
in the BIOS. It seems similar to the issue that some folks from
AMD ran in to on their systems and addressed in this commit:
161270fc1f9d ("x86/smp: Fix topology checks on AMD MCM CPUs")
Both these Intel and AMD systems break an assumption which is
being enforced by topology_sane(): a socket may not contain more
than one NUMA node.
AMD special-cased their system by looking for a cpuid flag. The
Intel mode is dependent on BIOS options and I do not know of a
way which it is enumerated other than the tables being parsed
during the CPU bringup process. In other words, we have to trust
the ACPI tables <shudder>.
This detects the situation where a NUMA node occurs at a place in
the middle of the "CPU" sched domains. It replaces the default
topology with one that relies on the NUMA information from the
firmware (SRAT table) for all levels of sched domains above the
hyperthreads.
This also fixes a sysfs bug. We used to freak out when we saw
the "mc" group cross a node boundary, so we stopped building the
MC group. MC gets exported as the 'core_siblings_list' in
/sys/devices/system/cpu/cpu*/topology/ and this caused CPUs with
the same 'physical_package_id' to not be listed together in
'core_siblings_list'. This violates a statement from
Documentation/ABI/testing/sysfs-devices-system-cpu:
core_siblings: internal kernel map of cpu#'s hardware threads
within the same physical_package_id.
core_siblings_list: human-readable list of the logical CPU
numbers within the same physical_package_id as cpu#.
The sysfs effects here cause an issue with the hwloc tool where
it gets confused and thinks there are more sockets than are
physically present.
Before this patch, there are two packages:
# cd /sys/devices/system/cpu/
# cat cpu*/topology/physical_package_id | sort | uniq -c
18 0
18 1
But 4 _sets_ of core siblings:
# cat cpu*/topology/core_siblings_list | sort | uniq -c
9 0-8
9 18-26
9 27-35
9 9-17
After this set, there are only 2 sets of core siblings, which
is what we expect for a 2-socket system.
# cat cpu*/topology/physical_package_id | sort | uniq -c
18 0
18 1
# cat cpu*/topology/core_siblings_list | sort | uniq -c
18 0-17
18 18-35
Example spew:
...
NMI watchdog: enabled on all CPUs, permanently consumes one hw-PMU counter.
#2 #3 #4 #5 #6 #7 #8
.... node #1, CPUs: #9
------------[ cut here ]------------
WARNING: CPU: 9 PID: 0 at /home/ak/hle/linux-hle-2.6/arch/x86/kernel/smpboot.c:306 topology_sane.isra.2+0x74/0x90()
sched: CPU #9's mc-sibling CPU #0 is not on the same node! [node: 1 != 0]. Ignoring dependency.
Modules linked in:
CPU: 9 PID: 0 Comm: swapper/9 Not tainted 3.17.0-rc1-00293-g8e01c4d-dirty #631
Hardware name: Intel Corporation S2600WTT/S2600WTT, BIOS GRNDSDP1.86B.0036.R05.1407140519 07/14/2014
0000000000000009 ffff88046ddabe00 ffffffff8172e485 ffff88046ddabe48
ffff88046ddabe38 ffffffff8109691d 000000000000b001 0000000000000009
ffff88086fc12580 000000000000b020 0000000000000009 ffff88046ddabe98
Call Trace:
[<ffffffff8172e485>] dump_stack+0x45/0x56
[<ffffffff8109691d>] warn_slowpath_common+0x7d/0xa0
[<ffffffff8109698c>] warn_slowpath_fmt+0x4c/0x50
[<ffffffff81074f94>] topology_sane.isra.2+0x74/0x90
[<ffffffff8107530e>] set_cpu_sibling_map+0x31e/0x4f0
[<ffffffff8107568d>] start_secondary+0x1ad/0x240
---[ end trace 3fe5f587a9fcde61 ]---
#10 #11 #12 #13 #14 #15 #16 #17
.... node #2, CPUs: #18 #19 #20 #21 #22 #23 #24 #25 #26
.... node #3, CPUs: #27 #28 #29 #30 #31 #32 #33 #34 #35
Signed-off-by: Dave Hansen <dave.hansen@linux.intel.com>
[ Added LLC domain and s/match_mc/match_die/ ]
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Borislav Petkov <bp@alien8.de>
Cc: David Rientjes <rientjes@google.com>
Cc: Igor Mammedov <imammedo@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Prarit Bhargava <prarit@redhat.com>
Cc: Toshi Kani <toshi.kani@hp.com>
Cc: brice.goglin@gmail.com
Cc: "H. Peter Anvin" <hpa@linux.intel.com>
Link: http://lkml.kernel.org/r/20140918193334.C065EBCE@viggo.jf.intel.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2014-09-19 03:33:34 +08:00
|
|
|
static bool
|
|
|
|
topology_same_node(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o)
|
|
|
|
{
|
|
|
|
int cpu1 = c->cpu_index, cpu2 = o->cpu_index;
|
|
|
|
|
|
|
|
return (cpu_to_node(cpu1) == cpu_to_node(cpu2));
|
|
|
|
}
|
|
|
|
|
2013-06-19 06:23:59 +08:00
|
|
|
static bool
|
2012-05-11 19:05:59 +08:00
|
|
|
topology_sane(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o, const char *name)
|
2010-09-30 20:41:56 +08:00
|
|
|
{
|
2012-05-11 19:05:59 +08:00
|
|
|
int cpu1 = c->cpu_index, cpu2 = o->cpu_index;
|
|
|
|
|
2014-09-19 03:33:34 +08:00
|
|
|
return !WARN_ONCE(!topology_same_node(c, o),
|
2012-05-11 19:05:59 +08:00
|
|
|
"sched: CPU #%d's %s-sibling CPU #%d is not on the same node! "
|
|
|
|
"[node: %d != %d]. Ignoring dependency.\n",
|
|
|
|
cpu1, name, cpu2, cpu_to_node(cpu1), cpu_to_node(cpu2));
|
|
|
|
}
|
|
|
|
|
2015-05-26 21:11:35 +08:00
|
|
|
#define link_mask(mfunc, c1, c2) \
|
2012-05-11 19:05:59 +08:00
|
|
|
do { \
|
2015-05-26 21:11:35 +08:00
|
|
|
cpumask_set_cpu((c1), mfunc(c2)); \
|
|
|
|
cpumask_set_cpu((c2), mfunc(c1)); \
|
2012-05-11 19:05:59 +08:00
|
|
|
} while (0)
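/*
 * Usage sketch (illustrative, not part of the original code): link_mask()
 * records a sibling relation symmetrically, so a match found by one of the
 * match_*() helpers below ends up in both CPUs' masks, e.g.:
 *
 *	if (match_smt(c, o))
 *		link_mask(topology_sibling_cpumask, cpu, i);
 *
 * after which each CPU appears in the other's topology_sibling_cpumask().
 */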
|
|
|
|
|
2013-06-19 06:23:59 +08:00
|
|
|
static bool match_smt(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o)
|
2012-05-11 19:05:59 +08:00
|
|
|
{
|
2015-12-07 17:39:41 +08:00
|
|
|
if (boot_cpu_has(X86_FEATURE_TOPOEXT)) {
|
2012-05-11 19:05:59 +08:00
|
|
|
int cpu1 = c->cpu_index, cpu2 = o->cpu_index;
|
|
|
|
|
|
|
|
if (c->phys_proc_id == o->phys_proc_id &&
|
2019-05-14 01:58:45 +08:00
|
|
|
c->cpu_die_id == o->cpu_die_id &&
|
2017-02-05 18:50:21 +08:00
|
|
|
per_cpu(cpu_llc_id, cpu1) == per_cpu(cpu_llc_id, cpu2)) {
|
|
|
|
if (c->cpu_core_id == o->cpu_core_id)
|
|
|
|
return topology_sane(c, o, "smt");
|
|
|
|
|
|
|
|
if ((c->cu_id != 0xff) &&
|
|
|
|
(o->cu_id != 0xff) &&
|
|
|
|
(c->cu_id == o->cu_id))
|
|
|
|
return topology_sane(c, o, "smt");
|
|
|
|
}
|
2012-05-11 19:05:59 +08:00
|
|
|
|
|
|
|
} else if (c->phys_proc_id == o->phys_proc_id &&
|
2019-05-14 01:58:45 +08:00
|
|
|
c->cpu_die_id == o->cpu_die_id &&
|
2012-05-11 19:05:59 +08:00
|
|
|
c->cpu_core_id == o->cpu_core_id) {
|
|
|
|
return topology_sane(c, o, "smt");
|
|
|
|
}
|
|
|
|
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
|
2021-03-11 03:02:33 +08:00
|
|
|
static bool match_die(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o)
|
|
|
|
{
|
|
|
|
if (c->phys_proc_id == o->phys_proc_id &&
|
|
|
|
c->cpu_die_id == o->cpu_die_id)
|
|
|
|
return true;
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
|
2021-09-24 16:51:04 +08:00
|
|
|
static bool match_l2c(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o)
|
|
|
|
{
|
|
|
|
int cpu1 = c->cpu_index, cpu2 = o->cpu_index;
|
|
|
|
|
2021-10-22 23:49:53 +08:00
|
|
|
/* If the arch didn't set up l2c_id, fall back to SMT */
|
2021-09-24 16:51:04 +08:00
|
|
|
if (per_cpu(cpu_l2c_id, cpu1) == BAD_APICID)
|
2021-10-22 23:49:53 +08:00
|
|
|
return match_smt(c, o);
|
2021-09-24 16:51:04 +08:00
|
|
|
|
|
|
|
/* Do not match if L2 cache id does not match: */
|
|
|
|
if (per_cpu(cpu_l2c_id, cpu1) != per_cpu(cpu_l2c_id, cpu2))
|
|
|
|
return false;
|
|
|
|
|
|
|
|
return topology_sane(c, o, "l2c");
|
|
|
|
}
|
|
|
|
|
/*
 * Unlike the other levels, we do not enforce keeping a
 * multicore group inside a NUMA node. If this happens, we will
 * discard the MC level of the topology later.
 */
static bool match_pkg(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o)
{
	if (c->phys_proc_id == o->phys_proc_id)
		return true;
	return false;
}

/*
 * Define intel_cod_cpu[] for Intel COD (Cluster-on-Die) CPUs.
 *
 * Any Intel CPU that has multiple nodes per package and does not
 * match intel_cod_cpu[] has the SNC (Sub-NUMA Cluster) topology.
 *
 * When in SNC mode, these CPUs enumerate an LLC that is shared
 * by multiple NUMA nodes. The LLC is shared for off-package data
 * access but private to the NUMA node (half of the package) for
 * on-package access. CPUID (the source of the information about
 * the LLC) can only enumerate the cache as shared or unshared,
 * but not this particular configuration.
 */
static const struct x86_cpu_id intel_cod_cpu[] = {
	X86_MATCH_INTEL_FAM6_MODEL(HASWELL_X, 0),	/* COD */
	X86_MATCH_INTEL_FAM6_MODEL(BROADWELL_X, 0),	/* COD */
	X86_MATCH_INTEL_FAM6_MODEL(ANY, 1),		/* SNC */
	{}
};

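/*
 * The ->driver_data value in the table above is what match_llc() below
 * reads back through x86_match_cpu(): 0 marks the Cluster-on-Die parts
 * (HASWELL_X/BROADWELL_X), while 1 on any other multi-node-per-package
 * Intel CPU marks Sub-NUMA Clustering.
 */
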
static bool match_llc(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o)
{
	const struct x86_cpu_id *id = x86_match_cpu(intel_cod_cpu);
	int cpu1 = c->cpu_index, cpu2 = o->cpu_index;
	bool intel_snc = id && id->driver_data;

	/* Do not match if we do not have a valid APICID for cpu: */
	if (per_cpu(cpu_llc_id, cpu1) == BAD_APICID)
		return false;

	/* Do not match if LLC id does not match: */
	if (per_cpu(cpu_llc_id, cpu1) != per_cpu(cpu_llc_id, cpu2))
		return false;

	/*
	 * Allow the SNC topology without warning. Return of false
	 * means 'c' does not share the LLC of 'o'. This will be
	 * reflected to userspace.
	 */
	if (match_pkg(c, o) && !topology_same_node(c, o) && intel_snc)
		return false;

	return topology_sane(c, o, "llc");
}

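/*
 * topology_sane() warns when the two CPUs sit on different NUMA nodes
 * ("... is not on the same node! ... Ignoring dependency."), which is why
 * the SNC case above returns false before reaching it: an SNC LLC
 * legitimately spans several nodes and must not trigger that warning.
 */
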
static inline int x86_sched_itmt_flags(void)
{
	return sysctl_sched_itmt_enabled ? SD_ASYM_PACKING : 0;
}

#ifdef CONFIG_SCHED_MC
static int x86_core_flags(void)
{
	return cpu_core_flags() | x86_sched_itmt_flags();
}
#endif
#ifdef CONFIG_SCHED_SMT
static int x86_smt_flags(void)
{
	return cpu_smt_flags();
}
#endif
#ifdef CONFIG_SCHED_CLUSTER
static int x86_cluster_flags(void)
{
	return cpu_cluster_flags() | x86_sched_itmt_flags();
}
#endif

static int x86_die_flags(void)
{
	if (cpu_feature_enabled(X86_FEATURE_HYBRID_CPU))
		return x86_sched_itmt_flags();

	return 0;
}

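/*
 * Summary of the flag helpers above: the CLS and MC levels add
 * SD_ASYM_PACKING whenever the ITMT sysctl is enabled, the SMT level
 * uses the generic cpu_smt_flags() unchanged, and the DIE level adds
 * SD_ASYM_PACKING only on hybrid (X86_FEATURE_HYBRID_CPU) parts.
 */
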
/*
 * Set if a package/die has multiple NUMA nodes inside.
 * AMD Magny-Cours, Intel Cluster-on-Die, and Intel
 * Sub-NUMA Clustering have this.
 */
static bool x86_has_numa_in_package;

static struct sched_domain_topology_level x86_topology[6];

static void __init build_sched_topology(void)
{
	int i = 0;

#ifdef CONFIG_SCHED_SMT
	x86_topology[i++] = (struct sched_domain_topology_level){
		cpu_smt_mask, x86_smt_flags, SD_INIT_NAME(SMT)
	};
#endif
#ifdef CONFIG_SCHED_CLUSTER
	x86_topology[i++] = (struct sched_domain_topology_level){
		cpu_clustergroup_mask, x86_cluster_flags, SD_INIT_NAME(CLS)
	};
#endif
#ifdef CONFIG_SCHED_MC
	x86_topology[i++] = (struct sched_domain_topology_level){
		cpu_coregroup_mask, x86_core_flags, SD_INIT_NAME(MC)
	};
#endif
	/*
	 * When there is NUMA topology inside the package skip the DIE domain
	 * since the NUMA domains will auto-magically create the right spanning
	 * domains based on the SLIT.
	 */
	if (!x86_has_numa_in_package) {
		x86_topology[i++] = (struct sched_domain_topology_level){
			cpu_cpu_mask, x86_die_flags, SD_INIT_NAME(DIE)
		};
	}

	/*
	 * There must be one trailing NULL entry left.
	 */
	BUG_ON(i >= ARRAY_SIZE(x86_topology)-1);

	set_sched_topology(x86_topology);
}

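/*
 * With every scheduler option above enabled and no NUMA nodes inside the
 * package, the resulting hierarchy, innermost to outermost, is roughly:
 *
 *	SMT - hyperthreads of one core          (cpu_smt_mask)
 *	CLS - CPUs sharing an L2 "cluster"      (cpu_clustergroup_mask)
 *	MC  - CPUs sharing the last level cache (cpu_coregroup_mask)
 *	DIE - all CPUs of the package/node      (cpu_cpu_mask)
 *
 * With in-package NUMA the DIE entry is dropped and the generic NUMA
 * levels derived from the SRAT/SLIT take over, as noted above.
 */
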
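/*
 * Link the given CPU into every topology mask it shares with the CPUs
 * that are already set up: SMT siblings, L2 and LLC sharers, die and
 * package mates. Also maintains booted_cores and flags, via
 * x86_has_numa_in_package, packages that span more than one NUMA node
 * so that build_sched_topology() can drop the DIE level.
 */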
void set_cpu_sibling_map(int cpu)
{
	bool has_smt = smp_num_siblings > 1;
	bool has_mp = has_smt || boot_cpu_data.x86_max_cores > 1;
	struct cpuinfo_x86 *c = &cpu_data(cpu);
	struct cpuinfo_x86 *o;
	int i, threads;

	cpumask_set_cpu(cpu, cpu_sibling_setup_mask);

	if (!has_mp) {
		cpumask_set_cpu(cpu, topology_sibling_cpumask(cpu));
		cpumask_set_cpu(cpu, cpu_llc_shared_mask(cpu));
		cpumask_set_cpu(cpu, cpu_l2c_shared_mask(cpu));
		cpumask_set_cpu(cpu, topology_core_cpumask(cpu));
		cpumask_set_cpu(cpu, topology_die_cpumask(cpu));
		c->booted_cores = 1;
		return;
	}

	for_each_cpu(i, cpu_sibling_setup_mask) {
		o = &cpu_data(i);

		if (match_pkg(c, o) && !topology_same_node(c, o))
			x86_has_numa_in_package = true;

		if ((i == cpu) || (has_smt && match_smt(c, o)))
			link_mask(topology_sibling_cpumask, cpu, i);

		if ((i == cpu) || (has_mp && match_llc(c, o)))
			link_mask(cpu_llc_shared_mask, cpu, i);

		if ((i == cpu) || (has_mp && match_l2c(c, o)))
			link_mask(cpu_l2c_shared_mask, cpu, i);

		if ((i == cpu) || (has_mp && match_die(c, o)))
			link_mask(topology_die_cpumask, cpu, i);
	}

	threads = cpumask_weight(topology_sibling_cpumask(cpu));
	if (threads > __max_smt_threads)
		__max_smt_threads = threads;

	for_each_cpu(i, topology_sibling_cpumask(cpu))
		cpu_data(i).smt_active = threads > 1;

	/*
	 * This needs a separate iteration over the cpus because we rely on all
	 * topology_sibling_cpumask links to be set-up.
	 */
	for_each_cpu(i, cpu_sibling_setup_mask) {
		o = &cpu_data(i);

		if ((i == cpu) || (has_mp && match_pkg(c, o))) {
			link_mask(topology_core_cpumask, cpu, i);

			/*
			 * Does this new cpu bring up a new core?
			 */
			if (threads == 1) {
				/*
				 * for each core in package, increment
				 * the booted_cores for this new cpu
				 */
				if (cpumask_first(topology_sibling_cpumask(i)) == i)
					c->booted_cores++;
				/*
				 * increment the core count for all
				 * the other cpus in this package
				 */
				if (i != cpu)
					cpu_data(i).booted_cores++;
			} else if (i != cpu && !c->booted_cores)
				c->booted_cores = cpu_data(i).booted_cores;
		}
	}
}

/* maps the cpu to the sched domain representing multi-core */
const struct cpumask *cpu_coregroup_mask(int cpu)
{
	return cpu_llc_shared_mask(cpu);
}

const struct cpumask *cpu_clustergroup_mask(int cpu)
{
	return cpu_l2c_shared_mask(cpu);
}
EXPORT_SYMBOL_GPL(cpu_clustergroup_mask);

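/*
 * These two helpers are what build_sched_topology() plugs into the MC and
 * CLS levels: the "multi-core" group follows the LLC sharing map rather
 * than the physical core layout, and the cluster group follows the
 * shared L2.
 */
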
static void impress_friends(void)
{
	int cpu;
	unsigned long bogosum = 0;
	/*
	 * Allow the user to impress friends.
	 */
	pr_debug("Before bogomips\n");
	for_each_online_cpu(cpu)
		bogosum += cpu_data(cpu).loops_per_jiffy;

	pr_info("Total of %d processors activated (%lu.%02lu BogoMIPS)\n",
		num_online_cpus(),
		bogosum/(500000/HZ),
		(bogosum/(5000/HZ))%100);

	pr_debug("Before bogocount - setting activated=1\n");
}

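/*
 * BogoMIPS arithmetic used above: one BogoMIPS equals 500000 delay-loop
 * iterations per second, so with bogosum being the sum of loops_per_jiffy
 * over all online CPUs the integer part is bogosum / (500000 / HZ) and
 * the two fractional digits are (bogosum / (5000 / HZ)) % 100.
 */
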
/*
 * The Multiprocessor Specification 1.4 (1997) example code suggests
 * that there should be a 10ms delay between the BSP asserting INIT
 * and de-asserting INIT, when starting a remote processor.
 * But that slows boot and resume on modern processors, which include
 * many cores and don't require that delay.
 *
 * Cmdline "cpu_init_udelay=" is available to override this delay.
 * Modern processor families are quirked to remove the delay entirely.
 */
#define UDELAY_10MS_DEFAULT 10000

static unsigned int init_udelay = UINT_MAX;

static int __init cpu_init_udelay(char *str)
{
	get_option(&str, &init_udelay);

	return 0;
}
early_param("cpu_init_udelay", cpu_init_udelay);

static void __init smp_quirk_init_udelay(void)
{
	/* if cmdline changed it from default, leave it alone */
	if (init_udelay != UINT_MAX)
		return;

	/* if modern processor, use no delay */
	if (((boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) && (boot_cpu_data.x86 == 6)) ||
	    ((boot_cpu_data.x86_vendor == X86_VENDOR_HYGON) && (boot_cpu_data.x86 >= 0x18)) ||
	    ((boot_cpu_data.x86_vendor == X86_VENDOR_AMD) && (boot_cpu_data.x86 >= 0xF))) {
		init_udelay = 0;
		return;
	}
	/* else, use legacy delay */
	init_udelay = UDELAY_10MS_DEFAULT;
}

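/*
 * Example (hypothetical values): booting with "cpu_init_udelay=10000"
 * forces the legacy 10 ms delay even on quirked modern CPUs, while
 * "cpu_init_udelay=0" removes the delay everywhere; without the
 * parameter, smp_quirk_init_udelay() picks the default shown above.
 */
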
/*
 * Send the INIT assert/de-assert IPI pair to the target CPU: the
 * "INIT, INIT" part of the INIT, INIT, STARTUP wakeup sequence.
 */
static void send_init_sequence(int phys_apicid)
{
	int maxlvt = lapic_get_maxlvt();

	/* Be paranoid about clearing APIC errors. */
	if (APIC_INTEGRATED(boot_cpu_apic_version)) {
		/* Due to the Pentium erratum 3AP. */
		if (maxlvt > 3)
			apic_write(APIC_ESR, 0);
		apic_read(APIC_ESR);
	}

	/* Assert INIT on the target CPU */
	apic_icr_write(APIC_INT_LEVELTRIG | APIC_INT_ASSERT | APIC_DM_INIT, phys_apicid);
	safe_apic_wait_icr_idle();

	udelay(init_udelay);

	/* Deassert INIT on the target CPU */
	apic_icr_write(APIC_INT_LEVELTRIG | APIC_DM_INIT, phys_apicid);
	safe_apic_wait_icr_idle();
}

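/*
 * Besides the STARTUP path below, send_init_sequence() is also shared
 * with the shutdown/kexec path, which parks APs in the INIT/wait-for-
 * STARTUP state because that state, unlike HLT, is not disturbed by
 * stray NMIs.
 */
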
2023-06-16 04:33:58 +08:00
|
|
|
/*
|
|
|
|
* Wake up AP by INIT, INIT, STARTUP sequence.
|
|
|
|
*/
|
|
|
|
static int wakeup_secondary_cpu_via_init(int phys_apicid, unsigned long start_eip)
|
|
|
|
{
|
|
|
|
unsigned long send_status = 0, accept_status = 0;
|
A set of fixes for kexec(), reboot and shutdown issues
- Ensure that the WBINVD in stop_this_cpu() has been completed before the
control CPU proceedes.
stop_this_cpu() is used for kexec(), reboot and shutdown to park the APs
in a HLT loop.
The control CPU sends an IPI to the APs and waits for their CPU online bits
to be cleared. Once they all are marked "offline" it proceeds.
But stop_this_cpu() clears the CPU online bit before issuing WBINVD,
which means there is no guarantee that the AP has reached the HLT loop.
This was reported to cause intermittent reboot/shutdown failures due to
some dubious interaction with the firmware.
This is not only a problem of WBINVD. The code to actually "stop" the
CPU which runs between clearing the online bit and reaching the HLT loop
can cause large enough delays on its own (think virtualization). That's
especially dangerous for kexec() as kexec() expects that all APs are in
a safe state and not executing code while the boot CPU jumps to the new
kernel. There are more issues vs. kexec() which are addressed separately.
Cure this by implementing an explicit synchronization point right before
the AP reaches HLT. This guarantees that the AP has completed the full
stop proceedure.
- Fix the condition for WBINVD in stop_this_cpu().
The WBINVD in stop_this_cpu() is required for ensuring that when
switching to or from memory encryption no dirty data is left in the
cache lines which might cause a write back in the wrong more later.
This checks CPUID directly because the feature bit might have been
cleared due to a command line option.
But that CPUID check accesses leaf 0x8000001f::EAX unconditionally. Intel
CPUs return the content of the highest supported leaf when a non-existing
leaf is read, while AMD CPUs return all zeros for unsupported leafs.
So the result of the test on Intel CPUs is lottery and on AMD its just
correct by chance.
While harmless it's incorrect and causes the conditional wbinvd() to be
issued where not required, which caused the above issue to be unearthed.
- Make kexec() robust against AP code execution
Ashok observed triple faults when doing kexec() on a system which had
been booted with "nosmt".
It turned out that the SMT siblings which had been brought up partially
are parked in mwait_play_dead() to enable power savings.
mwait_play_dead() is monitoring the thread flags of the AP's idle task,
which has been chosen as it's unlikely to be written to.
But kexec() can overwrite the previous kernel text and data including
page tables etc. When it overwrites the cache lines monitored by an AP
that AP resumes execution after the MWAIT on eventually overwritten
text, stack and page tables, which obviously might end up in a triple
fault easily.
Make this more robust in several steps:
1) Use an explicit per CPU cache line for monitoring.
2) Write a command to these cache lines to kick APs out of MWAIT before
proceeding with kexec(), shutdown or reboot.
The APs confirm the wakeup by writing status back and then enter a
HLT loop.
3) If the system uses INIT/INIT/STARTUP for AP bringup, park the APs
in INIT state.
HLT is not a guarantee that an AP won't wake up and resume
execution. HLT is woken up by NMI and SMI. SMI puts the CPU back
into HLT (+/- firmware bugs), but NMI is delivered to the CPU which
executes the NMI handler. Same issue as the MWAIT scenario described
above.
Sending an INIT/INIT sequence to the APs puts them into wait for
STARTUP state, which is safe against NMI.
There is still an issue remaining which can't be fixed: #MCE
If the AP sits in HLT and receives a broadcast #MCE it will try to
handle it with the obvious consequences.
INIT/INIT clears CR4.MCE in the AP which will cause a broadcast #MCE to
shut down the machine.
So there is a choice between fire (HLT) and frying pan (INIT). Frying
pan has been chosen as it's at least preventing the NMI issue.
On systems which are not using INIT/INIT/STARTUP there is not much
which can be done right now, but at least the obvious and easy to
trigger MWAIT issue has been addressed.
-----BEGIN PGP SIGNATURE-----
iQJHBAABCgAxFiEEQp8+kY+LLUocC4bMphj1TA10mKEFAmSZfpQTHHRnbHhAbGlu
dXRyb25peC5kZQAKCRCmGPVMDXSYoeZpD/9gSJN2qtGqoOgE8bWAenEeqppmBGFE
EAhuhsvN1qG9JosUFo4KzxsGD/aWt2P6XglBDrGti8mFNol67jutmwWklntL3/ZR
m8D6D+Pl7/CaDgACDTDbrnVC3lOGyMhD301yJrnBigS/SEoHeHI9UtadbHukuLQj
TlKt5KtAnap15bE6QL846cDIptB9SjYLLPULo3i4azXEis/l6eAkffwAR6dmKlBh
2RbhLK1xPPG9nqWYjqZXnex09acKwD9xY9xHj4+GampV4UqHJRWfW0YtFs5ENi01
r3FVCdKEcvMkUw0zh0IAviBRs2vCI/R3YSfEc7P0264yn5WzMhAT+OGCovNjByiW
sB4Iqa+Yf6aoBWwux6W4d22xu7uYhmFk/jiLyRZJPW/gvGZCZATT/x/T2hRoaYA8
3S0Rs7n/gbfvynQETgniifuM0bXRW0lEJAmn840GwyVQwlpDEPBJSwW4El49kbkc
+dHxnmpMCfnBxfVLS1YDd4WOmkWBeECNcW330FShlQQ8mM3UG31+Q8Jc55Ze9SW0
w1h+IgIOHlA0DpQUUM8DJTSuxFx2piQsZxjOtzd70+BiKZpCsHqVLIp4qfnf+/GO
gyP0cCQLbafpABbV9uVy8A/qgUGi0Qii0GJfCTy0OdmU+JX3C2C/gsM3uN0g3qAj
vUhkuCXEGL5k1w==
=KgZ0
-----END PGP SIGNATURE-----
Merge tag 'x86-core-2023-06-26' of ssh://gitolite.kernel.org/pub/scm/linux/kernel/git/tip/tip
Pull x86 core updates from Thomas Gleixner:
"A set of fixes for kexec(), reboot and shutdown issues:
- Ensure that the WBINVD in stop_this_cpu() has been completed before
the control CPU proceedes.
stop_this_cpu() is used for kexec(), reboot and shutdown to park
the APs in a HLT loop.
The control CPU sends an IPI to the APs and waits for their CPU
online bits to be cleared. Once they all are marked "offline" it
proceeds.
But stop_this_cpu() clears the CPU online bit before issuing
WBINVD, which means there is no guarantee that the AP has reached
the HLT loop.
This was reported to cause intermittent reboot/shutdown failures
due to some dubious interaction with the firmware.
This is not only a problem of WBINVD. The code to actually "stop"
the CPU which runs between clearing the online bit and reaching the
HLT loop can cause large enough delays on its own (think
virtualization). That's especially dangerous for kexec() as kexec()
expects that all APs are in a safe state and not executing code
while the boot CPU jumps to the new kernel. There are more issues
vs kexec() which are addressed separately.
Cure this by implementing an explicit synchronization point right
before the AP reaches HLT. This guarantees that the AP has
completed the full stop procedure; a minimal sketch of such a sync
point follows the quoted message below.
- Fix the condition for WBINVD in stop_this_cpu().
The WBINVD in stop_this_cpu() is required for ensuring that when
switching to or from memory encryption no dirty data is left in the
cache lines which might cause a write back in the wrong mode later.
This checks CPUID directly because the feature bit might have been
cleared due to a command line option.
But that CPUID check accesses leaf 0x8000001f::EAX unconditionally.
Intel CPUs return the content of the highest supported leaf when a
non-existing leaf is read, while AMD CPUs return all zeros for
unsupported leaves.
So the result of the test on Intel CPUs is a lottery and on AMD it's
just correct by chance.
While harmless it's incorrect and causes the conditional wbinvd()
to be issued where not required, which caused the above issue to be
unearthed (a minimal sketch of the corrected CPUID check follows below).
- Make kexec() robust against AP code execution
Ashok observed triple faults when doing kexec() on a system which
had been booted with "nosmt".
It turned out that the SMT siblings which had been brought up
partially are parked in mwait_play_dead() to enable power savings.
mwait_play_dead() is monitoring the thread flags of the AP's idle
task, which has been chosen as it's unlikely to be written to.
But kexec() can overwrite the previous kernel text and data
including page tables etc. When it overwrites the cache lines
monitored by an AP that AP resumes execution after the MWAIT on
eventually overwritten text, stack and page tables, which obviously
might end up in a triple fault easily.
Make this more robust in several steps:
1) Use an explicit per CPU cache line for monitoring.
2) Write a command to these cache lines to kick APs out of MWAIT
before proceeding with kexec(), shutdown or reboot.
The APs confirm the wakeup by writing status back and then
enter a HLT loop.
3) If the system uses INIT/INIT/STARTUP for AP bringup, park the
APs in INIT state.
HLT is not a guarantee that an AP won't wake up and resume
execution. HLT is woken up by NMI and SMI. SMI puts the CPU
back into HLT (+/- firmware bugs), but NMI is delivered to the
CPU which executes the NMI handler. Same issue as the MWAIT
scenario described above.
Sending an INIT/INIT sequence to the APs puts them into wait
for STARTUP state, which is safe against NMI.
There is still an issue remaining which can't be fixed: #MCE
If the AP sits in HLT and receives a broadcast #MCE it will try to
handle it with the obvious consequences.
INIT/INIT clears CR4.MCE in the AP which will cause a broadcast
#MCE to shut down the machine.
So there is a choice between fire (HLT) and frying pan (INIT).
Frying pan has been chosen as it's at least preventing the NMI
issue.
On systems which are not using INIT/INIT/STARTUP there is not much
which can be done right now, but at least the obvious and easy to
trigger MWAIT issue has been addressed"
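The "explicit synchronization point" from the first bullet can be sketched
as follows. This is illustrative only: the names (aps_stopping,
ap_stop_self(), wait_for_aps_stopped()) are made up, and the timeout and
NMI fallback handling of the real code is omitted; the actual fix lives in
stop_this_cpu() and native_stop_other_cpus().

#include <linux/atomic.h>
#include <linux/cpumask.h>
#include <linux/delay.h>
#include <linux/smp.h>
#include <asm/special_insns.h>
#include <asm/irqflags.h>

/* Number of APs that have not yet reached their final HLT loop. */
static atomic_t aps_stopping;

/* Runs on every AP in response to the stop IPI. */
static void ap_stop_self(void)
{
	/* Clearing the online bit alone does not mean the AP is parked. */
	set_cpu_online(smp_processor_id(), false);

	/* Flush caches in case the memory encryption state is changing. */
	native_wbinvd();

	/*
	 * Synchronization point: report completion only now, immediately
	 * before parking, so the control CPU cannot race ahead of this AP.
	 */
	atomic_dec(&aps_stopping);

	for (;;)
		native_halt();
}

/* Control CPU: wait for the explicit acknowledgement from every AP. */
static void wait_for_aps_stopped(void)
{
	while (atomic_read(&aps_stopping) > 0)
		udelay(1);
}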
* tag 'x86-core-2023-06-26' of ssh://gitolite.kernel.org/pub/scm/linux/kernel/git/tip/tip:
x86/smp: Put CPUs into INIT on shutdown if possible
x86/smp: Split sending INIT IPI out into a helper function
x86/smp: Cure kexec() vs. mwait_play_dead() breakage
x86/smp: Use dedicated cache-line for mwait_play_dead()
x86/smp: Remove pointless wmb()s from native_stop_other_cpus()
x86/smp: Dont access non-existing CPUID leaf
x86/smp: Make stop_other_cpus() more robust
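The corrected CPUID check from the second bullet amounts to validating the
highest supported extended leaf before reading 0x8000001F at all. A minimal
sketch, with the helper name invented for illustration (the real check sits
in stop_this_cpu()):

#include <linux/bits.h>
#include <asm/processor.h>

static bool memory_encryption_supported(void)
{
	/* Leaf 0x80000000 EAX reports the highest supported extended leaf. */
	if (cpuid_eax(0x80000000) < 0x8000001f)
		return false;

	/* Leaf 0x8000001F EAX bit 0: Secure Memory Encryption supported. */
	return cpuid_eax(0x8000001f) & BIT(0);
}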
2023-06-27 05:45:53 +08:00
|
|
|
int num_starts, j, maxlvt;
|
2023-06-16 04:33:58 +08:00
|
|
|
|
2023-06-27 05:45:53 +08:00
|
|
|
preempt_disable();
|
|
|
|
maxlvt = lapic_get_maxlvt();
|
2023-06-16 04:33:58 +08:00
|
|
|
send_init_sequence(phys_apicid);
|
2008-03-20 01:25:59 +08:00
|
|
|
|
|
|
|
mb();
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Should we send STARTUP IPIs ?
|
|
|
|
*
|
|
|
|
* Determine this based on the APIC version.
|
|
|
|
* If we don't have an integrated APIC, don't send the STARTUP IPIs.
|
|
|
|
*/
|
2016-09-14 02:12:32 +08:00
|
|
|
if (APIC_INTEGRATED(boot_cpu_apic_version))
|
2008-03-20 01:25:59 +08:00
|
|
|
num_starts = 2;
|
|
|
|
else
|
|
|
|
num_starts = 0;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Run STARTUP IPI loop.
|
|
|
|
*/
|
2012-05-22 10:50:07 +08:00
|
|
|
pr_debug("#startup loops: %d\n", num_starts);
|
2008-03-20 01:25:59 +08:00
|
|
|
|
|
|
|
for (j = 1; j <= num_starts; j++) {
|
2012-05-22 10:50:07 +08:00
|
|
|
pr_debug("Sending STARTUP #%d\n", j);
|
x86: APIC: remove apic_write_around(); use alternatives
Use alternatives to select the workaround for the 11AP Pentium erratum
for the affected steppings on the fly rather than build time. Remove the
X86_GOOD_APIC configuration option and replace all the calls to
apic_write_around() with plain apic_write(), protecting accesses to the
ESR as appropriate due to the 3AP Pentium erratum. Remove
apic_read_around() and all its invocations altogether as not needed.
Remove apic_write_atomic() and all its implementing backends. The use of
ASM_OUTPUT2() is not strictly needed for input constraints, but I have
used it for readability's sake.
I had the feeling no one else was brave enough to do it, so I went ahead
and here it is. Verified by checking the generated assembly and tested
with both a 32-bit and a 64-bit configuration, also with the 11AP
"feature" forced on and verified with gdb on /proc/kcore to work as
expected (as 11AP machines are quite hard to get hold of these days).
Some script complained about the use of "volatile", but apic_write() needs
it for the same reason and is effectively a replacement for writel(), so I
have disregarded it.
I am not sure what the policy wrt defconfig files is, they are generated
and there is risk of a conflict resulting from an unrelated change, so I
have left changes to them out. The option will get removed from them at
the next run.
Some testing with machines other than mine will be needed to avoid some
stupid mistake, but despite its volume, the change is not really that
intrusive, so I am fairly confident that because it works for me, it will
work everywhere.
Signed-off-by: Maciej W. Rozycki <macro@linux-mips.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
2008-07-17 02:15:30 +08:00
|
|
|
if (maxlvt > 3) /* Due to the Pentium erratum 3AP. */
|
|
|
|
apic_write(APIC_ESR, 0);
|
2008-03-20 01:25:59 +08:00
|
|
|
apic_read(APIC_ESR);
|
2012-05-22 10:50:07 +08:00
|
|
|
pr_debug("After apic_write\n");
|
2008-03-20 01:25:59 +08:00
|
|
|
|
|
|
|
/*
|
|
|
|
* STARTUP IPI
|
|
|
|
*/
|
|
|
|
|
|
|
|
/* Target chip */
|
|
|
|
/* Boot on the stack */
|
|
|
|
/* Kick the second */
|
2008-07-11 02:16:49 +08:00
|
|
|
apic_icr_write(APIC_DM_STARTUP | (start_eip >> 12),
|
|
|
|
phys_apicid);
|
2008-03-20 01:25:59 +08:00
|
|
|
|
|
|
|
/*
|
|
|
|
* Give the other CPU some time to accept the IPI.
|
|
|
|
*/
|
2015-10-16 12:14:29 +08:00
|
|
|
if (init_udelay == 0)
|
|
|
|
udelay(10);
|
|
|
|
else
|
2015-08-16 23:45:47 +08:00
|
|
|
udelay(300);
|
2008-03-20 01:25:59 +08:00
|
|
|
|
2012-05-22 10:50:07 +08:00
|
|
|
pr_debug("Startup point 1\n");
|
2008-03-20 01:25:59 +08:00
|
|
|
|
2008-07-22 03:35:38 +08:00
|
|
|
pr_debug("Waiting for send to finish...\n");
|
2008-03-20 01:25:59 +08:00
|
|
|
send_status = safe_apic_wait_icr_idle();
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Give the other CPU some time to accept the IPI.
|
|
|
|
*/
|
2015-10-16 12:14:29 +08:00
|
|
|
if (init_udelay == 0)
|
|
|
|
udelay(10);
|
|
|
|
else
|
2015-08-16 23:45:47 +08:00
|
|
|
udelay(200);
|
2008-03-20 01:25:59 +08:00
|
|
|
|
2008-07-17 02:15:30 +08:00
|
|
|
if (maxlvt > 3) /* Due to the Pentium erratum 3AP. */
|
2008-03-20 01:25:59 +08:00
|
|
|
apic_write(APIC_ESR, 0);
|
|
|
|
accept_status = (apic_read(APIC_ESR) & 0xEF);
|
|
|
|
if (send_status || accept_status)
|
|
|
|
break;
|
|
|
|
}
|
2012-05-22 10:50:07 +08:00
|
|
|
pr_debug("After Startup\n");
|
2008-03-20 01:25:59 +08:00
|
|
|
|
|
|
|
if (send_status)
|
2012-05-22 10:50:07 +08:00
|
|
|
pr_err("APIC never delivered???\n");
|
2008-03-20 01:25:59 +08:00
|
|
|
if (accept_status)
|
2012-05-22 10:50:07 +08:00
|
|
|
pr_err("APIC delivery error (%lx)\n", accept_status);
|
2008-03-20 01:25:59 +08:00
|
|
|
|
2023-05-13 05:07:06 +08:00
|
|
|
preempt_enable();
|
2008-03-20 01:25:59 +08:00
|
|
|
return (send_status | accept_status);
|
|
|
|
}
|
|
|
|
|
2009-12-11 09:19:36 +08:00
|
|
|
/* reduce the number of lines printed when booting a large cpu count system */
|
x86: delete __cpuinit usage from all x86 files
The __cpuinit type of throwaway sections might have made sense
some time ago when RAM was more constrained, but now the savings
do not offset the cost and complications. For example, the fix in
commit 5e427ec2d0 ("x86: Fix bit corruption at CPU resume time")
is a good example of the nasty type of bugs that can be created
with improper use of the various __init prefixes.
After a discussion on LKML[1] it was decided that cpuinit should go
the way of devinit and be phased out. Once all the users are gone,
we can then finally remove the macros themselves from linux/init.h.
Note that some harmless section mismatch warnings may result, since
notify_cpu_starting() and cpu_up() are arch independent (kernel/cpu.c) and
are flagged as __cpuinit -- so if we remove the __cpuinit from
arch specific callers, we will also get section mismatch warnings.
As an intermediate step, we intend to turn the linux/init.h cpuinit
content into no-ops as early as possible, since that will get rid
of these warnings. In any case, they are temporary and harmless.
This removes all the arch/x86 uses of the __cpuinit macros from
all C files. x86 only had the one __CPUINIT used in assembly files,
and it wasn't paired off with a .previous or a __FINIT, so we can
delete it directly w/o any corresponding additional change there.
[1] https://lkml.org/lkml/2013/5/20/589
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: x86@kernel.org
Acked-by: Ingo Molnar <mingo@kernel.org>
Acked-by: Thomas Gleixner <tglx@linutronix.de>
Acked-by: H. Peter Anvin <hpa@linux.intel.com>
Signed-off-by: Paul Gortmaker <paul.gortmaker@windriver.com>
2013-06-19 06:23:59 +08:00
|
|
|
static void announce_cpu(int cpu, int apicid)
|
2009-12-11 09:19:36 +08:00
|
|
|
{
|
2023-05-13 05:07:56 +08:00
|
|
|
static int width, node_width, first = 1;
|
2019-03-06 07:42:58 +08:00
|
|
|
static int current_node = NUMA_NO_NODE;
|
2010-06-02 03:04:55 +08:00
|
|
|
int node = early_cpu_to_node(cpu);
|
x86: Improve the printout of the SMP bootup CPU table
As the new x86 CPU bootup printout format code maintainer, I am
taking immediate action to improve and clean (and thus indulge
my OCD) the reporting of the cores when coming up online.
Fix padding to a right-hand alignment, cleanup code and bind
reporting width to the max number of supported CPUs on the
system, like this:
[ 0.074509] smpboot: Booting Node 0, Processors: #1 #2 #3 #4 #5 #6 #7 OK
[ 0.644008] smpboot: Booting Node 1, Processors: #8 #9 #10 #11 #12 #13 #14 #15 OK
[ 1.245006] smpboot: Booting Node 2, Processors: #16 #17 #18 #19 #20 #21 #22 #23 OK
[ 1.864005] smpboot: Booting Node 3, Processors: #24 #25 #26 #27 #28 #29 #30 #31 OK
[ 2.489005] smpboot: Booting Node 4, Processors: #32 #33 #34 #35 #36 #37 #38 #39 OK
[ 3.093005] smpboot: Booting Node 5, Processors: #40 #41 #42 #43 #44 #45 #46 #47 OK
[ 3.698005] smpboot: Booting Node 6, Processors: #48 #49 #50 #51 #52 #53 #54 #55 OK
[ 4.304005] smpboot: Booting Node 7, Processors: #56 #57 #58 #59 #60 #61 #62 #63 OK
[ 4.961413] Brought up 64 CPUs
and this:
[ 0.072367] smpboot: Booting Node 0, Processors: #1 #2 #3 #4 #5 #6 #7 OK
[ 0.686329] Brought up 8 CPUs
Signed-off-by: Borislav Petkov <bp@suse.de>
Cc: Libin <huawei.libin@huawei.com>
Cc: wangyijing@huawei.com
Cc: fenghua.yu@intel.com
Cc: guohanjun@huawei.com
Cc: paul.gortmaker@windriver.com
Link: http://lkml.kernel.org/r/20130927143554.GF4422@pd.tnic
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2013-09-27 22:35:54 +08:00
|
|
|
|
|
|
|
if (!width)
|
|
|
|
width = num_digits(num_possible_cpus()) + 1; /* + '#' sign */
|
2009-12-11 09:19:36 +08:00
|
|
|
|
x86/boot: Further compress CPUs bootup message
Turn it into (for example):
[ 0.073380] x86: Booting SMP configuration:
[ 0.074005] .... node #0, CPUs: #1 #2 #3 #4 #5 #6 #7
[ 0.603005] .... node #1, CPUs: #8 #9 #10 #11 #12 #13 #14 #15
[ 1.200005] .... node #2, CPUs: #16 #17 #18 #19 #20 #21 #22 #23
[ 1.796005] .... node #3, CPUs: #24 #25 #26 #27 #28 #29 #30 #31
[ 2.393005] .... node #4, CPUs: #32 #33 #34 #35 #36 #37 #38 #39
[ 2.996005] .... node #5, CPUs: #40 #41 #42 #43 #44 #45 #46 #47
[ 3.600005] .... node #6, CPUs: #48 #49 #50 #51 #52 #53 #54 #55
[ 4.202005] .... node #7, CPUs: #56 #57 #58 #59 #60 #61 #62 #63
[ 4.811005] .... node #8, CPUs: #64 #65 #66 #67 #68 #69 #70 #71
[ 5.421006] .... node #9, CPUs: #72 #73 #74 #75 #76 #77 #78 #79
[ 6.032005] .... node #10, CPUs: #80 #81 #82 #83 #84 #85 #86 #87
[ 6.648006] .... node #11, CPUs: #88 #89 #90 #91 #92 #93 #94 #95
[ 7.262005] .... node #12, CPUs: #96 #97 #98 #99 #100 #101 #102 #103
[ 7.865005] .... node #13, CPUs: #104 #105 #106 #107 #108 #109 #110 #111
[ 8.466005] .... node #14, CPUs: #112 #113 #114 #115 #116 #117 #118 #119
[ 9.073006] .... node #15, CPUs: #120 #121 #122 #123 #124 #125 #126 #127
[ 9.679901] x86: Booted up 16 nodes, 128 CPUs
and drop useless elements.
Change num_digits() to hpa's division-avoiding, cell-phone-typed
version which he went at great lengths and pains to submit on a
Saturday evening.
Signed-off-by: Borislav Petkov <bp@suse.de>
Cc: huawei.libin@huawei.com
Cc: wangyijing@huawei.com
Cc: fenghua.yu@intel.com
Cc: guohanjun@huawei.com
Cc: paul.gortmaker@windriver.com
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/20130930095624.GB16383@pd.tnic
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2013-09-30 17:56:24 +08:00
|
|
|
if (!node_width)
|
|
|
|
node_width = num_digits(num_possible_nodes()) + 1; /* + '#' */
|
|
|
|
|
2017-05-17 02:42:35 +08:00
|
|
|
if (system_state < SYSTEM_RUNNING) {
|
2023-05-13 05:07:56 +08:00
|
|
|
if (first)
|
|
|
|
pr_info("x86: Booting SMP configuration:\n");
|
|
|
|
|
2009-12-11 09:19:36 +08:00
|
|
|
if (node != current_node) {
|
|
|
|
if (current_node > (-1))
|
2013-09-30 17:56:24 +08:00
|
|
|
pr_cont("\n");
|
2009-12-11 09:19:36 +08:00
|
|
|
current_node = node;
|
2013-09-30 17:56:24 +08:00
|
|
|
|
|
|
|
printk(KERN_INFO ".... node %*s#%d, CPUs: ",
|
|
|
|
node_width - num_digits(node), " ", node);
|
2009-12-11 09:19:36 +08:00
|
|
|
}
|
2013-09-27 22:35:54 +08:00
|
|
|
|
|
|
|
/* Add padding for the BSP */
|
2023-05-13 05:07:56 +08:00
|
|
|
if (first)
|
2013-09-27 22:35:54 +08:00
|
|
|
pr_cont("%*s", width + 1, " ");
|
2023-05-13 05:07:56 +08:00
|
|
|
first = 0;
|
2013-09-27 22:35:54 +08:00
|
|
|
|
|
|
|
pr_cont("%*s#%d", width - num_digits(cpu), " ", cpu);
|
2009-12-11 09:19:36 +08:00
|
|
|
} else
|
|
|
|
pr_info("Booting Node %d Processor %d APIC 0x%x\n",
|
|
|
|
node, cpu, apicid);
|
|
|
|
}
|
|
|
|
|
2019-04-15 00:00:04 +08:00
|
|
|
int common_cpu_up(unsigned int cpu, struct task_struct *idle)
|
2015-04-01 22:12:14 +08:00
|
|
|
{
|
2019-04-15 00:00:04 +08:00
|
|
|
int ret;
|
|
|
|
|
2015-04-01 22:12:14 +08:00
|
|
|
/* Just in case we booted with a single CPU. */
|
|
|
|
alternatives_enable_smp();
|
|
|
|
|
2022-09-15 19:11:01 +08:00
|
|
|
per_cpu(pcpu_hot.current_task, cpu) = idle;
|
2020-06-18 06:56:24 +08:00
|
|
|
cpu_init_stack_canary(cpu, idle);
|
2015-04-01 22:12:14 +08:00
|
|
|
|
2019-04-15 00:00:04 +08:00
|
|
|
/* Initialize the interrupt stack(s) */
|
|
|
|
ret = irq_init_percpu_irqstack(cpu);
|
|
|
|
if (ret)
|
|
|
|
return ret;
|
|
|
|
|
2015-04-01 22:12:14 +08:00
|
|
|
#ifdef CONFIG_X86_32
|
|
|
|
/* Stack for startup_32 can be just as for start_secondary onwards */
|
2022-09-15 19:11:04 +08:00
|
|
|
per_cpu(pcpu_hot.top_of_stack, cpu) = task_top_of_stack(idle);
|
2015-04-01 22:12:14 +08:00
|
|
|
#endif
|
2019-04-15 00:00:04 +08:00
|
|
|
return 0;
|
2015-04-01 22:12:14 +08:00
|
|
|
}
|
|
|
|
|
2008-03-20 01:25:59 +08:00
|
|
|
/*
|
|
|
|
* NOTE - on most systems this is a PHYSICAL apic ID, but on multiquad
|
|
|
|
* (i.e. clustered apic addressing mode), this is a LOGICAL apic ID.
|
2023-05-13 05:07:11 +08:00
|
|
|
* Returns zero if startup was successfully sent, else error code from
|
2009-02-26 20:51:40 +08:00
|
|
|
* ->wakeup_secondary_cpu.
|
2008-03-20 01:25:59 +08:00
|
|
|
*/
|
2023-05-13 05:07:06 +08:00
|
|
|
static int do_boot_cpu(int apicid, int cpu, struct task_struct *idle)
|
2008-03-20 01:25:59 +08:00
|
|
|
{
|
2012-05-09 02:22:43 +08:00
|
|
|
unsigned long start_ip = real_mode_header->trampoline_start;
|
2023-05-13 05:07:29 +08:00
|
|
|
int ret;
|
2008-03-20 01:25:59 +08:00
|
|
|
|
2022-04-06 07:29:29 +08:00
|
|
|
#ifdef CONFIG_X86_64
|
|
|
|
/* If 64-bit wakeup method exists, use the 64-bit mode trampoline IP */
|
|
|
|
if (apic->wakeup_secondary_cpu_64)
|
|
|
|
start_ip = real_mode_header->trampoline_start64;
|
|
|
|
#endif
|
2016-09-22 05:04:03 +08:00
|
|
|
idle->thread.sp = (unsigned long)task_pt_regs(idle);
|
2008-05-29 00:01:54 +08:00
|
|
|
initial_code = (unsigned long)start_secondary;
|
2023-03-17 06:21:03 +08:00
|
|
|
|
|
|
|
if (IS_ENABLED(CONFIG_X86_32)) {
|
2023-03-17 06:21:04 +08:00
|
|
|
early_gdt_descr.address = (unsigned long)get_cpu_gdt_rw(cpu);
|
2023-03-17 06:21:03 +08:00
|
|
|
initial_stack = idle->thread.sp;
|
2023-05-13 05:07:55 +08:00
|
|
|
} else if (!(smpboot_control & STARTUP_PARALLEL_MASK)) {
|
2023-03-17 06:21:03 +08:00
|
|
|
smpboot_control = cpu;
|
|
|
|
}
|
2008-03-20 01:25:59 +08:00
|
|
|
|
2017-12-17 17:56:29 +08:00
|
|
|
/* Enable the espfix hack for this CPU */
|
2015-07-03 17:37:19 +08:00
|
|
|
init_espfix_ap(cpu);
|
|
|
|
|
2009-12-11 09:19:36 +08:00
|
|
|
/* So we see what's up */
|
|
|
|
announce_cpu(cpu, apicid);
|
2008-03-20 01:25:59 +08:00
|
|
|
|
|
|
|
/*
|
|
|
|
* This grunge runs the startup process for
|
|
|
|
* the targeted processor.
|
|
|
|
*/
|
2017-11-27 16:11:44 +08:00
|
|
|
if (x86_platform.legacy.warm_reset) {
|
2008-03-20 01:25:59 +08:00
|
|
|
|
2008-07-22 03:35:38 +08:00
|
|
|
pr_debug("Setting warm reset code and vector.\n");
|
2008-03-20 01:25:59 +08:00
|
|
|
|
2008-04-17 00:45:15 +08:00
|
|
|
smpboot_setup_warm_reset_vector(start_ip);
|
|
|
|
/*
|
|
|
|
* Be paranoid about clearing APIC errors.
|
2008-10-22 22:00:09 +08:00
|
|
|
*/
|
2016-09-14 02:12:32 +08:00
|
|
|
if (APIC_INTEGRATED(boot_cpu_apic_version)) {
|
2008-10-22 22:00:09 +08:00
|
|
|
apic_write(APIC_ESR, 0);
|
|
|
|
apic_read(APIC_ESR);
|
|
|
|
}
|
2008-04-17 00:45:15 +08:00
|
|
|
}
|
2008-03-20 01:25:59 +08:00
|
|
|
|
2014-06-20 20:23:11 +08:00
|
|
|
smp_mb();
|
|
|
|
|
2008-03-20 01:25:59 +08:00
|
|
|
/*
|
2012-11-14 20:36:53 +08:00
|
|
|
* Wake up a CPU in different cases:
|
2022-04-06 07:29:29 +08:00
|
|
|
* - Use a method from the APIC driver if one defined, with wakeup
|
|
|
|
* straight to 64-bit mode preferred over wakeup to RM.
|
2012-11-14 20:36:53 +08:00
|
|
|
* Otherwise,
|
2023-05-13 05:07:06 +08:00
|
|
|
* - Use an INIT boot APIC message
|
2008-03-20 01:25:59 +08:00
|
|
|
*/
|
2022-04-06 07:29:29 +08:00
|
|
|
if (apic->wakeup_secondary_cpu_64)
|
2023-05-13 05:07:29 +08:00
|
|
|
ret = apic->wakeup_secondary_cpu_64(apicid, start_ip);
|
2022-04-06 07:29:29 +08:00
|
|
|
else if (apic->wakeup_secondary_cpu)
|
2023-05-13 05:07:29 +08:00
|
|
|
ret = apic->wakeup_secondary_cpu(apicid, start_ip);
|
2009-02-26 20:51:40 +08:00
|
|
|
else
|
2023-05-13 05:07:29 +08:00
|
|
|
ret = wakeup_secondary_cpu_via_init(apicid, start_ip);
|
2012-11-14 20:36:53 +08:00
|
|
|
|
2023-05-13 05:07:29 +08:00
|
|
|
/* If the wakeup mechanism failed, cleanup the warm reset vector */
|
|
|
|
if (ret)
|
|
|
|
arch_cpuhp_cleanup_kick_cpu(cpu);
|
|
|
|
return ret;
|
2008-03-20 01:25:59 +08:00
|
|
|
}
|
|
|
|
|
2023-05-13 05:07:46 +08:00
|
|
|
int native_kick_ap(unsigned int cpu, struct task_struct *tidle)
|
2008-03-20 01:25:59 +08:00
|
|
|
{
|
2009-01-28 13:50:47 +08:00
|
|
|
int apicid = apic->cpu_present_to_apicid(cpu);
|
2023-05-13 05:07:06 +08:00
|
|
|
int err;
|
2008-03-20 01:25:59 +08:00
|
|
|
|
2017-11-06 23:01:23 +08:00
|
|
|
lockdep_assert_irqs_enabled();
|
2008-03-20 01:25:59 +08:00
|
|
|
|
2008-07-22 03:35:38 +08:00
|
|
|
pr_debug("++++++++++++++++++++=_---CPU UP %u\n", cpu);
|
2008-03-20 01:25:59 +08:00
|
|
|
|
2023-08-09 06:04:09 +08:00
|
|
|
if (apicid == BAD_APICID || !physid_isset(apicid, phys_cpu_present_map) ||
|
|
|
|
!apic_id_valid(apicid)) {
|
2012-05-22 10:50:07 +08:00
|
|
|
pr_err("%s: bad cpu %d\n", __func__, cpu);
|
2008-03-20 01:25:59 +08:00
|
|
|
return -EINVAL;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Save current MTRR state in case it was changed since early boot
|
|
|
|
* (e.g. by the ACPI SMI) to initialize new CPUs with MTRRs in sync:
|
|
|
|
*/
|
|
|
|
mtrr_save_state();
|
|
|
|
|
2012-12-01 04:15:32 +08:00
|
|
|
/* the FPU context is blank, nobody can own it */
|
2016-10-14 20:15:30 +08:00
|
|
|
per_cpu(fpu_fpregs_owner_ctx, cpu) = NULL;
|
2012-12-01 04:15:32 +08:00
|
|
|
|
2019-04-15 00:00:04 +08:00
|
|
|
err = common_cpu_up(cpu, tidle);
|
|
|
|
if (err)
|
|
|
|
return err;
|
2015-04-01 22:12:14 +08:00
|
|
|
|
2023-05-13 05:07:06 +08:00
|
|
|
err = do_boot_cpu(apicid, cpu, tidle);
|
2023-05-13 05:07:11 +08:00
|
|
|
if (err)
|
2014-06-05 21:42:44 +08:00
|
|
|
pr_err("do_boot_cpu failed(%d) to wakeup CPU#%u\n", err, cpu);
|
2008-03-20 01:25:59 +08:00
|
|
|
|
2023-05-13 05:07:11 +08:00
|
|
|
return err;
|
|
|
|
}
|
2008-03-20 01:25:59 +08:00
|
|
|
|
2023-05-13 05:07:46 +08:00
|
|
|
int arch_cpuhp_kick_ap_alive(unsigned int cpu, struct task_struct *tidle)
|
2023-05-13 05:07:11 +08:00
|
|
|
{
|
2023-05-13 05:07:46 +08:00
|
|
|
return smp_ops.kick_ap_alive(cpu, tidle);
|
2023-05-13 05:07:29 +08:00
|
|
|
}
|
2008-03-20 01:25:59 +08:00
|
|
|
|
2023-05-13 05:07:29 +08:00
|
|
|
void arch_cpuhp_cleanup_kick_cpu(unsigned int cpu)
|
|
|
|
{
|
2023-05-13 05:07:11 +08:00
|
|
|
/* Cleanup possible dangling ends... */
|
2023-05-13 05:07:46 +08:00
|
|
|
if (smp_ops.kick_ap_alive == native_kick_ap && x86_platform.legacy.warm_reset)
|
2023-05-13 05:07:11 +08:00
|
|
|
smpboot_restore_warm_reset_vector();
|
2023-05-13 05:07:29 +08:00
|
|
|
}
|
2017-08-03 18:58:18 +08:00
|
|
|
|
2023-05-13 05:07:29 +08:00
|
|
|
void arch_cpuhp_cleanup_dead_cpu(unsigned int cpu)
|
|
|
|
{
|
|
|
|
if (smp_ops.cleanup_dead_cpu)
|
|
|
|
smp_ops.cleanup_dead_cpu(cpu);
|
|
|
|
|
|
|
|
if (system_state == SYSTEM_RUNNING)
|
|
|
|
pr_info("CPU %u is now offline\n", cpu);
|
|
|
|
}
|
|
|
|
|
|
|
|
void arch_cpuhp_sync_state_poll(void)
|
|
|
|
{
|
|
|
|
if (smp_ops.poll_sync_state)
|
|
|
|
smp_ops.poll_sync_state();
|
2008-03-20 01:25:59 +08:00
|
|
|
}
|
|
|
|
|
2011-02-22 22:38:05 +08:00
|
|
|
/**
|
2023-05-13 05:07:00 +08:00
|
|
|
* arch_disable_smp_support() - Disables SMP support for x86 at boottime
|
2011-02-22 22:38:05 +08:00
|
|
|
*/
|
2023-05-13 05:07:00 +08:00
|
|
|
void __init arch_disable_smp_support(void)
|
2011-02-22 22:38:05 +08:00
|
|
|
{
|
|
|
|
disable_ioapic_support();
|
|
|
|
}
|
|
|
|
|
2008-03-20 01:26:11 +08:00
|
|
|
/*
|
|
|
|
* Fall back to non SMP mode after errors.
|
|
|
|
*
|
|
|
|
* RED-PEN audit/test this more. I bet there is more state messed up here.
|
|
|
|
*/
|
|
|
|
static __init void disable_smp(void)
|
|
|
|
{
|
2015-01-16 05:22:42 +08:00
|
|
|
pr_info("SMP disabled\n");
|
|
|
|
|
2015-01-16 05:22:35 +08:00
|
|
|
disable_ioapic_support();
|
|
|
|
|
2009-03-13 12:19:54 +08:00
|
|
|
init_cpu_present(cpumask_of(0));
|
|
|
|
init_cpu_possible(cpumask_of(0));
|
2008-05-29 08:09:53 +08:00
|
|
|
|
2008-03-20 01:26:11 +08:00
|
|
|
if (smp_found_config)
|
2008-06-20 10:51:05 +08:00
|
|
|
physid_set_mask_of_physid(boot_cpu_physical_apicid, &phys_cpu_present_map);
|
2008-03-20 01:26:11 +08:00
|
|
|
else
|
2008-06-20 10:51:05 +08:00
|
|
|
physid_set_mask_of_physid(0, &phys_cpu_present_map);
|
2015-05-26 21:11:35 +08:00
|
|
|
cpumask_set_cpu(0, topology_sibling_cpumask(0));
|
|
|
|
cpumask_set_cpu(0, topology_core_cpumask(0));
|
2019-05-14 01:58:56 +08:00
|
|
|
cpumask_set_cpu(0, topology_die_cpumask(0));
|
2008-03-20 01:26:11 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
static void __init smp_cpu_index_default(void)
|
|
|
|
{
|
|
|
|
int i;
|
|
|
|
struct cpuinfo_x86 *c;
|
|
|
|
|
2008-04-19 22:55:17 +08:00
|
|
|
for_each_possible_cpu(i) {
|
2008-03-20 01:26:11 +08:00
|
|
|
c = &cpu_data(i);
|
|
|
|
/* mark all to hotplug */
|
2009-01-01 10:08:46 +08:00
|
|
|
c->cpu_index = nr_cpu_ids;
|
2008-03-20 01:26:11 +08:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2021-11-03 07:36:36 +08:00
|
|
|
void __init smp_prepare_cpus_common(void)
|
2008-03-20 01:26:11 +08:00
|
|
|
{
|
2009-03-13 12:19:50 +08:00
|
|
|
unsigned int i;
|
|
|
|
|
2008-03-20 01:26:11 +08:00
|
|
|
smp_cpu_index_default();
|
2011-01-22 07:29:54 +08:00
|
|
|
|
2008-03-20 01:26:11 +08:00
|
|
|
/*
|
|
|
|
* Setup boot CPU information
|
|
|
|
*/
|
2012-11-14 03:32:41 +08:00
|
|
|
smp_store_boot_cpu_info(); /* Final full version of the data */
|
2011-01-22 07:29:54 +08:00
|
|
|
mb();
|
2011-01-23 21:37:27 +08:00
|
|
|
|
2009-03-13 12:19:50 +08:00
|
|
|
for_each_possible_cpu(i) {
|
2009-06-15 14:58:26 +08:00
|
|
|
zalloc_cpumask_var(&per_cpu(cpu_sibling_map, i), GFP_KERNEL);
|
|
|
|
zalloc_cpumask_var(&per_cpu(cpu_core_map, i), GFP_KERNEL);
|
2019-05-14 01:58:56 +08:00
|
|
|
zalloc_cpumask_var(&per_cpu(cpu_die_map, i), GFP_KERNEL);
|
2011-01-22 07:29:44 +08:00
|
|
|
zalloc_cpumask_var(&per_cpu(cpu_llc_shared_map, i), GFP_KERNEL);
|
2021-09-24 16:51:04 +08:00
|
|
|
zalloc_cpumask_var(&per_cpu(cpu_l2c_shared_map, i), GFP_KERNEL);
|
2009-03-13 12:19:50 +08:00
|
|
|
}
|
2016-09-22 03:19:03 +08:00
|
|
|
|
2008-03-20 01:26:11 +08:00
|
|
|
set_cpu_sibling_map(0);
|
2021-11-03 07:36:36 +08:00
|
|
|
}
|
|
|
|
|
2023-05-13 05:07:56 +08:00
|
|
|
#ifdef CONFIG_X86_64
|
|
|
|
/* Establish whether parallel bringup can be supported. */
|
|
|
|
bool __init arch_cpuhp_init_parallel_bringup(void)
|
|
|
|
{
|
2023-05-31 15:44:26 +08:00
|
|
|
if (!x86_cpuinit.parallel_bringup) {
|
|
|
|
pr_info("Parallel CPU startup disabled by the platform\n");
|
2023-05-13 05:07:56 +08:00
|
|
|
return false;
|
|
|
|
}
|
|
|
|
|
|
|
|
smpboot_control = STARTUP_READ_APICID;
|
|
|
|
pr_debug("Parallel CPU startup enabled: 0x%08x\n", smpboot_control);
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
#endif
|
|
|
|
|
2021-11-03 07:36:36 +08:00
|
|
|
/*
|
|
|
|
* Prepare for SMP bootup.
|
|
|
|
* @max_cpus: configured maximum number of CPUs. It is a legacy parameter
|
|
|
|
* for common interface support.
|
|
|
|
*/
|
|
|
|
void __init native_smp_prepare_cpus(unsigned int max_cpus)
|
|
|
|
{
|
|
|
|
smp_prepare_cpus_common();
|
|
|
|
|
2017-09-13 17:12:49 +08:00
|
|
|
switch (apic_intr_mode) {
|
|
|
|
case APIC_PIC:
|
|
|
|
case APIC_VIRTUAL_WIRE_NO_CONFIG:
|
2015-01-16 05:22:42 +08:00
|
|
|
disable_smp();
|
|
|
|
return;
|
2017-09-13 17:12:49 +08:00
|
|
|
case APIC_SYMMETRIC_IO_NO_ROUTING:
|
2015-01-16 05:22:42 +08:00
|
|
|
disable_smp();
|
2017-09-13 17:12:46 +08:00
|
|
|
/* Setup local timer */
|
|
|
|
x86_init.timers.setup_percpu_clockev();
|
2014-12-05 16:48:29 +08:00
|
|
|
return;
|
2017-09-13 17:12:49 +08:00
|
|
|
case APIC_VIRTUAL_WIRE:
|
|
|
|
case APIC_SYMMETRIC_IO:
|
2015-01-16 05:22:42 +08:00
|
|
|
break;
|
2008-03-20 01:26:11 +08:00
|
|
|
}
|
|
|
|
|
2017-09-13 17:12:46 +08:00
|
|
|
/* Setup local timer */
|
|
|
|
x86_init.timers.setup_percpu_clockev();
|
2008-03-20 01:26:11 +08:00
|
|
|
|
2016-10-25 01:38:42 +08:00
|
|
|
pr_info("CPU0: ");
|
2008-03-20 01:26:11 +08:00
|
|
|
print_cpu_info(&cpu_data(0));
|
2008-08-22 02:49:05 +08:00
|
|
|
|
2017-01-26 00:35:23 +08:00
|
|
|
uv_system_init();
|
2009-08-20 09:05:36 +08:00
|
|
|
|
2015-05-12 05:27:10 +08:00
|
|
|
smp_quirk_init_udelay();
|
2018-05-10 03:53:09 +08:00
|
|
|
|
|
|
|
speculative_store_bypass_ht_init();
|
2022-03-08 05:33:32 +08:00
|
|
|
|
|
|
|
snp_set_wakeup_secondary_cpu();
|
2008-03-20 01:26:11 +08:00
|
|
|
}
|
2009-08-20 09:05:36 +08:00
|
|
|
|
2020-04-30 19:40:03 +08:00
|
|
|
void arch_thaw_secondary_cpus_begin(void)
|
2009-08-20 09:05:36 +08:00
|
|
|
{
|
2022-11-02 15:47:08 +08:00
|
|
|
set_cache_aps_delayed_init(true);
|
2009-08-20 09:05:36 +08:00
|
|
|
}
|
|
|
|
|
2020-04-30 19:40:03 +08:00
|
|
|
void arch_thaw_secondary_cpus_end(void)
|
2009-08-20 09:05:36 +08:00
|
|
|
{
|
2022-11-02 15:47:09 +08:00
|
|
|
cache_aps_init();
|
2009-08-20 09:05:36 +08:00
|
|
|
}
|
|
|
|
|
2008-03-20 01:26:01 +08:00
|
|
|
/*
|
|
|
|
* Early setup to make printk work.
|
|
|
|
*/
|
|
|
|
void __init native_smp_prepare_boot_cpu(void)
|
|
|
|
{
|
|
|
|
int me = smp_processor_id();
|
2022-09-15 19:10:42 +08:00
|
|
|
|
|
|
|
/* SMP handles this from setup_per_cpu_areas() */
|
|
|
|
if (!IS_ENABLED(CONFIG_SMP))
|
|
|
|
switch_gdt_and_percpu_base(me);
|
|
|
|
|
2019-06-26 16:57:09 +08:00
|
|
|
native_pv_lock_init();
|
2008-03-20 01:26:01 +08:00
|
|
|
}
|
|
|
|
|
2018-02-08 07:49:23 +08:00
|
|
|
void __init calculate_max_logical_packages(void)
|
2008-03-20 01:26:02 +08:00
|
|
|
{
|
2017-11-14 20:42:57 +08:00
|
|
|
int ncpus;
|
|
|
|
|
|
|
|
/*
|
2021-03-18 22:28:01 +08:00
|
|
|
* Today neither Intel nor AMD support heterogeneous systems so
|
2017-11-14 20:42:57 +08:00
|
|
|
* extrapolate the boot cpu's data to all packages.
|
|
|
|
*/
|
2017-12-05 00:45:21 +08:00
|
|
|
ncpus = cpu_data(0).booted_cores * topology_max_smt_threads();
|
2018-11-07 10:36:43 +08:00
|
|
|
__max_logical_packages = DIV_ROUND_UP(total_cpus, ncpus);
|
2017-11-14 20:42:57 +08:00
|
|
|
pr_info("Max logical packages: %u\n", __max_logical_packages);
|
2018-02-08 07:49:23 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
void __init native_smp_cpus_done(unsigned int max_cpus)
|
|
|
|
{
|
|
|
|
pr_debug("Boot done\n");
|
|
|
|
|
|
|
|
calculate_max_logical_packages();
|
2023-06-02 00:00:25 +08:00
|
|
|
build_sched_topology();
|
2011-10-14 03:14:26 +08:00
|
|
|
nmi_selftest();
|
2008-03-20 01:26:02 +08:00
|
|
|
impress_friends();
|
2022-11-02 15:47:09 +08:00
|
|
|
cache_aps_init();
|
2008-03-20 01:26:02 +08:00
|
|
|
}
|
|
|
|
|
2008-12-18 07:21:39 +08:00
|
|
|
static int __initdata setup_possible_cpus = -1;
|
|
|
|
static int __init _setup_possible_cpus(char *str)
|
|
|
|
{
|
|
|
|
get_option(&str, &setup_possible_cpus);
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
early_param("possible_cpus", _setup_possible_cpus);
|
|
|
|
|
|
|
|
|
2008-03-04 01:12:42 +08:00
|
|
|
/*
|
2009-03-13 12:19:54 +08:00
|
|
|
* cpu_possible_mask should be static, it cannot change as cpu's
|
2008-03-04 01:12:42 +08:00
|
|
|
* are onlined, or offlined. The reason is per-cpu data-structures
|
2020-02-16 23:17:39 +08:00
|
|
|
* are allocated by some modules at init time, and don't expect to
|
2008-03-04 01:12:42 +08:00
|
|
|
* do this dynamically on cpu arrival/departure.
|
2009-03-13 12:19:54 +08:00
|
|
|
* cpu_present_mask on the other hand can change dynamically.
|
2008-03-04 01:12:42 +08:00
|
|
|
* In case when cpu_hotplug is not compiled, then we resort to current
|
|
|
|
* behaviour, which is cpu_possible == cpu_present.
|
|
|
|
* - Ashok Raj
|
|
|
|
*
|
|
|
|
* Three ways to find out the number of additional hotplug CPUs:
|
|
|
|
* - If the BIOS specified disabled CPUs in ACPI/mptables use that.
|
2008-12-18 07:21:39 +08:00
|
|
|
* - The user can overwrite it with possible_cpus=NUM
|
2008-03-04 01:12:42 +08:00
|
|
|
* - Otherwise don't reserve additional CPUs.
|
|
|
|
* We do this because additional CPUs waste a lot of memory.
|
|
|
|
* -AK
|
|
|
|
*/
|
|
|
|
__init void prefill_possible_map(void)
|
|
|
|
{
|
2008-10-05 23:51:52 +08:00
|
|
|
int i, possible;
|
2008-03-04 01:12:42 +08:00
|
|
|
|
2010-05-25 03:13:17 +08:00
|
|
|
i = setup_max_cpus ?: 1;
|
|
|
|
if (setup_possible_cpus == -1) {
|
|
|
|
possible = num_processors;
|
|
|
|
#ifdef CONFIG_HOTPLUG_CPU
|
|
|
|
if (setup_max_cpus)
|
|
|
|
possible += disabled_cpus;
|
|
|
|
#else
|
|
|
|
if (possible > i)
|
|
|
|
possible = i;
|
|
|
|
#endif
|
|
|
|
} else
|
2008-12-18 07:21:39 +08:00
|
|
|
possible = setup_possible_cpus;
|
|
|
|
|
2009-01-01 10:08:45 +08:00
|
|
|
total_cpus = max_t(int, possible, num_processors + disabled_cpus);
|
|
|
|
|
2010-02-10 17:20:37 +08:00
|
|
|
/* nr_cpu_ids could be reduced via nr_cpus= */
|
|
|
|
if (possible > nr_cpu_ids) {
|
2017-09-09 07:14:18 +08:00
|
|
|
pr_warn("%d Processors exceeds NR_CPUS limit of %u\n",
|
2010-02-10 17:20:37 +08:00
|
|
|
possible, nr_cpu_ids);
|
|
|
|
possible = nr_cpu_ids;
|
2008-12-18 07:21:39 +08:00
|
|
|
}
|
2008-03-04 01:12:42 +08:00
|
|
|
|
2010-05-25 03:13:17 +08:00
|
|
|
#ifdef CONFIG_HOTPLUG_CPU
|
|
|
|
if (!setup_max_cpus)
|
|
|
|
#endif
|
|
|
|
if (possible > i) {
|
2012-05-22 10:50:07 +08:00
|
|
|
pr_warn("%d Processors exceeds max_cpus limit of %u\n",
|
2010-05-25 03:13:17 +08:00
|
|
|
possible, setup_max_cpus);
|
|
|
|
possible = i;
|
|
|
|
}
|
|
|
|
|
2022-09-06 07:08:17 +08:00
|
|
|
set_nr_cpu_ids(possible);
|
2016-12-14 02:32:28 +08:00
|
|
|
|
2012-05-22 10:50:07 +08:00
|
|
|
pr_info("Allowing %d CPUs, %d hotplug CPUs\n",
|
2008-03-04 01:12:42 +08:00
|
|
|
possible, max_t(int, possible - num_processors, 0));
|
|
|
|
|
2016-12-14 02:32:28 +08:00
|
|
|
reset_cpu_possible_mask();
|
|
|
|
|
2008-03-04 01:12:42 +08:00
|
|
|
for (i = 0; i < possible; i++)
|
2009-01-04 21:18:03 +08:00
|
|
|
set_cpu_possible(i, true);
|
2008-03-04 01:12:42 +08:00
|
|
|
}
|
2008-03-04 01:13:07 +08:00
|
|
|
|
2023-05-13 05:07:16 +08:00
|
|
|
/* correctly size the local cpu masks */
|
|
|
|
void __init setup_cpu_local_masks(void)
|
|
|
|
{
|
|
|
|
alloc_bootmem_cpumask_var(&cpu_sibling_setup_mask);
|
|
|
|
}
|
|
|
|
|
2008-09-30 06:29:42 +08:00
|
|
|
#ifdef CONFIG_HOTPLUG_CPU
|
|
|
|
|
2016-05-20 08:09:55 +08:00
|
|
|
/* Recompute SMT state for all CPUs on offline */
|
|
|
|
static void recompute_smt_state(void)
|
|
|
|
{
|
|
|
|
int max_threads, cpu;
|
|
|
|
|
|
|
|
max_threads = 0;
|
|
|
|
for_each_online_cpu (cpu) {
|
|
|
|
int threads = cpumask_weight(topology_sibling_cpumask(cpu));
|
|
|
|
|
|
|
|
if (threads > max_threads)
|
|
|
|
max_threads = threads;
|
|
|
|
}
|
|
|
|
__max_smt_threads = max_threads;
|
|
|
|
}
|
|
|
|
|
2008-09-30 06:29:42 +08:00
|
|
|
static void remove_siblinginfo(int cpu)
|
|
|
|
{
|
|
|
|
int sibling;
|
|
|
|
struct cpuinfo_x86 *c = &cpu_data(cpu);
|
|
|
|
|
2015-05-26 21:11:35 +08:00
|
|
|
for_each_cpu(sibling, topology_core_cpumask(cpu)) {
|
|
|
|
cpumask_clear_cpu(cpu, topology_core_cpumask(sibling));
|
2008-09-30 06:29:42 +08:00
|
|
|
/*
|
|
|
|
* last thread sibling in this cpu core going down
|
|
|
|
*/
|
2015-05-26 21:11:35 +08:00
|
|
|
if (cpumask_weight(topology_sibling_cpumask(cpu)) == 1)
|
2008-09-30 06:29:42 +08:00
|
|
|
cpu_data(sibling).booted_cores--;
|
|
|
|
}
|
|
|
|
|
2019-05-14 01:58:56 +08:00
|
|
|
for_each_cpu(sibling, topology_die_cpumask(cpu))
|
|
|
|
cpumask_clear_cpu(cpu, topology_die_cpumask(sibling));
|
2021-01-08 20:10:52 +08:00
|
|
|
|
|
|
|
for_each_cpu(sibling, topology_sibling_cpumask(cpu)) {
|
2015-05-26 21:11:35 +08:00
|
|
|
cpumask_clear_cpu(cpu, topology_sibling_cpumask(sibling));
|
2021-01-08 20:10:52 +08:00
|
|
|
if (cpumask_weight(topology_sibling_cpumask(sibling)) == 1)
|
|
|
|
cpu_data(sibling).smt_active = false;
|
|
|
|
}
|
|
|
|
|
sched: Fix unreleased llc_shared_mask bit during CPU hotplug
The following bug can be triggered by hot adding and removing a large number of
xen domain0's vcpus repeatedly:
BUG: unable to handle kernel NULL pointer dereference at 0000000000000004 IP: [..] find_busiest_group
PGD 5a9d5067 PUD 13067 PMD 0
Oops: 0000 [#3] SMP
[...]
Call Trace:
load_balance
? _raw_spin_unlock_irqrestore
idle_balance
__schedule
schedule
schedule_timeout
? lock_timer_base
schedule_timeout_uninterruptible
msleep
lock_device_hotplug_sysfs
online_store
dev_attr_store
sysfs_write_file
vfs_write
SyS_write
system_call_fastpath
Last level cache shared mask is built during CPU up and the
build_sched_domain() routine takes advantage of it to setup
the sched domain CPU topology.
However, llc_shared_mask is not released during CPU disable,
which leads to an invalid sched domain CPU topology.
This patch fixes it by releasing the llc_shared_mask correctly
during CPU disable.
Yasuaki also reported that this can happen on real hardware:
https://lkml.org/lkml/2014/7/22/1018
His case is here:
==
Here is an example on my system.
My system has 4 sockets and each socket has 15 cores and HT is
enabled. In this case, each core of sockes is numbered as
follows:
| CPU#
Socket#0 | 0-14 , 60-74
Socket#1 | 15-29, 75-89
Socket#2 | 30-44, 90-104
Socket#3 | 45-59, 105-119
Then llc_shared_mask of CPU#30 has 0x3fff80000001fffc0000000.
It means that last level cache of Socket#2 is shared with
CPU#30-44 and 90-104.
When hot-removing socket#2 and #3, each core of sockets is
numbered as follows:
| CPU#
Socket#0 | 0-14 , 60-74
Socket#1 | 15-29, 75-89
But llc_shared_mask is not cleared. So llc_shared_mask of CPU#30
remains having 0x3fff80000001fffc0000000.
After that, when hot-adding socket#2 and #3, each core of
sockets is numbered as follows:
| CPU#
Socket#0 | 0-14 , 60-74
Socket#1 | 15-29, 75-89
Socket#2 | 30-59
Socket#3 | 90-119
Then llc_shared_mask of CPU#30 becomes
0x3fff8000fffffffc0000000. It means that last level cache of
Socket#2 is shared with CPU#30-59 and 90-104. So the mask has
the wrong value.
Signed-off-by: Wanpeng Li <wanpeng.li@linux.intel.com>
Tested-by: Linn Crosetto <linn@hp.com>
Reviewed-by: Borislav Petkov <bp@suse.de>
Reviewed-by: Toshi Kani <toshi.kani@hp.com>
Reviewed-by: Yasuaki Ishimatsu <isimatu.yasuaki@jp.fujitsu.com>
Cc: <stable@vger.kernel.org>
Cc: David Rientjes <rientjes@google.com>
Cc: Prarit Bhargava <prarit@redhat.com>
Cc: Steven Rostedt <srostedt@redhat.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Link: http://lkml.kernel.org/r/1411547885-48165-1-git-send-email-wanpeng.li@linux.intel.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2014-09-24 16:38:05 +08:00
|
|
|
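	/*
	 * Also drop the CPU from the last-level and L2 cache sharing masks.
	 * Stale bits left behind here have been seen to produce a bogus
	 * sched-domain topology after a subsequent CPU hot-add (the
	 * llc_shared_mask hotplug fix).
	 */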
	for_each_cpu(sibling, cpu_llc_shared_mask(cpu))
		cpumask_clear_cpu(cpu, cpu_llc_shared_mask(sibling));
	for_each_cpu(sibling, cpu_l2c_shared_mask(cpu))
		cpumask_clear_cpu(cpu, cpu_l2c_shared_mask(sibling));
	cpumask_clear(cpu_llc_shared_mask(cpu));
	cpumask_clear(cpu_l2c_shared_mask(cpu));
	cpumask_clear(topology_sibling_cpumask(cpu));
	cpumask_clear(topology_core_cpumask(cpu));
	cpumask_clear(topology_die_cpumask(cpu));
	c->cpu_core_id = 0;
	c->booted_cores = 0;
	cpumask_clear_cpu(cpu, cpu_sibling_setup_mask);
	recompute_smt_state();
}

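/*
 * Remove the CPU from the global bookkeeping maps. Called from
 * cpu_disable_common() with vector_lock held.
 */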
static void remove_cpu_from_maps(int cpu)
{
	set_cpu_online(cpu, false);
	numa_remove_cpu(cpu);
}

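/*
 * Common teardown for a CPU that is being disabled: unlink it from the
 * topology masks, take it out of the online and NUMA maps under
 * vector_lock, migrate its interrupts away via fixup_irqs() and let the
 * local APIC code account for the CPU going away.
 */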
void cpu_disable_common(void)
{
	int cpu = smp_processor_id();

	remove_siblinginfo(cpu);

	/* It's now safe to remove this processor from the online map */
	lock_vector_lock();
	remove_cpu_from_maps(cpu);
	unlock_vector_lock();
	fixup_irqs();
	lapic_offline();
}

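/*
 * Native implementation of the CPU-disable step of hot-unplug. Fails the
 * unplug if the local APIC code reports that the CPU cannot go away, e.g.
 * when the remaining CPUs do not have enough free interrupt vectors to
 * absorb this CPU's interrupts.
 */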
int native_cpu_disable(void)
{
	int ret;

	ret = lapic_can_unplug_cpu();
	if (ret)
		return ret;

	cpu_disable_common();

	/*
	 * Disable the local APIC. Otherwise IPI broadcasts will reach
	 * it. It still responds normally to INIT, NMI, SMI, and SIPI
	 * messages.
	 *
	 * Disabling the APIC must happen after cpu_disable_common()
	 * which invokes fixup_irqs().
	 *
	 * Disabling the APIC preserves already set bits in IRR, but
	 * an interrupt arriving after disabling the local APIC does not
	 * set the corresponding IRR bit.
	 *
	 * fixup_irqs() scans IRR for set bits so it can raise a not
	 * yet handled interrupt on the new destination CPU via an IPI
	 * but obviously it can't do so for IRR bits which are not set.
	 * IOW, interrupts arriving after disabling the local APIC will
	 * be lost.
	 */
	apic_soft_disable();

	return 0;
}

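/*
 * Common prologue for a CPU that is about to die: drop out of the idle
 * task, report the CPU as dead to the hotplug core and disable interrupts
 * before entering the low-power "dead" loop.
 */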
void play_dead_common(void)
{
	idle_task_exit();

	cpuhp_ap_report_dead();

	local_irq_disable();
}

/*
 * We need to flush the caches before going to sleep, lest we have
 * dirty data in our caches when we come back up.
 */
static inline void mwait_play_dead(void)
{
	struct mwait_cpu_dead *md = this_cpu_ptr(&mwait_cpu_dead);
	unsigned int eax, ebx, ecx, edx;
	unsigned int highest_cstate = 0;
	unsigned int highest_subcstate = 0;
	int i;

	if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD ||
	    boot_cpu_data.x86_vendor == X86_VENDOR_HYGON)
		return;
	if (!this_cpu_has(X86_FEATURE_MWAIT))
		return;
	if (!this_cpu_has(X86_FEATURE_CLFLUSH))
		return;
	if (__this_cpu_read(cpu_info.cpuid_level) < CPUID_MWAIT_LEAF)
		return;

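	/*
	 * Query the MONITOR/MWAIT CPUID leaf to find the deepest C-state and
	 * sub-state this CPU advertises; the result is encoded below as the
	 * MWAIT hint in EAX.
	 */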
	eax = CPUID_MWAIT_LEAF;
	ecx = 0;
	native_cpuid(&eax, &ebx, &ecx, &edx);

	/*
	 * eax will be 0 if EDX enumeration is not valid.
	 * Initialized below to cstate, sub_cstate value when EDX is valid.
	 */
	if (!(ecx & CPUID5_ECX_EXTENSIONS_SUPPORTED)) {
		eax = 0;
	} else {
		edx >>= MWAIT_SUBSTATE_SIZE;
		for (i = 0; i < 7 && edx; i++, edx >>= MWAIT_SUBSTATE_SIZE) {
			if (edx & MWAIT_SUBSTATE_MASK) {
				highest_cstate = i;
				highest_subcstate = edx & MWAIT_SUBSTATE_MASK;
			}
		}
		eax = (highest_cstate << MWAIT_SUBSTATE_SIZE) |
			(highest_subcstate - 1);
	}

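	/*
	 * For reference, assuming MWAIT_SUBSTATE_SIZE is 4 bits:
	 * highest_cstate = 3 and highest_subcstate = 2 encode above to the
	 * hint (3 << 4) | (2 - 1) = 0x31.
	 */
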
	/* Set up state for the kexec() hack below */
	md->status = CPUDEAD_MWAIT_WAIT;
	md->control = CPUDEAD_MWAIT_WAIT;

	wbinvd();

	while (1) {
		/*
		 * The CLFLUSH is a workaround for erratum AAI65 for
		 * the Xeon 7400 series. It's not clear it is actually
		 * needed, but it should be harmless in either case.
		 * The WBINVD is insufficient due to the spurious-wakeup
		 * case where we return around the loop.
		 */
		mb();
		clflush(md);
		mb();
		__monitor(md, 0, 0);
		mb();
		__mwait(eax, 0);

		if (READ_ONCE(md->control) == CPUDEAD_MWAIT_KEXEC_HLT) {
			/*
			 * Kexec is about to happen. Don't go back into mwait() as
			 * the kexec kernel might overwrite text and data including
			 * page tables and stack. So mwait() would resume when the
			 * monitor cache line is written to and then the CPU goes
			 * south due to overwritten text, page tables and stack.
			 *
			 * Note: This does _NOT_ protect against a stray MCE, NMI,
			 * SMI. They will resume execution at the instruction
			 * following the HLT instruction and run into the problem
			 * which this is trying to prevent.
			 */
			WRITE_ONCE(md->status, CPUDEAD_MWAIT_KEXEC_HLT);
			while (1)
				native_halt();
		}
	}
}

/*
 * Kick all "offline" CPUs out of mwait on kexec(). See comment in
 * mwait_play_dead().
 */
void smp_kick_mwait_play_dead(void)
{
	u32 newstate = CPUDEAD_MWAIT_KEXEC_HLT;
	struct mwait_cpu_dead *md;
	unsigned int cpu, i;

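	/*
	 * For every present but offline CPU: if it parked itself in
	 * mwait_play_dead(), poke its monitored cache line until it
	 * acknowledges the switch to HLT (roughly 1000 x 5us = 5ms).
	 */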
	for_each_cpu_andnot(cpu, cpu_present_mask, cpu_online_mask) {
		md = per_cpu_ptr(&mwait_cpu_dead, cpu);

		/* Does it sit in mwait_play_dead() ? */
		if (READ_ONCE(md->status) != CPUDEAD_MWAIT_WAIT)
			continue;

		/* Wait up to 5ms */
		for (i = 0; READ_ONCE(md->status) != newstate && i < 1000; i++) {
			/* Bring it out of mwait */
			WRITE_ONCE(md->control, newstate);
			udelay(5);
		}

		if (READ_ONCE(md->status) != newstate)
			pr_err_once("CPU%u is stuck in mwait_play_dead()\n", cpu);
	}
}

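/*
 * Simple HLT based dead loop: flush caches first (WBINVD is only available
 * on 486 and later, hence the family check) and then halt forever.
 */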
void __noreturn hlt_play_dead(void)
{
	if (__this_cpu_read(cpu_info.x86) >= 4)
		wbinvd();

	while (1)
		native_halt();
}

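/*
 * Park a hot-removed CPU: after the common prologue, try the MWAIT based
 * dead loop first (it returns early when MWAIT cannot be used), then let
 * cpuidle put the CPU to sleep, and fall back to plain HLT otherwise.
 */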
void native_play_dead(void)
{
	play_dead_common();
	tboot_shutdown(TB_SHUTDOWN_WFS);

	mwait_play_dead();
	if (cpuidle_play_dead())
		hlt_play_dead();
}

#else /* ... !CONFIG_HOTPLUG_CPU */
int native_cpu_disable(void)
{
	return -ENOSYS;
}

void native_play_dead(void)
{
	BUG();
}

#endif