From 03aa047ef2db4985e444af6ee1c1dd084ad9fb4c Mon Sep 17 00:00:00 2001 From: Christian Borntraeger Date: Fri, 9 Nov 2018 09:21:47 +0100 Subject: [PATCH 1/5] s390/early: improve machine detection Right now the early machine detection code check stsi 3.2.2 for "KVM" and set MACHINE_IS_VM if this is different. As the console detection uses diagnose 8 if MACHINE_IS_VM returns true this will crash Linux early for any non z/VM system that sets a different value than KVM. So instead of assuming z/VM, do not set any of MACHINE_IS_LPAR, MACHINE_IS_VM, or MACHINE_IS_KVM. CC: stable@vger.kernel.org Reviewed-by: Heiko Carstens Signed-off-by: Christian Borntraeger Signed-off-by: Martin Schwidefsky --- arch/s390/kernel/early.c | 4 ++-- arch/s390/kernel/setup.c | 2 ++ 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/arch/s390/kernel/early.c b/arch/s390/kernel/early.c index af5c2b3f7065..a8c7789b246b 100644 --- a/arch/s390/kernel/early.c +++ b/arch/s390/kernel/early.c @@ -63,10 +63,10 @@ static noinline __init void detect_machine_type(void) if (stsi(vmms, 3, 2, 2) || !vmms->count) return; - /* Running under KVM? If not we assume z/VM */ + /* Detect known hypervisors */ if (!memcmp(vmms->vm[0].cpi, "\xd2\xe5\xd4", 3)) S390_lowcore.machine_flags |= MACHINE_FLAG_KVM; - else + else if (!memcmp(vmms->vm[0].cpi, "\xa9\x61\xe5\xd4", 4)) S390_lowcore.machine_flags |= MACHINE_FLAG_VM; } diff --git a/arch/s390/kernel/setup.c b/arch/s390/kernel/setup.c index 72dd23ef771b..7ed90a759135 100644 --- a/arch/s390/kernel/setup.c +++ b/arch/s390/kernel/setup.c @@ -1006,6 +1006,8 @@ void __init setup_arch(char **cmdline_p) pr_info("Linux is running under KVM in 64-bit mode\n"); else if (MACHINE_IS_LPAR) pr_info("Linux is running natively in 64-bit mode\n"); + else + pr_info("Linux is running as a guest in 64-bit mode\n"); /* Have one command line that is parsed and saved in /proc/cmdline */ /* boot_command_line has been already set up in early.c */ From a38662084c8bdb829ff486468c7ea801c13fcc34 Mon Sep 17 00:00:00 2001 From: Martin Schwidefsky Date: Tue, 8 Jan 2019 12:44:57 +0100 Subject: [PATCH 2/5] s390/mm: always force a load of the primary ASCE on context switch The ASCE of an mm_struct can be modified after a task has been created, e.g. via crst_table_downgrade for a compat process. The active_mm logic to avoid the switch_mm call if the next task is a kernel thread can lead to a situation where switch_mm is called where 'prev == next' is true but 'prev->context.asce == next->context.asce' is not. This can lead to a situation where a CPU uses the outdated ASCE to run a task. The result can be a crash, endless loops and really subtle problem due to TLBs being created with an invalid ASCE. Cc: stable@kernel.org # v3.15+ Fixes: 53e857f30867 ("s390/mm,tlb: race of lazy TLB flush vs. recreation") Reported-by: Heiko Carstens Reviewed-by: Heiko Carstens Signed-off-by: Martin Schwidefsky --- arch/s390/include/asm/mmu_context.h | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/arch/s390/include/asm/mmu_context.h b/arch/s390/include/asm/mmu_context.h index ccbb53e22024..e4462202200d 100644 --- a/arch/s390/include/asm/mmu_context.h +++ b/arch/s390/include/asm/mmu_context.h @@ -90,8 +90,6 @@ static inline void switch_mm(struct mm_struct *prev, struct mm_struct *next, { int cpu = smp_processor_id(); - if (prev == next) - return; S390_lowcore.user_asce = next->context.asce; cpumask_set_cpu(cpu, &next->context.cpu_attach_mask); /* Clear previous user-ASCE from CR1 and CR7 */ @@ -103,7 +101,8 @@ static inline void switch_mm(struct mm_struct *prev, struct mm_struct *next, __ctl_load(S390_lowcore.vdso_asce, 7, 7); clear_cpu_flag(CIF_ASCE_SECONDARY); } - cpumask_clear_cpu(cpu, &prev->context.cpu_attach_mask); + if (prev != next) + cpumask_clear_cpu(cpu, &prev->context.cpu_attach_mask); } #define finish_arch_post_lock_switch finish_arch_post_lock_switch From b7cb707c373094ce4008d4a6ac9b6b366ec52da5 Mon Sep 17 00:00:00 2001 From: Gerald Schaefer Date: Wed, 9 Jan 2019 13:00:03 +0100 Subject: [PATCH 3/5] s390/smp: fix CPU hotplug deadlock with CPU rescan smp_rescan_cpus() is called without the device_hotplug_lock, which can lead to a dedlock when a new CPU is found and immediately set online by a udev rule. This was observed on an older kernel version, where the cpu_hotplug_begin() loop was still present, and it resulted in hanging chcpu and systemd-udev processes. This specific deadlock will not show on current kernels. However, there may be other possible deadlocks, and since smp_rescan_cpus() can still trigger a CPU hotplug operation, the device_hotplug_lock should be held. For reference, this was the deadlock with the old cpu_hotplug_begin() loop: chcpu (rescan) systemd-udevd echo 1 > /sys/../rescan -> smp_rescan_cpus() -> (*) get_online_cpus() (increases refcount) -> smp_add_present_cpu() (new CPU found) -> register_cpu() -> device_add() -> udev "add" event triggered -----------> udev rule sets CPU online -> echo 1 > /sys/.../online -> lock_device_hotplug_sysfs() (this is missing in rescan path) -> device_online() -> (**) device_lock(new CPU dev) -> cpu_up() -> cpu_hotplug_begin() (loops until refcount == 0) -> deadlock with (*) -> bus_probe_device() -> device_attach() -> device_lock(new CPU dev) -> deadlock with (**) Fix this by taking the device_hotplug_lock in the CPU rescan path. Cc: Signed-off-by: Gerald Schaefer Signed-off-by: Martin Schwidefsky --- arch/s390/kernel/smp.c | 4 ++++ drivers/s390/char/sclp_config.c | 2 ++ 2 files changed, 6 insertions(+) diff --git a/arch/s390/kernel/smp.c b/arch/s390/kernel/smp.c index f82b3d3c36e2..307a1c86ea21 100644 --- a/arch/s390/kernel/smp.c +++ b/arch/s390/kernel/smp.c @@ -1166,7 +1166,11 @@ static ssize_t __ref rescan_store(struct device *dev, { int rc; + rc = lock_device_hotplug_sysfs(); + if (rc) + return rc; rc = smp_rescan_cpus(); + unlock_device_hotplug(); return rc ? rc : count; } static DEVICE_ATTR_WO(rescan); diff --git a/drivers/s390/char/sclp_config.c b/drivers/s390/char/sclp_config.c index 194ffd5c8580..039b2074db7e 100644 --- a/drivers/s390/char/sclp_config.c +++ b/drivers/s390/char/sclp_config.c @@ -60,7 +60,9 @@ static void sclp_cpu_capability_notify(struct work_struct *work) static void __ref sclp_cpu_change_notify(struct work_struct *work) { + lock_device_hotplug(); smp_rescan_cpus(); + unlock_device_hotplug(); } static void sclp_conf_receiver_fn(struct evbuf_header *evbuf) From 190f056fba230abee80712eb810939ef9a8c462f Mon Sep 17 00:00:00 2001 From: Vasily Gorbik Date: Wed, 2 Jan 2019 13:43:22 +0100 Subject: [PATCH 4/5] s390/vdso: correct vdso mapping for compat tasks While "s390/vdso: avoid 64-bit vdso mapping for compat tasks" fixed 64-bit vdso mapping for compat tasks under gdb it introduced another problem. "compat_mm" flag is not inherited during fork and when 31-bit process forks a child (but does not perform exec) it ends up with 64-bit vdso. To address that, init_new_context (which is called during fork and exec) now initialize compat_mm based on thread TIF_31BIT flag. Later compat_mm is adjusted in arch_setup_additional_pages, which is called during exec. Fixes: d1befa65823e ("s390/vdso: avoid 64-bit vdso mapping for compat tasks") Reported-by: Stefan Liebler Reviewed-by: Heiko Carstens Cc: # v4.20+ Signed-off-by: Vasily Gorbik Signed-off-by: Martin Schwidefsky --- arch/s390/include/asm/mmu_context.h | 2 +- arch/s390/kernel/vdso.c | 5 ++--- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/arch/s390/include/asm/mmu_context.h b/arch/s390/include/asm/mmu_context.h index e4462202200d..8d04e6f3f796 100644 --- a/arch/s390/include/asm/mmu_context.h +++ b/arch/s390/include/asm/mmu_context.h @@ -25,7 +25,7 @@ static inline int init_new_context(struct task_struct *tsk, atomic_set(&mm->context.flush_count, 0); mm->context.gmap_asce = 0; mm->context.flush_mm = 0; - mm->context.compat_mm = 0; + mm->context.compat_mm = test_thread_flag(TIF_31BIT); #ifdef CONFIG_PGSTE mm->context.alloc_pgste = page_table_allocate_pgste || test_thread_flag(TIF_PGSTE) || diff --git a/arch/s390/kernel/vdso.c b/arch/s390/kernel/vdso.c index ebe748a9f472..4ff354887db4 100644 --- a/arch/s390/kernel/vdso.c +++ b/arch/s390/kernel/vdso.c @@ -224,10 +224,9 @@ int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp) vdso_pages = vdso64_pages; #ifdef CONFIG_COMPAT - if (is_compat_task()) { + mm->context.compat_mm = is_compat_task(); + if (mm->context.compat_mm) vdso_pages = vdso32_pages; - mm->context.compat_mm = 1; - } #endif /* * vDSO has a problem and was disabled, just don't "enable" it for From 60f1bf29c0b2519989927cae640cd1f50f59dc7f Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Fri, 11 Jan 2019 15:18:22 +0100 Subject: [PATCH 5/5] s390/smp: Fix calling smp_call_ipl_cpu() from ipl CPU When calling smp_call_ipl_cpu() from the IPL CPU, we will try to read from pcpu_devices->lowcore. However, due to prefixing, that will result in reading from absolute address 0 on that CPU. We have to go via the actual lowcore instead. This means that right now, we will read lc->nodat_stack == 0 and therfore work on a very wrong stack. This BUG essentially broke rebooting under QEMU TCG (which will report a low address protection exception). And checking under KVM, it is also broken under KVM. With 1 VCPU it can be easily triggered. :/# echo 1 > /proc/sys/kernel/sysrq :/# echo b > /proc/sysrq-trigger [ 28.476745] sysrq: SysRq : Resetting [ 28.476793] Kernel stack overflow. [ 28.476817] CPU: 0 PID: 424 Comm: sh Not tainted 5.0.0-rc1+ #13 [ 28.476820] Hardware name: IBM 2964 NE1 716 (KVM/Linux) [ 28.476826] Krnl PSW : 0400c00180000000 0000000000115c0c (pcpu_delegate+0x12c/0x140) [ 28.476861] R:0 T:1 IO:0 EX:0 Key:0 M:0 W:0 P:0 AS:3 CC:0 PM:0 RI:0 EA:3 [ 28.476863] Krnl GPRS: ffffffffffffffff 0000000000000000 000000000010dff8 0000000000000000 [ 28.476864] 0000000000000000 0000000000000000 0000000000ab7090 000003e0006efbf0 [ 28.476864] 000000000010dff8 0000000000000000 0000000000000000 0000000000000000 [ 28.476865] 000000007fffc000 0000000000730408 000003e0006efc58 0000000000000000 [ 28.476887] Krnl Code: 0000000000115bfe: 4170f000 la %r7,0(%r15) [ 28.476887] 0000000000115c02: 41f0a000 la %r15,0(%r10) [ 28.476887] #0000000000115c06: e370f0980024 stg %r7,152(%r15) [ 28.476887] >0000000000115c0c: c0e5fffff86e brasl %r14,114ce8 [ 28.476887] 0000000000115c12: 41f07000 la %r15,0(%r7) [ 28.476887] 0000000000115c16: a7f4ffa8 brc 15,115b66 [ 28.476887] 0000000000115c1a: 0707 bcr 0,%r7 [ 28.476887] 0000000000115c1c: 0707 bcr 0,%r7 [ 28.476901] Call Trace: [ 28.476902] Last Breaking-Event-Address: [ 28.476920] [<0000000000a01c4a>] arch_call_rest_init+0x22/0x80 [ 28.476927] Kernel panic - not syncing: Corrupt kernel stack, can't continue. [ 28.476930] CPU: 0 PID: 424 Comm: sh Not tainted 5.0.0-rc1+ #13 [ 28.476932] Hardware name: IBM 2964 NE1 716 (KVM/Linux) [ 28.476932] Call Trace: Fixes: 2f859d0dad81 ("s390/smp: reduce size of struct pcpu") Cc: stable@vger.kernel.org # 4.0+ Reported-by: Cornelia Huck Signed-off-by: David Hildenbrand Signed-off-by: Martin Schwidefsky --- arch/s390/kernel/smp.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/arch/s390/kernel/smp.c b/arch/s390/kernel/smp.c index 307a1c86ea21..b198ece2aad6 100644 --- a/arch/s390/kernel/smp.c +++ b/arch/s390/kernel/smp.c @@ -381,8 +381,13 @@ void smp_call_online_cpu(void (*func)(void *), void *data) */ void smp_call_ipl_cpu(void (*func)(void *), void *data) { + struct lowcore *lc = pcpu_devices->lowcore; + + if (pcpu_devices[0].address == stap()) + lc = &S390_lowcore; + pcpu_delegate(&pcpu_devices[0], func, data, - pcpu_devices->lowcore->nodat_stack); + lc->nodat_stack); } int smp_find_processor_id(u16 address)