From 820109fb11f24baf16ed86e232939877df3e9f0c Mon Sep 17 00:00:00 2001 From: Wolfram Sang Date: Thu, 18 Aug 2022 22:59:47 +0200 Subject: [PATCH 01/17] s390: move from strlcpy with unused retval to strscpy Follow the advice of the below link and prefer 'strscpy' in this subsystem. Conversion is 1:1 because the return value is not used. Generated by a coccinelle script. Link: https://lore.kernel.org/r/CAHk-=wgfRnXz0W3D37d01q3JFkr_i_uTL=V6A6G1oUZcprmknw@mail.gmail.com/ Signed-off-by: Wolfram Sang Acked-by: Jakub Kicinski Acked-by: Benjamin Block Acked-by: Alexandra Winter Link: https://lore.kernel.org/r/20220818205948.6360-1-wsa+renesas@sang-engineering.com Link: https://lore.kernel.org/r/20220818210102.7301-1-wsa+renesas@sang-engineering.com [gor@linux.ibm.com: squashed two changes linked above together] Signed-off-by: Vasily Gorbik --- arch/s390/kernel/debug.c | 2 +- arch/s390/kernel/early.c | 2 +- drivers/s390/block/dasd_devmap.c | 2 +- drivers/s390/block/dasd_eer.c | 4 ++-- drivers/s390/block/dcssblk.c | 2 +- drivers/s390/char/hmcdrv_cache.c | 2 +- drivers/s390/char/tape_class.c | 4 ++-- drivers/s390/cio/qdio_debug.c | 2 +- drivers/s390/net/ctcm_main.c | 2 +- drivers/s390/net/fsm.c | 2 +- drivers/s390/net/qeth_ethtool.c | 4 ++-- drivers/s390/scsi/zfcp_aux.c | 2 +- drivers/s390/scsi/zfcp_fc.c | 2 +- 13 files changed, 16 insertions(+), 16 deletions(-) diff --git a/arch/s390/kernel/debug.c b/arch/s390/kernel/debug.c index 4331c7e6e1c0..d7a82066a638 100644 --- a/arch/s390/kernel/debug.c +++ b/arch/s390/kernel/debug.c @@ -250,7 +250,7 @@ static debug_info_t *debug_info_alloc(const char *name, int pages_per_area, rc->level = level; rc->buf_size = buf_size; rc->entry_size = sizeof(debug_entry_t) + buf_size; - strlcpy(rc->name, name, sizeof(rc->name)); + strscpy(rc->name, name, sizeof(rc->name)); memset(rc->views, 0, DEBUG_MAX_VIEWS * sizeof(struct debug_view *)); memset(rc->debugfs_entries, 0, DEBUG_MAX_VIEWS * sizeof(struct dentry *)); refcount_set(&(rc->ref_count), 0); diff --git a/arch/s390/kernel/early.c b/arch/s390/kernel/early.c index 432c8c987256..6030fdd6997b 100644 --- a/arch/s390/kernel/early.c +++ b/arch/s390/kernel/early.c @@ -267,7 +267,7 @@ char __bootdata(early_command_line)[COMMAND_LINE_SIZE]; static void __init setup_boot_command_line(void) { /* copy arch command line */ - strlcpy(boot_command_line, early_command_line, COMMAND_LINE_SIZE); + strscpy(boot_command_line, early_command_line, COMMAND_LINE_SIZE); } static void __init check_image_bootable(void) diff --git a/drivers/s390/block/dasd_devmap.c b/drivers/s390/block/dasd_devmap.c index 811e79c9f59c..0062a70f0128 100644 --- a/drivers/s390/block/dasd_devmap.c +++ b/drivers/s390/block/dasd_devmap.c @@ -426,7 +426,7 @@ dasd_add_busid(const char *bus_id, int features) if (!devmap) { /* This bus_id is new. 
*/ new->devindex = dasd_max_devindex++; - strlcpy(new->bus_id, bus_id, DASD_BUS_ID_SIZE); + strscpy(new->bus_id, bus_id, DASD_BUS_ID_SIZE); new->features = features; new->device = NULL; list_add(&new->list, &dasd_hashlists[hash]); diff --git a/drivers/s390/block/dasd_eer.c b/drivers/s390/block/dasd_eer.c index 5ae64af9ccea..d4d31cd11d26 100644 --- a/drivers/s390/block/dasd_eer.c +++ b/drivers/s390/block/dasd_eer.c @@ -313,7 +313,7 @@ static void dasd_eer_write_standard_trigger(struct dasd_device *device, ktime_get_real_ts64(&ts); header.tv_sec = ts.tv_sec; header.tv_usec = ts.tv_nsec / NSEC_PER_USEC; - strlcpy(header.busid, dev_name(&device->cdev->dev), + strscpy(header.busid, dev_name(&device->cdev->dev), DASD_EER_BUSID_SIZE); spin_lock_irqsave(&bufferlock, flags); @@ -356,7 +356,7 @@ static void dasd_eer_write_snss_trigger(struct dasd_device *device, ktime_get_real_ts64(&ts); header.tv_sec = ts.tv_sec; header.tv_usec = ts.tv_nsec / NSEC_PER_USEC; - strlcpy(header.busid, dev_name(&device->cdev->dev), + strscpy(header.busid, dev_name(&device->cdev->dev), DASD_EER_BUSID_SIZE); spin_lock_irqsave(&bufferlock, flags); diff --git a/drivers/s390/block/dcssblk.c b/drivers/s390/block/dcssblk.c index 5187705bd0f3..93b80da60277 100644 --- a/drivers/s390/block/dcssblk.c +++ b/drivers/s390/block/dcssblk.c @@ -614,7 +614,7 @@ dcssblk_add_store(struct device *dev, struct device_attribute *attr, const char rc = -ENAMETOOLONG; goto seg_list_del; } - strlcpy(local_buf, buf, i + 1); + strscpy(local_buf, buf, i + 1); dev_info->num_of_segments = num_of_segments; rc = dcssblk_is_continuous(dev_info); if (rc < 0) diff --git a/drivers/s390/char/hmcdrv_cache.c b/drivers/s390/char/hmcdrv_cache.c index 1f5bdb237862..43df27ceec11 100644 --- a/drivers/s390/char/hmcdrv_cache.c +++ b/drivers/s390/char/hmcdrv_cache.c @@ -154,7 +154,7 @@ static ssize_t hmcdrv_cache_do(const struct hmcdrv_ftp_cmdspec *ftp, /* cache some file info (FTP command, file name and file * size) unconditionally */ - strlcpy(hmcdrv_cache_file.fname, ftp->fname, + strscpy(hmcdrv_cache_file.fname, ftp->fname, HMCDRV_FTP_FIDENT_MAX); hmcdrv_cache_file.id = ftp->id; pr_debug("caching cmd %d, file size %zu for '%s'\n", diff --git a/drivers/s390/char/tape_class.c b/drivers/s390/char/tape_class.c index b58df0dd0039..c21dc68e05a0 100644 --- a/drivers/s390/char/tape_class.c +++ b/drivers/s390/char/tape_class.c @@ -54,10 +54,10 @@ struct tape_class_device *register_tape_dev( if (!tcd) return ERR_PTR(-ENOMEM); - strlcpy(tcd->device_name, device_name, TAPECLASS_NAME_LEN); + strscpy(tcd->device_name, device_name, TAPECLASS_NAME_LEN); for (s = strchr(tcd->device_name, '/'); s; s = strchr(s, '/')) *s = '!'; - strlcpy(tcd->mode_name, mode_name, TAPECLASS_NAME_LEN); + strscpy(tcd->mode_name, mode_name, TAPECLASS_NAME_LEN); for (s = strchr(tcd->mode_name, '/'); s; s = strchr(s, '/')) *s = '!'; diff --git a/drivers/s390/cio/qdio_debug.c b/drivers/s390/cio/qdio_debug.c index 4bb7965daa0f..1a9714af51e4 100644 --- a/drivers/s390/cio/qdio_debug.c +++ b/drivers/s390/cio/qdio_debug.c @@ -87,7 +87,7 @@ int qdio_allocate_dbf(struct qdio_irq *irq_ptr) debug_unregister(irq_ptr->debug_area); return -ENOMEM; } - strlcpy(new_entry->dbf_name, text, QDIO_DBF_NAME_LEN); + strscpy(new_entry->dbf_name, text, QDIO_DBF_NAME_LEN); new_entry->dbf_info = irq_ptr->debug_area; mutex_lock(&qdio_dbf_list_mutex); list_add(&new_entry->dbf_list, &qdio_dbf_list); diff --git a/drivers/s390/net/ctcm_main.c b/drivers/s390/net/ctcm_main.c index e0fdd54bfeb7..37b551bd43bf 100644 --- 
a/drivers/s390/net/ctcm_main.c +++ b/drivers/s390/net/ctcm_main.c @@ -1566,7 +1566,7 @@ static int ctcm_new_device(struct ccwgroup_device *cgdev) goto out_dev; } - strlcpy(priv->fsm->name, dev->name, sizeof(priv->fsm->name)); + strscpy(priv->fsm->name, dev->name, sizeof(priv->fsm->name)); dev_info(&dev->dev, "setup OK : r/w = %s/%s, protocol : %d\n", diff --git a/drivers/s390/net/fsm.c b/drivers/s390/net/fsm.c index 98c4864932d2..0ff61d00feb1 100644 --- a/drivers/s390/net/fsm.c +++ b/drivers/s390/net/fsm.c @@ -28,7 +28,7 @@ init_fsm(char *name, const char **state_names, const char **event_names, int nr_ "fsm(%s): init_fsm: Couldn't alloc instance\n", name); return NULL; } - strlcpy(this->name, name, sizeof(this->name)); + strscpy(this->name, name, sizeof(this->name)); init_waitqueue_head(&this->wait_q); f = kzalloc(sizeof(fsm), order); diff --git a/drivers/s390/net/qeth_ethtool.c b/drivers/s390/net/qeth_ethtool.c index 9eba0a32e9f9..e250f49535fa 100644 --- a/drivers/s390/net/qeth_ethtool.c +++ b/drivers/s390/net/qeth_ethtool.c @@ -188,9 +188,9 @@ static void qeth_get_drvinfo(struct net_device *dev, { struct qeth_card *card = dev->ml_priv; - strlcpy(info->driver, IS_LAYER2(card) ? "qeth_l2" : "qeth_l3", + strscpy(info->driver, IS_LAYER2(card) ? "qeth_l2" : "qeth_l3", sizeof(info->driver)); - strlcpy(info->fw_version, card->info.mcl_level, + strscpy(info->fw_version, card->info.mcl_level, sizeof(info->fw_version)); snprintf(info->bus_info, sizeof(info->bus_info), "%s/%s/%s", CARD_RDEV_ID(card), CARD_WDEV_ID(card), CARD_DDEV_ID(card)); diff --git a/drivers/s390/scsi/zfcp_aux.c b/drivers/s390/scsi/zfcp_aux.c index fd2f1c31bd21..df782646e856 100644 --- a/drivers/s390/scsi/zfcp_aux.c +++ b/drivers/s390/scsi/zfcp_aux.c @@ -103,7 +103,7 @@ static void __init zfcp_init_device_setup(char *devstr) token = strsep(&str, ","); if (!token || strlen(token) >= ZFCP_BUS_ID_SIZE) goto err_out; - strlcpy(busid, token, ZFCP_BUS_ID_SIZE); + strscpy(busid, token, ZFCP_BUS_ID_SIZE); token = strsep(&str, ","); if (!token || kstrtoull(token, 0, (unsigned long long *) &wwpn)) diff --git a/drivers/s390/scsi/zfcp_fc.c b/drivers/s390/scsi/zfcp_fc.c index b61acbb09be3..77917b339870 100644 --- a/drivers/s390/scsi/zfcp_fc.c +++ b/drivers/s390/scsi/zfcp_fc.c @@ -885,7 +885,7 @@ static int zfcp_fc_gspn(struct zfcp_adapter *adapter, dev_name(&adapter->ccw_device->dev), init_utsname()->nodename); else - strlcpy(fc_host_symbolic_name(adapter->scsi_host), + strscpy(fc_host_symbolic_name(adapter->scsi_host), gspn_rsp->gspn.fp_name, FC_SYMBOLIC_NAME_SIZE); return 0; From 9aa10e791c2b722cf166f46424a6a8364164fe12 Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Mon, 29 Aug 2022 14:17:28 +0200 Subject: [PATCH 02/17] s390/delay: sync comment within __delay() with reality The comment within __delay() is outdated and does not reflect anymore what the function is doing. Therefore replace the comment. Signed-off-by: Heiko Carstens Signed-off-by: Vasily Gorbik --- arch/s390/lib/delay.c | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/arch/s390/lib/delay.c b/arch/s390/lib/delay.c index f7f5adea8940..be14c58cb989 100644 --- a/arch/s390/lib/delay.c +++ b/arch/s390/lib/delay.c @@ -13,13 +13,10 @@ void __delay(unsigned long loops) { - /* - * To end the bloody studid and useless discussion about the - * BogoMips number I took the liberty to define the __delay - * function in a way that that resulting BogoMips number will - * yield the megahertz number of the cpu. 
The important function - is udelay and that is done using the tod clock. -- martin. - */ + /* + * Loop 'loops' times. Callers must not assume a specific + * amount of time passes before this function returns. + */ asm volatile("0: brct %0,0b" : : "d" ((loops/2) + 1)); } EXPORT_SYMBOL(__delay); From bf2ce3855c7d446669f89999d523d91048b0daf9 Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Mon, 29 Aug 2022 14:26:56 +0200 Subject: [PATCH 03/17] s390/mm: remove unused access parameter from do_fault_error() Remove the unused access parameter from do_fault_error(), which also makes the code a bit more readable since several callers can be simplified. Signed-off-by: Heiko Carstens Signed-off-by: Vasily Gorbik --- arch/s390/mm/fault.c | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/arch/s390/mm/fault.c b/arch/s390/mm/fault.c index 09b6e756d521..5a351042b1c6 100644 --- a/arch/s390/mm/fault.c +++ b/arch/s390/mm/fault.c @@ -268,8 +268,7 @@ static noinline void do_sigbus(struct pt_regs *regs) (void __user *)(regs->int_parm_long & __FAIL_ADDR_MASK)); } -static noinline void do_fault_error(struct pt_regs *regs, int access, - vm_fault_t fault) +static noinline void do_fault_error(struct pt_regs *regs, vm_fault_t fault) { int si_code; @@ -518,7 +517,7 @@ void do_protection_exception(struct pt_regs *regs) fault = do_exception(regs, access); } if (unlikely(fault)) - do_fault_error(regs, access, fault); + do_fault_error(regs, fault); } NOKPROBE_SYMBOL(do_protection_exception); @@ -530,7 +529,7 @@ void do_dat_exception(struct pt_regs *regs) access = VM_ACCESS_FLAGS; fault = do_exception(regs, access); if (unlikely(fault)) - do_fault_error(regs, access, fault); + do_fault_error(regs, fault); } NOKPROBE_SYMBOL(do_dat_exception); @@ -805,7 +804,7 @@ void do_secure_storage_access(struct pt_regs *regs) addr = __gmap_translate(gmap, addr); mmap_read_unlock(mm); if (IS_ERR_VALUE(addr)) { - do_fault_error(regs, VM_ACCESS_FLAGS, VM_FAULT_BADMAP); + do_fault_error(regs, VM_FAULT_BADMAP); break; } fallthrough; @@ -815,7 +814,7 @@ void do_secure_storage_access(struct pt_regs *regs) vma = find_vma(mm, addr); if (!vma) { mmap_read_unlock(mm); - do_fault_error(regs, VM_READ | VM_WRITE, VM_FAULT_BADMAP); + do_fault_error(regs, VM_FAULT_BADMAP); break; } page = follow_page(vma, addr, FOLL_WRITE | FOLL_GET); @@ -838,7 +837,7 @@ void do_secure_storage_access(struct pt_regs *regs) BUG(); break; default: - do_fault_error(regs, VM_READ | VM_WRITE, VM_FAULT_BADMAP); + do_fault_error(regs, VM_FAULT_BADMAP); WARN_ON_ONCE(1); } } @@ -850,7 +849,7 @@ void do_non_secure_storage_access(struct pt_regs *regs) struct gmap *gmap = (struct gmap *)S390_lowcore.gmap; if (get_fault_type(regs) != GMAP_FAULT) { - do_fault_error(regs, VM_READ | VM_WRITE, VM_FAULT_BADMAP); + do_fault_error(regs, VM_FAULT_BADMAP); WARN_ON_ONCE(1); return; } From b193d2d4d01ecf211c935bcb7409a69768851c6e Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Mon, 5 Sep 2022 14:18:27 +0200 Subject: [PATCH 04/17] s390/mm: split lowcore pages with set_memory_4k() Use set_memory_4k() to split lowcore pages within the kernel mapping instead of using the quite subtle !addr check within modify_pmd_table() and modify_pud_table() to prevent large pages for address zero. With this change the lowcore might initially be mapped with 1MB / 2GB frames and only be split later. This way the mapping is handled like every other mapping.
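For illustration only, a sketch that is not part of the patch itself: once the large mapping has been created, the split boils down to a single call against the start of the kernel mapping, where LC_PAGES is assumed here to cover the 8KB lowcore:

	/* split any 1MB / 2GB frame covering the lowcore into 4KB pages */
	set_memory_4k(0, LC_PAGES);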
Reviewed-by: Alexander Gordeev Signed-off-by: Heiko Carstens Signed-off-by: Vasily Gorbik --- arch/s390/mm/vmem.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/arch/s390/mm/vmem.c b/arch/s390/mm/vmem.c index c2583f921ca8..1c86de9dd87f 100644 --- a/arch/s390/mm/vmem.c +++ b/arch/s390/mm/vmem.c @@ -240,7 +240,7 @@ static int __ref modify_pmd_table(pud_t *pud, unsigned long addr, } else if (pmd_none(*pmd)) { if (IS_ALIGNED(addr, PMD_SIZE) && IS_ALIGNED(next, PMD_SIZE) && - MACHINE_HAS_EDAT1 && addr && direct && + MACHINE_HAS_EDAT1 && direct && !debug_pagealloc_enabled()) { set_pmd(pmd, __pmd(__pa(addr) | prot)); pages++; @@ -336,7 +336,7 @@ static int modify_pud_table(p4d_t *p4d, unsigned long addr, unsigned long end, } else if (pud_none(*pud)) { if (IS_ALIGNED(addr, PUD_SIZE) && IS_ALIGNED(next, PUD_SIZE) && - MACHINE_HAS_EDAT2 && addr && direct && + MACHINE_HAS_EDAT2 && direct && !debug_pagealloc_enabled()) { set_pud(pud, __pud(__pa(addr) | prot)); pages++; @@ -584,6 +584,9 @@ void __init vmem_map_init(void) __set_memory(__stext_amode31, (__etext_amode31 - __stext_amode31) >> PAGE_SHIFT, SET_MEMORY_RO | SET_MEMORY_X); + /* lowcore requires 4k mapping for real addresses / prefixing */ + set_memory_4k(0, LC_PAGES); + /* lowcore must be executable for LPSWE */ if (!static_key_enabled(&cpu_has_bear)) set_memory_x(0, 1); From edcfc9c71bfd491e36ae93a3ea633fb55678f51f Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Mon, 5 Sep 2022 14:36:19 +0200 Subject: [PATCH 05/17] s390/ptdump: add missing amode31 markers Add amode31 markers, which make a read-only mapping in the middle of nowhere in the kernel_page_tables output less magic. Reviewed-by: Alexander Gordeev Signed-off-by: Heiko Carstens Signed-off-by: Vasily Gorbik --- arch/s390/mm/dump_pagetables.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/arch/s390/mm/dump_pagetables.c b/arch/s390/mm/dump_pagetables.c index 9f9af5298dd6..4250861f90f5 100644 --- a/arch/s390/mm/dump_pagetables.c +++ b/arch/s390/mm/dump_pagetables.c @@ -21,6 +21,8 @@ struct addr_marker { enum address_markers_idx { IDENTITY_BEFORE_NR = 0, IDENTITY_BEFORE_END_NR, + AMODE31_START_NR, + AMODE31_END_NR, KERNEL_START_NR, KERNEL_END_NR, #ifdef CONFIG_KFENCE @@ -44,6 +46,8 @@ enum address_markers_idx { static struct addr_marker address_markers[] = { [IDENTITY_BEFORE_NR] = {0, "Identity Mapping Start"}, [IDENTITY_BEFORE_END_NR] = {(unsigned long)_stext, "Identity Mapping End"}, + [AMODE31_START_NR] = {0, "Amode31 Area Start"}, + [AMODE31_END_NR] = {0, "Amode31 Area End"}, [KERNEL_START_NR] = {(unsigned long)_stext, "Kernel Image Start"}, [KERNEL_END_NR] = {(unsigned long)_end, "Kernel Image End"}, #ifdef CONFIG_KFENCE @@ -276,6 +280,8 @@ static int pt_dump_init(void) max_addr = (S390_lowcore.kernel_asce & _REGION_ENTRY_TYPE_MASK) >> 2; max_addr = 1UL << (max_addr * 11 + 31); address_markers[IDENTITY_AFTER_END_NR].start_address = ident_map_size; + address_markers[AMODE31_START_NR].start_address = __samode31; + address_markers[AMODE31_END_NR].start_address = __eamode31; address_markers[MODULES_NR].start_address = MODULES_VADDR; address_markers[MODULES_END_NR].start_address = MODULES_END; address_markers[VMEMMAP_NR].start_address = (unsigned long) vmemmap; From 6cbd7cc2ebbe074522246f50628cbae34915bb95 Mon Sep 17 00:00:00 2001 From: Alexander Gordeev Date: Thu, 1 Sep 2022 10:33:51 +0200 Subject: [PATCH 06/17] s390/smp: call smp_reinit_ipl_cpu() before scheduler is available Currently smp_reinit_ipl_cpu() is a pre-SMP early initcall.
That ensures no other CPU is running in parallel, but it is still not enough to assume the code runs exclusively, since scheduling is already available. Move the function call to the arch_call_rest_init() callback to ensure no thread can be preempted and to allow lockless allocation of the kernel page tables. That is needed to allow a follow-up rework of the absolute lowcore access mechanism. Suggested-by: Heiko Carstens Signed-off-by: Alexander Gordeev Signed-off-by: Vasily Gorbik --- arch/s390/include/asm/smp.h | 1 + arch/s390/kernel/setup.c | 1 + arch/s390/kernel/smp.c | 3 +-- 3 files changed, 3 insertions(+), 2 deletions(-) diff --git a/arch/s390/include/asm/smp.h b/arch/s390/include/asm/smp.h index 7f5d4763357b..59cd27255f38 100644 --- a/arch/s390/include/asm/smp.h +++ b/arch/s390/include/asm/smp.h @@ -58,6 +58,7 @@ static inline void smp_cpus_done(unsigned int max_cpus) { } +extern int smp_reinit_ipl_cpu(void); extern int smp_rescan_cpus(void); extern void __noreturn cpu_die(void); extern void __cpu_die(unsigned int cpu); diff --git a/arch/s390/kernel/setup.c b/arch/s390/kernel/setup.c index bbd4bde4f65d..063f0512a64a 100644 --- a/arch/s390/kernel/setup.c +++ b/arch/s390/kernel/setup.c @@ -395,6 +395,7 @@ void __init arch_call_rest_init(void) { unsigned long stack; + smp_reinit_ipl_cpu(); stack = stack_alloc(); if (!stack) panic("Couldn't allocate kernel stack"); diff --git a/arch/s390/kernel/smp.c b/arch/s390/kernel/smp.c index 30c91d565933..0e8e5546933f 100644 --- a/arch/s390/kernel/smp.c +++ b/arch/s390/kernel/smp.c @@ -1256,7 +1256,7 @@ static __always_inline void set_new_lowcore(struct lowcore *lc) : "memory", "cc"); } -static int __init smp_reinit_ipl_cpu(void) +int __init smp_reinit_ipl_cpu(void) { unsigned long async_stack, nodat_stack, mcck_stack; struct lowcore *lc, *lc_ipl; @@ -1291,4 +1291,3 @@ static int __init smp_reinit_ipl_cpu(void) return 0; } -early_initcall(smp_reinit_ipl_cpu); From 4df29d2b9024d6ababc6342cf5f721cbaff517b5 Mon Sep 17 00:00:00 2001 From: Alexander Gordeev Date: Wed, 20 Jul 2022 08:22:01 +0200 Subject: [PATCH 07/17] s390/smp: rework absolute lowcore access Temporarily unsetting the prefix page in the memcpy_absolute() routine poses a risk of executing a code path with the prefix page unexpectedly disabled. This rework avoids the prefix page uninstalling and disabling of normal and machine check interrupts when accessing the absolute zero memory. Although the memcpy_absolute() routine can access the whole memory, it is only used to update the absolute zero lowcore. This rework therefore introduces a new mechanism for absolute zero lowcore access and scraps the memcpy_absolute() routine for good. Instead, an area is reserved in the virtual memory that is used for the absolute lowcore access only. That area holds an array of 8KB virtual mappings - one per CPU. Whenever a CPU is brought online, the corresponding item is mapped to the real address of the previously installed prefix page. The absolute zero lowcore access works like this: a CPU calls the new primitive get_abs_lowcore() to obtain its 8KB mapping as a pointer to the struct lowcore. Virtual address references to that pointer get translated to the real addresses of the prefix page, which in turn gets swapped with the absolute zero memory addresses due to prefixing. Once the pointer is no longer needed, it must be released with the put_abs_lowcore() primitive: struct lowcore *abs_lc; unsigned long flags; abs_lc = get_abs_lowcore(&flags); abs_lc->...
= ...; put_abs_lowcore(abs_lc, flags); To ensure the described mechanism works large segment- and region- table entries must be avoided for the 8KB mappings. Failure to do so results in usage of Region-Frame Absolute Address (RFAA) or Segment-Frame Absolute Address (SFAA) large page fields. In that case absolute addresses would be used to address the prefix page instead of the real ones and the prefixing would get bypassed. Reviewed-by: Heiko Carstens Signed-off-by: Alexander Gordeev Signed-off-by: Vasily Gorbik --- arch/s390/boot/startup.c | 5 +- arch/s390/include/asm/abs_lowcore.h | 17 +++++ arch/s390/include/asm/pgtable.h | 3 + arch/s390/include/asm/processor.h | 15 ----- arch/s390/kernel/Makefile | 2 +- arch/s390/kernel/abs_lowcore.c | 95 ++++++++++++++++++++++++++++ arch/s390/kernel/ipl.c | 9 ++- arch/s390/kernel/machine_kexec.c | 8 ++- arch/s390/kernel/os_info.c | 9 ++- arch/s390/kernel/setup.c | 36 ++++++----- arch/s390/kernel/smp.c | 34 +++++++--- arch/s390/mm/init.c | 2 +- arch/s390/mm/maccess.c | 67 +++++++++----------- arch/s390/mm/vmem.c | 97 +++++++++++++++++++++++++++++ 14 files changed, 315 insertions(+), 84 deletions(-) create mode 100644 arch/s390/include/asm/abs_lowcore.h create mode 100644 arch/s390/kernel/abs_lowcore.c diff --git a/arch/s390/boot/startup.c b/arch/s390/boot/startup.c index bc48fe82d949..41b7af7a9365 100644 --- a/arch/s390/boot/startup.c +++ b/arch/s390/boot/startup.c @@ -10,11 +10,13 @@ #include #include #include +#include #include "decompressor.h" #include "boot.h" #include "uv.h" unsigned long __bootdata_preserved(__kaslr_offset); +unsigned long __bootdata_preserved(__abs_lowcore); unsigned long __bootdata(__amode31_base); unsigned long __bootdata_preserved(VMALLOC_START); unsigned long __bootdata_preserved(VMALLOC_END); @@ -180,7 +182,8 @@ static void setup_kernel_memory_layout(void) /* force vmalloc and modules below kasan shadow */ vmax = min(vmax, KASAN_SHADOW_START); #endif - MODULES_END = vmax; + __abs_lowcore = round_down(vmax - ABS_LOWCORE_MAP_SIZE, sizeof(struct lowcore)); + MODULES_END = round_down(__abs_lowcore, _SEGMENT_SIZE); MODULES_VADDR = MODULES_END - MODULES_LEN; VMALLOC_END = MODULES_VADDR; diff --git a/arch/s390/include/asm/abs_lowcore.h b/arch/s390/include/asm/abs_lowcore.h new file mode 100644 index 000000000000..4c61b14ee928 --- /dev/null +++ b/arch/s390/include/asm/abs_lowcore.h @@ -0,0 +1,17 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _ASM_S390_ABS_LOWCORE_H +#define _ASM_S390_ABS_LOWCORE_H + +#include + +#define ABS_LOWCORE_MAP_SIZE (NR_CPUS * sizeof(struct lowcore)) + +extern unsigned long __abs_lowcore; +extern bool abs_lowcore_mapped; + +struct lowcore *get_abs_lowcore(unsigned long *flags); +void put_abs_lowcore(struct lowcore *lc, unsigned long flags); +int abs_lowcore_map(int cpu, struct lowcore *lc, bool alloc); +void abs_lowcore_unmap(int cpu); + +#endif /* _ASM_S390_ABS_LOWCORE_H */ diff --git a/arch/s390/include/asm/pgtable.h b/arch/s390/include/asm/pgtable.h index f019df19884d..45617c966202 100644 --- a/arch/s390/include/asm/pgtable.h +++ b/arch/s390/include/asm/pgtable.h @@ -1777,6 +1777,9 @@ static inline swp_entry_t __swp_entry(unsigned long type, unsigned long offset) extern int vmem_add_mapping(unsigned long start, unsigned long size); extern void vmem_remove_mapping(unsigned long start, unsigned long size); +extern int __vmem_map_4k_page(unsigned long addr, unsigned long phys, pgprot_t prot, bool alloc); +extern int vmem_map_4k_page(unsigned long addr, unsigned long phys, pgprot_t prot); +extern void 
vmem_unmap_4k_page(unsigned long addr); extern int s390_enable_sie(void); extern int s390_enable_skey(void); extern void s390_reset_cmma(struct mm_struct *mm); diff --git a/arch/s390/include/asm/processor.h b/arch/s390/include/asm/processor.h index bd66f8e34949..93677ae89e7e 100644 --- a/arch/s390/include/asm/processor.h +++ b/arch/s390/include/asm/processor.h @@ -307,21 +307,6 @@ static __always_inline void __noreturn disabled_wait(void) #define ARCH_LOW_ADDRESS_LIMIT 0x7fffffffUL extern int memcpy_real(void *, unsigned long, size_t); -extern void memcpy_absolute(void *, void *, size_t); - -#define put_abs_lowcore(member, x) do { \ - unsigned long __abs_address = offsetof(struct lowcore, member); \ - __typeof__(((struct lowcore *)0)->member) __tmp = (x); \ - \ - memcpy_absolute(__va(__abs_address), &__tmp, sizeof(__tmp)); \ -} while (0) - -#define get_abs_lowcore(x, member) do { \ - unsigned long __abs_address = offsetof(struct lowcore, member); \ - __typeof__(((struct lowcore *)0)->member) *__ptr = &(x); \ - \ - memcpy_absolute(__ptr, __va(__abs_address), sizeof(*__ptr)); \ -} while (0) extern int s390_isolate_bp(void); extern int s390_isolate_bp_guest(void); diff --git a/arch/s390/kernel/Makefile b/arch/s390/kernel/Makefile index 3cbfa9fddd9a..45e4b2f41e05 100644 --- a/arch/s390/kernel/Makefile +++ b/arch/s390/kernel/Makefile @@ -40,7 +40,7 @@ obj-y += sysinfo.o lgr.o os_info.o machine_kexec.o obj-y += runtime_instr.o cache.o fpu.o dumpstack.o guarded_storage.o sthyi.o obj-y += entry.o reipl.o relocate_kernel.o kdebugfs.o alternative.o obj-y += nospec-branch.o ipl_vmparm.o machine_kexec_reloc.o unwind_bc.o -obj-y += smp.o text_amode31.o stacktrace.o +obj-y += smp.o text_amode31.o stacktrace.o abs_lowcore.o extra-y += head64.o vmlinux.lds diff --git a/arch/s390/kernel/abs_lowcore.c b/arch/s390/kernel/abs_lowcore.c new file mode 100644 index 000000000000..fb92e8ed0525 --- /dev/null +++ b/arch/s390/kernel/abs_lowcore.c @@ -0,0 +1,95 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include +#include + +#define ABS_LOWCORE_UNMAPPED 1 +#define ABS_LOWCORE_LAP_ON 2 +#define ABS_LOWCORE_IRQS_ON 4 + +unsigned long __bootdata_preserved(__abs_lowcore); +bool __ro_after_init abs_lowcore_mapped; + +int abs_lowcore_map(int cpu, struct lowcore *lc, bool alloc) +{ + unsigned long addr = __abs_lowcore + (cpu * sizeof(struct lowcore)); + unsigned long phys = __pa(lc); + int rc, i; + + for (i = 0; i < LC_PAGES; i++) { + rc = __vmem_map_4k_page(addr, phys, PAGE_KERNEL, alloc); + if (rc) { + /* + * Do not unmap allocated page tables in case the + * allocation was not requested. In such a case the + * request is expected coming from an atomic context, + * while the unmap attempt might sleep. 
+ */ + if (alloc) { + for (--i; i >= 0; i--) { + addr -= PAGE_SIZE; + vmem_unmap_4k_page(addr); + } + } + return rc; + } + addr += PAGE_SIZE; + phys += PAGE_SIZE; + } + return 0; +} + +void abs_lowcore_unmap(int cpu) +{ + unsigned long addr = __abs_lowcore + (cpu * sizeof(struct lowcore)); + int i; + + for (i = 0; i < LC_PAGES; i++) { + vmem_unmap_4k_page(addr); + addr += PAGE_SIZE; + } +} + +struct lowcore *get_abs_lowcore(unsigned long *flags) +{ + unsigned long irq_flags; + union ctlreg0 cr0; + int cpu; + + *flags = 0; + cpu = get_cpu(); + if (abs_lowcore_mapped) { + return ((struct lowcore *)__abs_lowcore) + cpu; + } else { + if (cpu != 0) + panic("Invalid unmapped absolute lowcore access\n"); + local_irq_save(irq_flags); + if (!irqs_disabled_flags(irq_flags)) + *flags |= ABS_LOWCORE_IRQS_ON; + __ctl_store(cr0.val, 0, 0); + if (cr0.lap) { + *flags |= ABS_LOWCORE_LAP_ON; + __ctl_clear_bit(0, 28); + } + *flags |= ABS_LOWCORE_UNMAPPED; + return lowcore_ptr[0]; + } +} + +void put_abs_lowcore(struct lowcore *lc, unsigned long flags) +{ + if (abs_lowcore_mapped) { + if (flags) + panic("Invalid mapped absolute lowcore release\n"); + } else { + if (smp_processor_id() != 0) + panic("Invalid mapped absolute lowcore access\n"); + if (!(flags & ABS_LOWCORE_UNMAPPED)) + panic("Invalid unmapped absolute lowcore release\n"); + if (flags & ABS_LOWCORE_LAP_ON) + __ctl_set_bit(0, 28); + if (flags & ABS_LOWCORE_IRQS_ON) + local_irq_enable(); + } + put_cpu(); +} diff --git a/arch/s390/kernel/ipl.c b/arch/s390/kernel/ipl.c index 1cc85b8ff42e..325cbf69ebbd 100644 --- a/arch/s390/kernel/ipl.c +++ b/arch/s390/kernel/ipl.c @@ -29,6 +29,7 @@ #include #include #include +#include #include #include #include @@ -1642,12 +1643,16 @@ static struct shutdown_action __refdata dump_action = { static void dump_reipl_run(struct shutdown_trigger *trigger) { unsigned long ipib = (unsigned long) reipl_block_actual; + struct lowcore *abs_lc; + unsigned long flags; unsigned int csum; csum = (__force unsigned int) csum_partial(reipl_block_actual, reipl_block_actual->hdr.len, 0); - put_abs_lowcore(ipib, ipib); - put_abs_lowcore(ipib_checksum, csum); + abs_lc = get_abs_lowcore(&flags); + abs_lc->ipib = ipib; + abs_lc->ipib_checksum = csum; + put_abs_lowcore(abs_lc, flags); dump_run(trigger); } diff --git a/arch/s390/kernel/machine_kexec.c b/arch/s390/kernel/machine_kexec.c index ab761c008f98..4579b42286d5 100644 --- a/arch/s390/kernel/machine_kexec.c +++ b/arch/s390/kernel/machine_kexec.c @@ -21,6 +21,7 @@ #include #include #include +#include #include #include #include @@ -222,13 +223,18 @@ void machine_kexec_cleanup(struct kimage *image) void arch_crash_save_vmcoreinfo(void) { + struct lowcore *abs_lc; + unsigned long flags; + VMCOREINFO_SYMBOL(lowcore_ptr); VMCOREINFO_SYMBOL(high_memory); VMCOREINFO_LENGTH(lowcore_ptr, NR_CPUS); vmcoreinfo_append_str("SAMODE31=%lx\n", __samode31); vmcoreinfo_append_str("EAMODE31=%lx\n", __eamode31); vmcoreinfo_append_str("KERNELOFFSET=%lx\n", kaslr_offset()); - put_abs_lowcore(vmcore_info, paddr_vmcoreinfo_note()); + abs_lc = get_abs_lowcore(&flags); + abs_lc->vmcore_info = paddr_vmcoreinfo_note(); + put_abs_lowcore(abs_lc, flags); } void machine_shutdown(void) diff --git a/arch/s390/kernel/os_info.c b/arch/s390/kernel/os_info.c index 1acc2e05d70f..506ccb74d2d0 100644 --- a/arch/s390/kernel/os_info.c +++ b/arch/s390/kernel/os_info.c @@ -13,7 +13,7 @@ #include #include #include -#include +#include #include #include @@ -57,13 +57,16 @@ void os_info_entry_add(int nr, void *ptr, u64 size) */ void 
__init os_info_init(void) { - void *ptr = &os_info; + struct lowcore *abs_lc; + unsigned long flags; os_info.version_major = OS_INFO_VERSION_MAJOR; os_info.version_minor = OS_INFO_VERSION_MINOR; os_info.magic = OS_INFO_MAGIC; os_info.csum = os_info_csum(&os_info); - put_abs_lowcore(os_info, __pa(ptr)); + abs_lc = get_abs_lowcore(&flags); + abs_lc->os_info = __pa(&os_info); + put_abs_lowcore(abs_lc, flags); } #ifdef CONFIG_CRASH_DUMP diff --git a/arch/s390/kernel/setup.c b/arch/s390/kernel/setup.c index 063f0512a64a..8d0a670c9f48 100644 --- a/arch/s390/kernel/setup.c +++ b/arch/s390/kernel/setup.c @@ -58,7 +58,7 @@ #include #include #include -#include +#include #include #include #include @@ -412,8 +412,9 @@ void __init arch_call_rest_init(void) static void __init setup_lowcore_dat_off(void) { unsigned long int_psw_mask = PSW_KERNEL_BITS; + struct lowcore *abs_lc, *lc; unsigned long mcck_stack; - struct lowcore *lc; + unsigned long flags; if (IS_ENABLED(CONFIG_KASAN)) int_psw_mask |= PSW_MASK_DAT; @@ -475,12 +476,14 @@ static void __init setup_lowcore_dat_off(void) lc->restart_data = 0; lc->restart_source = -1U; - put_abs_lowcore(restart_stack, lc->restart_stack); - put_abs_lowcore(restart_fn, lc->restart_fn); - put_abs_lowcore(restart_data, lc->restart_data); - put_abs_lowcore(restart_source, lc->restart_source); - put_abs_lowcore(restart_psw, lc->restart_psw); - put_abs_lowcore(mcesad, lc->mcesad); + abs_lc = get_abs_lowcore(&flags); + abs_lc->restart_stack = lc->restart_stack; + abs_lc->restart_fn = lc->restart_fn; + abs_lc->restart_data = lc->restart_data; + abs_lc->restart_source = lc->restart_source; + abs_lc->restart_psw = lc->restart_psw; + abs_lc->mcesad = lc->mcesad; + put_abs_lowcore(abs_lc, flags); mcck_stack = (unsigned long)memblock_alloc(THREAD_SIZE, THREAD_SIZE); if (!mcck_stack) @@ -501,8 +504,8 @@ static void __init setup_lowcore_dat_off(void) static void __init setup_lowcore_dat_on(void) { - struct lowcore *lc = lowcore_ptr[0]; - int cr; + struct lowcore *abs_lc; + unsigned long flags; __ctl_clear_bit(0, 28); S390_lowcore.external_new_psw.mask |= PSW_MASK_DAT; @@ -511,10 +514,15 @@ static void __init setup_lowcore_dat_on(void) S390_lowcore.io_new_psw.mask |= PSW_MASK_DAT; __ctl_set_bit(0, 28); __ctl_store(S390_lowcore.cregs_save_area, 0, 15); - put_abs_lowcore(restart_flags, RESTART_FLAG_CTLREGS); - put_abs_lowcore(program_new_psw, lc->program_new_psw); - for (cr = 0; cr < ARRAY_SIZE(lc->cregs_save_area); cr++) - put_abs_lowcore(cregs_save_area[cr], lc->cregs_save_area[cr]); + if (abs_lowcore_map(0, lowcore_ptr[0], true)) + panic("Couldn't setup absolute lowcore"); + abs_lowcore_mapped = true; + abs_lc = get_abs_lowcore(&flags); + abs_lc->restart_flags = RESTART_FLAG_CTLREGS; + abs_lc->program_new_psw = S390_lowcore.program_new_psw; + memcpy(abs_lc->cregs_save_area, S390_lowcore.cregs_save_area, + sizeof(abs_lc->cregs_save_area)); + put_abs_lowcore(abs_lc, flags); } static struct resource code_resource = { diff --git a/arch/s390/kernel/smp.c b/arch/s390/kernel/smp.c index 0e8e5546933f..0f646b568d59 100644 --- a/arch/s390/kernel/smp.c +++ b/arch/s390/kernel/smp.c @@ -45,7 +45,7 @@ #include #include #include -#include +#include #include #include #include @@ -212,10 +212,14 @@ static int pcpu_alloc_lowcore(struct pcpu *pcpu, int cpu) lc->preempt_count = PREEMPT_DISABLED; if (nmi_alloc_mcesa(&lc->mcesad)) goto out; + if (abs_lowcore_map(cpu, lc, true)) + goto out_mcesa; lowcore_ptr[cpu] = lc; pcpu_sigp_retry(pcpu, SIGP_SET_PREFIX, __pa(lc)); return 0; +out_mcesa: + 
nmi_free_mcesa(&lc->mcesad); out: stack_free(mcck_stack); stack_free(async_stack); @@ -237,6 +241,7 @@ static void pcpu_free_lowcore(struct pcpu *pcpu) mcck_stack = lc->mcck_stack - STACK_INIT_OFFSET; pcpu_sigp_retry(pcpu, SIGP_SET_PREFIX, 0); lowcore_ptr[cpu] = NULL; + abs_lowcore_unmap(cpu); nmi_free_mcesa(&lc->mcesad); stack_free(async_stack); stack_free(mcck_stack); @@ -315,9 +320,12 @@ static void pcpu_delegate(struct pcpu *pcpu, pcpu_delegate_fn *func, void *data, unsigned long stack) { - struct lowcore *lc = lowcore_ptr[pcpu - pcpu_devices]; - unsigned int source_cpu = stap(); + struct lowcore *lc, *abs_lc; + unsigned int source_cpu; + unsigned long flags; + lc = lowcore_ptr[pcpu - pcpu_devices]; + source_cpu = stap(); __load_psw_mask(PSW_KERNEL_BITS | PSW_MASK_DAT); if (pcpu->address == source_cpu) { call_on_stack(2, stack, void, __pcpu_delegate, @@ -332,10 +340,12 @@ static void pcpu_delegate(struct pcpu *pcpu, lc->restart_data = (unsigned long)data; lc->restart_source = source_cpu; } else { - put_abs_lowcore(restart_stack, stack); - put_abs_lowcore(restart_fn, (unsigned long)func); - put_abs_lowcore(restart_data, (unsigned long)data); - put_abs_lowcore(restart_source, source_cpu); + abs_lc = get_abs_lowcore(&flags); + abs_lc->restart_stack = stack; + abs_lc->restart_fn = (unsigned long)func; + abs_lc->restart_data = (unsigned long)data; + abs_lc->restart_source = source_cpu; + put_abs_lowcore(abs_lc, flags); } __bpon(); asm volatile( @@ -581,6 +591,8 @@ static DEFINE_SPINLOCK(ctl_lock); void smp_ctl_set_clear_bit(int cr, int bit, bool set) { struct ec_creg_mask_parms parms = { .cr = cr, }; + struct lowcore *abs_lc; + unsigned long flags; u64 ctlreg; if (set) { @@ -591,9 +603,11 @@ void smp_ctl_set_clear_bit(int cr, int bit, bool set) parms.andval = ~(1UL << bit); } spin_lock(&ctl_lock); - get_abs_lowcore(ctlreg, cregs_save_area[cr]); + abs_lc = get_abs_lowcore(&flags); + ctlreg = abs_lc->cregs_save_area[cr]; ctlreg = (ctlreg & parms.andval) | parms.orval; - put_abs_lowcore(cregs_save_area[cr], ctlreg); + abs_lc->cregs_save_area[cr] = ctlreg; + put_abs_lowcore(abs_lc, flags); spin_unlock(&ctl_lock); on_each_cpu(smp_ctl_bit_callback, &parms, 1); } @@ -1281,6 +1295,8 @@ int __init smp_reinit_ipl_cpu(void) __ctl_clear_bit(0, 28); /* disable lowcore protection */ S390_lowcore.mcesad = mcesad; __ctl_load(cr0, 0, 0); + if (abs_lowcore_map(0, lc, false)) + panic("Couldn't remap absolute lowcore"); lowcore_ptr[0] = lc; local_mcck_enable(); local_irq_restore(flags); diff --git a/arch/s390/mm/init.c b/arch/s390/mm/init.c index 4a154a084966..97d66a3e60fb 100644 --- a/arch/s390/mm/init.c +++ b/arch/s390/mm/init.c @@ -37,7 +37,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/arch/s390/mm/maccess.c b/arch/s390/mm/maccess.c index d6d84e02f35a..b8451ddbb3d6 100644 --- a/arch/s390/mm/maccess.c +++ b/arch/s390/mm/maccess.c @@ -15,6 +15,7 @@ #include #include #include +#include #include static notrace long s390_kernel_write_odd(void *dst, const void *src, size_t size) @@ -148,46 +149,20 @@ int memcpy_real(void *dest, unsigned long src, size_t count) } /* - * Copy memory in absolute mode (kernel to kernel) + * Find CPU that owns swapped prefix page */ -void memcpy_absolute(void *dest, void *src, size_t count) -{ - unsigned long cr0, flags, prefix; - - flags = arch_local_irq_save(); - __ctl_store(cr0, 0, 0); - __ctl_clear_bit(0, 28); /* disable lowcore protection */ - prefix = store_prefix(); - if (prefix) { - local_mcck_disable(); - set_prefix(0); - 
memcpy(dest, src, count); - set_prefix(prefix); - local_mcck_enable(); - } else { - memcpy(dest, src, count); - } - __ctl_load(cr0, 0, 0); - arch_local_irq_restore(flags); -} - -/* - * Check if physical address is within prefix or zero page - */ -static int is_swapped(phys_addr_t addr) +static int get_swapped_owner(phys_addr_t addr) { phys_addr_t lc; int cpu; - if (addr < sizeof(struct lowcore)) - return 1; for_each_online_cpu(cpu) { lc = virt_to_phys(lowcore_ptr[cpu]); if (addr > lc + sizeof(struct lowcore) - 1 || addr < lc) continue; - return 1; + return cpu; } - return 0; + return -1; } /* @@ -200,17 +175,35 @@ void *xlate_dev_mem_ptr(phys_addr_t addr) { void *ptr = phys_to_virt(addr); void *bounce = ptr; + struct lowcore *abs_lc; + unsigned long flags; unsigned long size; + int this_cpu, cpu; cpus_read_lock(); - preempt_disable(); - if (is_swapped(addr)) { - size = PAGE_SIZE - (addr & ~PAGE_MASK); - bounce = (void *) __get_free_page(GFP_ATOMIC); - if (bounce) - memcpy_absolute(bounce, ptr, size); + this_cpu = get_cpu(); + if (addr >= sizeof(struct lowcore)) { + cpu = get_swapped_owner(addr); + if (cpu < 0) + goto out; } - preempt_enable(); + bounce = (void *)__get_free_page(GFP_ATOMIC); + if (!bounce) + goto out; + size = PAGE_SIZE - (addr & ~PAGE_MASK); + if (addr < sizeof(struct lowcore)) { + abs_lc = get_abs_lowcore(&flags); + ptr = (void *)abs_lc + addr; + memcpy(bounce, ptr, size); + put_abs_lowcore(abs_lc, flags); + } else if (cpu == this_cpu) { + ptr = (void *)(addr - virt_to_phys(lowcore_ptr[cpu])); + memcpy(bounce, ptr, size); + } else { + memcpy(bounce, ptr, size); + } +out: + put_cpu(); cpus_read_unlock(); return bounce; } diff --git a/arch/s390/mm/vmem.c b/arch/s390/mm/vmem.c index 1c86de9dd87f..a50a809e024f 100644 --- a/arch/s390/mm/vmem.c +++ b/arch/s390/mm/vmem.c @@ -560,6 +560,103 @@ int vmem_add_mapping(unsigned long start, unsigned long size) return ret; } +/* + * Allocate new or return existing page-table entry, but do not map it + * to any physical address. If missing, allocate segment- and region- + * table entries along the way. Meeting a large segment- or region-table entry + * while traversing is an error, since the function is expected to be + * called against virtual regions reserved for 4KB mappings only.
+ */ +static pte_t *vmem_get_alloc_pte(unsigned long addr, bool alloc) +{ + pte_t *ptep = NULL; + pgd_t *pgd; + p4d_t *p4d; + pud_t *pud; + pmd_t *pmd; + pte_t *pte; + + pgd = pgd_offset_k(addr); + if (pgd_none(*pgd)) { + if (!alloc) + goto out; + p4d = vmem_crst_alloc(_REGION2_ENTRY_EMPTY); + if (!p4d) + goto out; + pgd_populate(&init_mm, pgd, p4d); + } + p4d = p4d_offset(pgd, addr); + if (p4d_none(*p4d)) { + if (!alloc) + goto out; + pud = vmem_crst_alloc(_REGION3_ENTRY_EMPTY); + if (!pud) + goto out; + p4d_populate(&init_mm, p4d, pud); + } + pud = pud_offset(p4d, addr); + if (pud_none(*pud)) { + if (!alloc) + goto out; + pmd = vmem_crst_alloc(_SEGMENT_ENTRY_EMPTY); + if (!pmd) + goto out; + pud_populate(&init_mm, pud, pmd); + } else if (WARN_ON_ONCE(pud_large(*pud))) { + goto out; + } + pmd = pmd_offset(pud, addr); + if (pmd_none(*pmd)) { + if (!alloc) + goto out; + pte = vmem_pte_alloc(); + if (!pte) + goto out; + pmd_populate(&init_mm, pmd, pte); + } else if (WARN_ON_ONCE(pmd_large(*pmd))) { + goto out; + } + ptep = pte_offset_kernel(pmd, addr); +out: + return ptep; +} + +int __vmem_map_4k_page(unsigned long addr, unsigned long phys, pgprot_t prot, bool alloc) +{ + pte_t *ptep, pte; + + if (!IS_ALIGNED(addr, PAGE_SIZE)) + return -EINVAL; + ptep = vmem_get_alloc_pte(addr, alloc); + if (!ptep) + return -ENOMEM; + __ptep_ipte(addr, ptep, 0, 0, IPTE_GLOBAL); + pte = mk_pte_phys(phys, prot); + set_pte(ptep, pte); + return 0; +} + +int vmem_map_4k_page(unsigned long addr, unsigned long phys, pgprot_t prot) +{ + int rc; + + mutex_lock(&vmem_mutex); + rc = __vmem_map_4k_page(addr, phys, prot, true); + mutex_unlock(&vmem_mutex); + return rc; +} + +void vmem_unmap_4k_page(unsigned long addr) +{ + pte_t *ptep; + + mutex_lock(&vmem_mutex); + ptep = virt_to_kpte(addr); + __ptep_ipte(addr, ptep, 0, 0, IPTE_GLOBAL); + pte_clear(&init_mm, addr, ptep); + mutex_unlock(&vmem_mutex); +} + /* * map whole physical memory to virtual memory (identity mapping) * we reserve enough space in the vmalloc area for vmemmap to hotplug From 50787755317ddf8c9bb3419a15b87c42237000cb Mon Sep 17 00:00:00 2001 From: Alexander Gordeev Date: Wed, 20 Jul 2022 08:32:13 +0200 Subject: [PATCH 08/17] s390/smp,ptdump: add absolute lowcore markers Add "Lowcore Area Start" and "Lowcore Area End" markers that fence pages where absolute lowcore resides. 
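With the markers in place the fenced range shows up in the kernel_page_tables debugfs output roughly as sketched below (illustrative only; the actual addresses and attributes depend on the system):

	---[ Lowcore Area Start ]---
	...
	---[ Lowcore Area End ]---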
Reviewed-by: Heiko Carstens Signed-off-by: Alexander Gordeev Signed-off-by: Vasily Gorbik --- arch/s390/mm/dump_pagetables.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/arch/s390/mm/dump_pagetables.c b/arch/s390/mm/dump_pagetables.c index 4250861f90f5..4e2aa56cf9fd 100644 --- a/arch/s390/mm/dump_pagetables.c +++ b/arch/s390/mm/dump_pagetables.c @@ -8,6 +8,7 @@ #include #include #include +#include #include #include @@ -41,6 +42,8 @@ enum address_markers_idx { VMALLOC_END_NR, MODULES_NR, MODULES_END_NR, + ABS_LOWCORE_NR, + ABS_LOWCORE_END_NR, }; static struct addr_marker address_markers[] = { @@ -66,6 +69,8 @@ static struct addr_marker address_markers[] = { [VMALLOC_END_NR] = {0, "vmalloc Area End"}, [MODULES_NR] = {0, "Modules Area Start"}, [MODULES_END_NR] = {0, "Modules Area End"}, + [ABS_LOWCORE_NR] = {0, "Lowcore Area Start"}, + [ABS_LOWCORE_END_NR] = {0, "Lowcore Area End"}, { -1, NULL } }; @@ -284,6 +289,8 @@ static int pt_dump_init(void) address_markers[AMODE31_END_NR].start_address = __eamode31; address_markers[MODULES_NR].start_address = MODULES_VADDR; address_markers[MODULES_END_NR].start_address = MODULES_END; + address_markers[ABS_LOWCORE_NR].start_address = __abs_lowcore; + address_markers[ABS_LOWCORE_END_NR].start_address = __abs_lowcore + ABS_LOWCORE_MAP_SIZE; address_markers[VMEMMAP_NR].start_address = (unsigned long) vmemmap; address_markers[VMEMMAP_END_NR].start_address = (unsigned long)vmemmap + vmemmap_size; address_markers[VMALLOC_NR].start_address = VMALLOC_START; From 2187582c361fcd33a6ab5e9e6fef5b774c3b8cda Mon Sep 17 00:00:00 2001 From: Niklas Schnelle Date: Wed, 7 Sep 2022 11:38:30 +0200 Subject: [PATCH 09/17] s390/pci: convert high_memory to physical address We use high_memory as a measure for the amount of available memory when determining the required minimum size of our IOVA space, with the assumption that one rarely maps more than the available memory for DMA. In special cases like mapping significant amounts of memory more than once this can still be tuned with the s390_iommu_aperture kernel parameter. In this use case high_memory is treated as a physical address. As high_memory is in fact a virtual address, it needs to be converted using virt_to_phys() before use. Note that at the moment physical and virtual addresses are identical, so this mismatch does not currently cause trouble. Reviewed-by: Matthew Rosato Signed-off-by: Niklas Schnelle Signed-off-by: Vasily Gorbik --- arch/s390/pci/pci_dma.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/s390/pci/pci_dma.c b/arch/s390/pci/pci_dma.c index f46833a25526..227cf0a62800 100644 --- a/arch/s390/pci/pci_dma.c +++ b/arch/s390/pci/pci_dma.c @@ -666,7 +666,7 @@ static int __init dma_alloc_cpu_table_caches(void) int __init zpci_dma_init(void) { - s390_iommu_aperture = (u64)high_memory; + s390_iommu_aperture = (u64)virt_to_phys(high_memory); if (!s390_iommu_aperture_factor) s390_iommu_aperture = ULONG_MAX; else From 14a3a2624285d36624966935ec12f228d876c028 Mon Sep 17 00:00:00 2001 From: Alexander Gordeev Date: Fri, 2 Sep 2022 10:05:08 +0200 Subject: [PATCH 10/17] s390/dump: save IPL CPU registers once DAT is available Function smp_save_dump_cpus() collects the CPU state of a crashed system very differently for secondary CPUs and for the IPL CPU. The Signal Processor stop-and-store-status orders are used for the former, while Hardware System Area requests and the memcpy_real() routine are used for the latter.
In addition a system reset is triggered, which pins the smp_save_dump_cpus() call to a point before CPU and device initialization. Move the collection of IPL CPU state to a later stage when DAT becomes available. That is needed to allow a follow-up rework of the memcpy_real() routine. Reviewed-by: Heiko Carstens Signed-off-by: Alexander Gordeev Signed-off-by: Vasily Gorbik --- arch/s390/include/asm/smp.h | 3 +- arch/s390/kernel/setup.c | 7 +++-- arch/s390/kernel/smp.c | 59 +++++++++++++++++-------------------- 3 files changed, 34 insertions(+), 35 deletions(-) diff --git a/arch/s390/include/asm/smp.h b/arch/s390/include/asm/smp.h index 59cd27255f38..73ed2781073b 100644 --- a/arch/s390/include/asm/smp.h +++ b/arch/s390/include/asm/smp.h @@ -30,7 +30,8 @@ extern void smp_emergency_stop(void); extern int smp_find_processor_id(u16 address); extern int smp_store_status(int cpu); -extern void smp_save_dump_cpus(void); +extern void smp_save_dump_ipl_cpu(void); +extern void smp_save_dump_secondary_cpus(void); extern void smp_yield_cpu(int cpu); extern void smp_cpu_set_polarization(int cpu, int val); extern int smp_cpu_get_polarization(int cpu); diff --git a/arch/s390/kernel/setup.c b/arch/s390/kernel/setup.c index 8d0a670c9f48..d1390899a8e7 100644 --- a/arch/s390/kernel/setup.c +++ b/arch/s390/kernel/setup.c @@ -1028,10 +1028,10 @@ void __init setup_arch(char **cmdline_p) reserve_crashkernel(); #ifdef CONFIG_CRASH_DUMP /* - * Be aware that smp_save_dump_cpus() triggers a system reset. + * Be aware that smp_save_dump_secondary_cpus() triggers a system reset. * Therefore CPU and device initialization should be done afterwards. */ - smp_save_dump_cpus(); + smp_save_dump_secondary_cpus(); #endif setup_resources(); @@ -1056,6 +1056,9 @@ void __init setup_arch(char **cmdline_p) * in lowcore can now run with DAT enabled.
*/ setup_lowcore_dat_on(); +#ifdef CONFIG_CRASH_DUMP + smp_save_dump_ipl_cpu(); +#endif /* Setup default console */ conmode_default(); diff --git a/arch/s390/kernel/smp.c b/arch/s390/kernel/smp.c index 0f646b568d59..14601648914e 100644 --- a/arch/s390/kernel/smp.c +++ b/arch/s390/kernel/smp.c @@ -664,35 +664,36 @@ int smp_store_status(int cpu) * This case does not exist for s390 anymore, setup_arch explicitly * deactivates the elfcorehdr= kernel parameter */ -static __init void smp_save_cpu_vxrs(struct save_area *sa, u16 addr, - bool is_boot_cpu, __vector128 *vxrs) +static bool dump_available(void) { - if (is_boot_cpu) - vxrs = boot_cpu_vector_save_area; - else - __pcpu_sigp_relax(addr, SIGP_STORE_ADDITIONAL_STATUS, __pa(vxrs)); - save_area_add_vxrs(sa, vxrs); + return oldmem_data.start || is_ipl_type_dump(); } -static __init void smp_save_cpu_regs(struct save_area *sa, u16 addr, - bool is_boot_cpu, void *regs) +void __init smp_save_dump_ipl_cpu(void) { - if (is_boot_cpu) - copy_oldmem_kernel(regs, __LC_FPREGS_SAVE_AREA, 512); - else - __pcpu_sigp_relax(addr, SIGP_STORE_STATUS_AT_ADDRESS, __pa(regs)); + struct save_area *sa; + void *regs; + + if (!dump_available()) + return; + sa = save_area_alloc(true); + regs = memblock_alloc(512, 8); + if (!sa || !regs) + panic("could not allocate memory for boot CPU save area\n"); + copy_oldmem_kernel(regs, __LC_FPREGS_SAVE_AREA, 512); save_area_add_regs(sa, regs); + memblock_free(regs, 512); + if (MACHINE_HAS_VX) + save_area_add_vxrs(sa, boot_cpu_vector_save_area); } -void __init smp_save_dump_cpus(void) +void __init smp_save_dump_secondary_cpus(void) { int addr, boot_cpu_addr, max_cpu_addr; struct save_area *sa; - bool is_boot_cpu; void *page; - if (!(oldmem_data.start || is_ipl_type_dump())) - /* No previous system present, normal boot. */ + if (!dump_available()) return; /* Allocate a page as dumping area for the store status sigps */ page = memblock_alloc_low(PAGE_SIZE, PAGE_SIZE); @@ -705,26 +706,20 @@ void __init smp_save_dump_cpus(void) boot_cpu_addr = stap(); max_cpu_addr = SCLP_MAX_CORES << sclp.mtid_prev; for (addr = 0; addr <= max_cpu_addr; addr++) { + if (addr == boot_cpu_addr) + continue; if (__pcpu_sigp_relax(addr, SIGP_SENSE, 0) == SIGP_CC_NOT_OPERATIONAL) continue; - is_boot_cpu = (addr == boot_cpu_addr); - /* Allocate save area */ - sa = save_area_alloc(is_boot_cpu); + sa = save_area_alloc(false); if (!sa) panic("could not allocate memory for save area\n"); - if (MACHINE_HAS_VX) - /* Get the vector registers */ - smp_save_cpu_vxrs(sa, addr, is_boot_cpu, page); - /* - * For a zfcp/nvme dump OLDMEM_BASE == NULL and the registers - * of the boot CPU are stored in the HSA. To retrieve - * these registers an SCLP request is required which is - * done by drivers/s390/char/zcore.c:init_cpu_info() - */ - if (!is_boot_cpu || oldmem_data.start) - /* Get the CPU registers */ - smp_save_cpu_regs(sa, addr, is_boot_cpu, page); + __pcpu_sigp_relax(addr, SIGP_STORE_STATUS_AT_ADDRESS, __pa(page)); + save_area_add_regs(sa, page); + if (MACHINE_HAS_VX) { + __pcpu_sigp_relax(addr, SIGP_STORE_ADDITIONAL_STATUS, __pa(page)); + save_area_add_vxrs(sa, page); + } } memblock_free(page, PAGE_SIZE); diag_amode31_ops.diag308_reset(); From 2f0e8aae26a27fe73d033788f8e92188e7584f41 Mon Sep 17 00:00:00 2001 From: Alexander Gordeev Date: Sun, 24 Jul 2022 15:02:16 +0200 Subject: [PATCH 11/17] s390/mm: rework memcpy_real() to avoid DAT-off mode Function memcpy_real() is a universal data mover that does not require DAT mode to be able to read from a physical address.
Its advantage is the ability to read from any address, even those for which no kernel virtual mapping exists. Although memcpy_real() is interrupt-safe, there are no handlers that make use of this function. The compiler instrumentation has to be disabled and a separate no-DAT stack used to allow execution of the function once DAT mode is disabled. Rework memcpy_real() to overcome these shortcomings. As a result, data copying (which is primarily a user process reading out crashed system memory) is executed on a regular stack with interrupts enabled. Also, use of the memcpy_real_buf swap buffer becomes unnecessary and the swapping is eliminated. The above is achieved by using a fixed virtual address range that spans a single page, which is remapped repeatedly when memcpy_real() is called for a particular physical address. Reviewed-by: Heiko Carstens Signed-off-by: Alexander Gordeev Signed-off-by: Vasily Gorbik --- arch/s390/boot/startup.c | 5 +- arch/s390/include/asm/maccess.h | 14 ++++ arch/s390/include/asm/pgtable.h | 1 + arch/s390/include/asm/processor.h | 2 - arch/s390/kernel/crash_dump.c | 25 +------- arch/s390/kernel/setup.c | 3 +- arch/s390/mm/maccess.c | 103 +++++++++++++----------------- arch/s390/mm/vmem.c | 2 +- drivers/s390/char/zcore.c | 1 + 9 files changed, 69 insertions(+), 87 deletions(-) create mode 100644 arch/s390/include/asm/maccess.h diff --git a/arch/s390/boot/startup.c b/arch/s390/boot/startup.c index 41b7af7a9365..6e7f01ca53e6 100644 --- a/arch/s390/boot/startup.c +++ b/arch/s390/boot/startup.c @@ -17,6 +17,7 @@ unsigned long __bootdata_preserved(__kaslr_offset); unsigned long __bootdata_preserved(__abs_lowcore); +unsigned long __bootdata_preserved(__memcpy_real_area); unsigned long __bootdata(__amode31_base); unsigned long __bootdata_preserved(VMALLOC_START); unsigned long __bootdata_preserved(VMALLOC_END); @@ -182,7 +183,9 @@ static void setup_kernel_memory_layout(void) /* force vmalloc and modules below kasan shadow */ vmax = min(vmax, KASAN_SHADOW_START); #endif - __abs_lowcore = round_down(vmax - ABS_LOWCORE_MAP_SIZE, sizeof(struct lowcore)); + __memcpy_real_area = round_down(vmax - PAGE_SIZE, PAGE_SIZE); + __abs_lowcore = round_down(__memcpy_real_area - ABS_LOWCORE_MAP_SIZE, + sizeof(struct lowcore)); MODULES_END = round_down(__abs_lowcore, _SEGMENT_SIZE); MODULES_VADDR = MODULES_END - MODULES_LEN; VMALLOC_END = MODULES_VADDR; diff --git a/arch/s390/include/asm/maccess.h b/arch/s390/include/asm/maccess.h new file mode 100644 index 000000000000..21ebd9bf34cc --- /dev/null +++ b/arch/s390/include/asm/maccess.h @@ -0,0 +1,14 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef __ASM_S390_MACCESS_H +#define __ASM_S390_MACCESS_H + +#include + +struct iov_iter; + +extern unsigned long __memcpy_real_area; +void memcpy_real_init(void); +size_t memcpy_real_iter(struct iov_iter *iter, unsigned long src, size_t count); +int memcpy_real(void *dest, unsigned long src, size_t count); + +#endif /* __ASM_S390_MACCESS_H */ diff --git a/arch/s390/include/asm/pgtable.h b/arch/s390/include/asm/pgtable.h index 45617c966202..f1cb9391190d 100644 --- a/arch/s390/include/asm/pgtable.h +++ b/arch/s390/include/asm/pgtable.h @@ -1780,6 +1780,7 @@ extern void vmem_remove_mapping(unsigned long start, unsigned long size); extern int __vmem_map_4k_page(unsigned long addr, unsigned long phys, pgprot_t prot, bool alloc); extern int vmem_map_4k_page(unsigned long addr, unsigned long phys, pgprot_t prot); extern void vmem_unmap_4k_page(unsigned long addr); +extern pte_t *vmem_get_alloc_pte(unsigned
long addr, bool alloc); extern int s390_enable_sie(void); extern int s390_enable_skey(void); extern void s390_reset_cmma(struct mm_struct *mm); diff --git a/arch/s390/include/asm/processor.h b/arch/s390/include/asm/processor.h index 93677ae89e7e..4eb9b7875e13 100644 --- a/arch/s390/include/asm/processor.h +++ b/arch/s390/include/asm/processor.h @@ -306,8 +306,6 @@ static __always_inline void __noreturn disabled_wait(void) #define ARCH_LOW_ADDRESS_LIMIT 0x7fffffffUL -extern int memcpy_real(void *, unsigned long, size_t); - extern int s390_isolate_bp(void); extern int s390_isolate_bp_guest(void); diff --git a/arch/s390/kernel/crash_dump.c b/arch/s390/kernel/crash_dump.c index bad8f47fc5d6..438fe696a4a3 100644 --- a/arch/s390/kernel/crash_dump.c +++ b/arch/s390/kernel/crash_dump.c @@ -21,6 +21,7 @@ #include #include #include +#include #define PTR_ADD(x, y) (((char *) (x)) + ((unsigned long) (y))) #define PTR_SUB(x, y) (((char *) (x)) - ((unsigned long) (y))) @@ -53,8 +54,6 @@ struct save_area { }; static LIST_HEAD(dump_save_areas); -static DEFINE_MUTEX(memcpy_real_mutex); -static char memcpy_real_buf[PAGE_SIZE]; /* * Allocate a save area @@ -116,26 +115,6 @@ void __init save_area_add_vxrs(struct save_area *sa, __vector128 *vxrs) memcpy(sa->vxrs_high, vxrs + 16, 16 * sizeof(__vector128)); } -static size_t copy_to_iter_real(struct iov_iter *iter, unsigned long src, size_t count) -{ - size_t len, copied, res = 0; - - mutex_lock(&memcpy_real_mutex); - while (count) { - len = min(PAGE_SIZE, count); - if (memcpy_real(memcpy_real_buf, src, len)) - break; - copied = copy_to_iter(memcpy_real_buf, len, iter); - count -= copied; - src += copied; - res += copied; - if (copied < len) - break; - } - mutex_unlock(&memcpy_real_mutex); - return res; -} - size_t copy_oldmem_iter(struct iov_iter *iter, unsigned long src, size_t count) { size_t len, copied, res = 0; @@ -156,7 +135,7 @@ size_t copy_oldmem_iter(struct iov_iter *iter, unsigned long src, size_t count) } else { len = count; } - copied = copy_to_iter_real(iter, src, len); + copied = memcpy_real_iter(iter, src, len); } count -= copied; src += copied; diff --git a/arch/s390/kernel/setup.c b/arch/s390/kernel/setup.c index d1390899a8e7..ab19ddb09d65 100644 --- a/arch/s390/kernel/setup.c +++ b/arch/s390/kernel/setup.c @@ -74,6 +74,7 @@ #include #include #include +#include #include #include #include "entry.h" @@ -1050,7 +1051,7 @@ void __init setup_arch(char **cmdline_p) * Create kernel page tables and switch to virtual addressing. */ paging_init(); - + memcpy_real_init(); /* * After paging_init created the kernel page table, the new PSWs * in lowcore can now run with DAT enabled. 
diff --git a/arch/s390/mm/maccess.c b/arch/s390/mm/maccess.c index b8451ddbb3d6..bd1bcbb02938 100644 --- a/arch/s390/mm/maccess.c +++ b/arch/s390/mm/maccess.c @@ -12,12 +12,17 @@ #include #include #include +#include #include #include #include #include #include +unsigned long __bootdata_preserved(__memcpy_real_area); +static __ro_after_init pte_t *memcpy_real_ptep; +static DEFINE_MUTEX(memcpy_real_mutex); + static notrace long s390_kernel_write_odd(void *dst, const void *src, size_t size) { unsigned long aligned, offset, count; @@ -77,75 +82,55 @@ notrace void *s390_kernel_write(void *dst, const void *src, size_t size) return dst; } -static int __no_sanitize_address __memcpy_real(void *dest, void *src, size_t count) +void __init memcpy_real_init(void) { - union register_pair _dst, _src; - int rc = -EFAULT; - - _dst.even = (unsigned long) dest; - _dst.odd = (unsigned long) count; - _src.even = (unsigned long) src; - _src.odd = (unsigned long) count; - asm volatile ( - "0: mvcle %[dst],%[src],0\n" - "1: jo 0b\n" - " lhi %[rc],0\n" - "2:\n" - EX_TABLE(1b,2b) - : [rc] "+&d" (rc), [dst] "+&d" (_dst.pair), [src] "+&d" (_src.pair) - : : "cc", "memory"); - return rc; + memcpy_real_ptep = vmem_get_alloc_pte(__memcpy_real_area, true); + if (!memcpy_real_ptep) + panic("Couldn't setup memcpy real area"); } -static unsigned long __no_sanitize_address _memcpy_real(unsigned long dest, - unsigned long src, - unsigned long count) +size_t memcpy_real_iter(struct iov_iter *iter, unsigned long src, size_t count) { - int irqs_disabled, rc; - unsigned long flags; + size_t len, copied, res = 0; + unsigned long phys, offset; + void *chunk; + pte_t pte; - if (!count) - return 0; - flags = arch_local_irq_save(); - irqs_disabled = arch_irqs_disabled_flags(flags); - if (!irqs_disabled) - trace_hardirqs_off(); - __arch_local_irq_stnsm(0xf8); // disable DAT - rc = __memcpy_real((void *) dest, (void *) src, (size_t) count); - if (flags & PSW_MASK_DAT) - __arch_local_irq_stosm(0x04); // enable DAT - if (!irqs_disabled) - trace_hardirqs_on(); - __arch_local_irq_ssm(flags); - return rc; + while (count) { + phys = src & PAGE_MASK; + offset = src & ~PAGE_MASK; + chunk = (void *)(__memcpy_real_area + offset); + len = min(count, PAGE_SIZE - offset); + pte = mk_pte_phys(phys, PAGE_KERNEL_RO); + + mutex_lock(&memcpy_real_mutex); + if (pte_val(pte) != pte_val(*memcpy_real_ptep)) { + __ptep_ipte(__memcpy_real_area, memcpy_real_ptep, 0, 0, IPTE_GLOBAL); + set_pte(memcpy_real_ptep, pte); + } + copied = copy_to_iter(chunk, len, iter); + mutex_unlock(&memcpy_real_mutex); + + count -= copied; + src += copied; + res += copied; + if (copied < len) + break; + } + return res; } -/* - * Copy memory in real mode (kernel to kernel) - */ int memcpy_real(void *dest, unsigned long src, size_t count) { - unsigned long _dest = (unsigned long)dest; - unsigned long _src = (unsigned long)src; - unsigned long _count = (unsigned long)count; - int rc; + struct iov_iter iter; + struct kvec kvec; - if (S390_lowcore.nodat_stack != 0) { - preempt_disable(); - rc = call_on_stack(3, S390_lowcore.nodat_stack, - unsigned long, _memcpy_real, - unsigned long, _dest, - unsigned long, _src, - unsigned long, _count); - preempt_enable(); - return rc; - } - /* - * This is a really early memcpy_real call, the stacks are - * not set up yet. 
Just call _memcpy_real on the early boot - stack */ - return _memcpy_real(_dest, _src, _count); + kvec.iov_base = dest; + kvec.iov_len = count; + iov_iter_kvec(&iter, WRITE, &kvec, 1, count); + if (memcpy_real_iter(&iter, src, count) < count) + return -EFAULT; + return 0; } /* diff --git a/arch/s390/mm/vmem.c b/arch/s390/mm/vmem.c index a50a809e024f..ee1a97078527 100644 --- a/arch/s390/mm/vmem.c +++ b/arch/s390/mm/vmem.c @@ -567,7 +567,7 @@ int vmem_add_mapping(unsigned long start, unsigned long size) * while traversing is an error, since the function is expected to be * called against virtual regions reserved for 4KB mappings only. */ -static pte_t *vmem_get_alloc_pte(unsigned long addr, bool alloc) +pte_t *vmem_get_alloc_pte(unsigned long addr, bool alloc) { pte_t *ptep = NULL; pgd_t *pgd; diff --git a/drivers/s390/char/zcore.c b/drivers/s390/char/zcore.c index f6da215ccf9f..6165e6aae762 100644 --- a/drivers/s390/char/zcore.c +++ b/drivers/s390/char/zcore.c @@ -30,6 +30,7 @@ #include #include #include +#include #include "sclp.h" #define TRACE(x...) debug_sprintf_event(zcore_dbf, 1, x) From c0ceb94403880840effeb1990a19357a1232578f Mon Sep 17 00:00:00 2001 From: Alexander Gordeev Date: Sun, 4 Sep 2022 10:32:32 +0200 Subject: [PATCH 12/17] s390/mm,ptdump: add real memory copy page markers Add "Real Memory Copy Area Start" and "Real Memory Copy Area End" markers that fence the page used for real memory copying. Signed-off-by: Alexander Gordeev Signed-off-by: Vasily Gorbik --- arch/s390/mm/dump_pagetables.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/arch/s390/mm/dump_pagetables.c b/arch/s390/mm/dump_pagetables.c index 4e2aa56cf9fd..9953819d7959 100644 --- a/arch/s390/mm/dump_pagetables.c +++ b/arch/s390/mm/dump_pagetables.c @@ -11,6 +11,7 @@ #include #include #include +#include static unsigned long max_addr; @@ -44,6 +45,8 @@ enum address_markers_idx { MODULES_END_NR, ABS_LOWCORE_NR, ABS_LOWCORE_END_NR, + MEMCPY_REAL_NR, + MEMCPY_REAL_END_NR, }; static struct addr_marker address_markers[] = { @@ -71,6 +74,8 @@ static struct addr_marker address_markers[] = { [MODULES_END_NR] = {0, "Modules Area End"}, [ABS_LOWCORE_NR] = {0, "Lowcore Area Start"}, [ABS_LOWCORE_END_NR] = {0, "Lowcore Area End"}, + [MEMCPY_REAL_NR] = {0, "Real Memory Copy Area Start"}, + [MEMCPY_REAL_END_NR] = {0, "Real Memory Copy Area End"}, { -1, NULL } }; @@ -291,6 +296,8 @@ static int pt_dump_init(void) address_markers[MODULES_END_NR].start_address = MODULES_END; address_markers[ABS_LOWCORE_NR].start_address = __abs_lowcore; address_markers[ABS_LOWCORE_END_NR].start_address = __abs_lowcore + ABS_LOWCORE_MAP_SIZE; + address_markers[MEMCPY_REAL_NR].start_address = __memcpy_real_area; + address_markers[MEMCPY_REAL_END_NR].start_address = __memcpy_real_area + PAGE_SIZE; address_markers[VMEMMAP_NR].start_address = (unsigned long) vmemmap; address_markers[VMEMMAP_END_NR].start_address = (unsigned long)vmemmap + vmemmap_size; address_markers[VMALLOC_NR].start_address = VMALLOC_START; From fba07cd4dd8fb4833015801a83f945b2d65a5c4b Mon Sep 17 00:00:00 2001 From: Alexander Gordeev Date: Thu, 8 Sep 2022 14:23:02 +0200 Subject: [PATCH 13/17] s390/mm: uninline copy_oldmem_kernel() function Uninline the copy_oldmem_kernel() function and make it consistent with the very similar memcpy_real() implementation, by moving the code to crash_dump.c, where it actually belongs.
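For reference (not itself part of the diff), both copy_oldmem_kernel() and memcpy_real() now reduce to the same kvec-based wrapper around their *_iter() helper; this sketch only distills the pattern visible in the hunks that follow:

	struct iov_iter iter;
	struct kvec kvec;

	kvec.iov_base = dst;
	kvec.iov_len = count;
	iov_iter_kvec(&iter, WRITE, &kvec, 1, count);
	/* a short copy means part of the source range was inaccessible */
	if (copy_oldmem_iter(&iter, src, count) < count)
		return -EFAULT;
	return 0;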
Reviewed-by: Heiko Carstens Signed-off-by: Alexander Gordeev Signed-off-by: Vasily Gorbik --- arch/s390/include/asm/maccess.h | 3 +++ arch/s390/include/asm/os_info.h | 14 -------------- arch/s390/kernel/crash_dump.c | 15 ++++++++++++++- arch/s390/kernel/os_info.c | 1 + arch/s390/kernel/smp.c | 1 + 5 files changed, 19 insertions(+), 15 deletions(-) diff --git a/arch/s390/include/asm/maccess.h b/arch/s390/include/asm/maccess.h index 21ebd9bf34cc..c7fa838cf6b9 100644 --- a/arch/s390/include/asm/maccess.h +++ b/arch/s390/include/asm/maccess.h @@ -10,5 +10,8 @@ extern unsigned long __memcpy_real_area; void memcpy_real_init(void); size_t memcpy_real_iter(struct iov_iter *iter, unsigned long src, size_t count); int memcpy_real(void *dest, unsigned long src, size_t count); +#ifdef CONFIG_CRASH_DUMP +int copy_oldmem_kernel(void *dst, unsigned long src, size_t count); +#endif #endif /* __ASM_S390_MACCESS_H */ diff --git a/arch/s390/include/asm/os_info.h b/arch/s390/include/asm/os_info.h index 85248d8fee0c..0d1c74a7a650 100644 --- a/arch/s390/include/asm/os_info.h +++ b/arch/s390/include/asm/os_info.h @@ -41,20 +41,6 @@ u32 os_info_csum(struct os_info *os_info); #ifdef CONFIG_CRASH_DUMP void *os_info_old_entry(int nr, unsigned long *size); -size_t copy_oldmem_iter(struct iov_iter *iter, unsigned long src, size_t count); - -static inline int copy_oldmem_kernel(void *dst, unsigned long src, size_t count) -{ - struct iov_iter iter; - struct kvec kvec; - - kvec.iov_base = dst; - kvec.iov_len = count; - iov_iter_kvec(&iter, WRITE, &kvec, 1, count); - if (copy_oldmem_iter(&iter, src, count) < count) - return -EFAULT; - return 0; -} #else static inline void *os_info_old_entry(int nr, unsigned long *size) { diff --git a/arch/s390/kernel/crash_dump.c b/arch/s390/kernel/crash_dump.c index 438fe696a4a3..dd74fe664ed1 100644 --- a/arch/s390/kernel/crash_dump.c +++ b/arch/s390/kernel/crash_dump.c @@ -115,7 +115,7 @@ void __init save_area_add_vxrs(struct save_area *sa, __vector128 *vxrs) memcpy(sa->vxrs_high, vxrs + 16, 16 * sizeof(__vector128)); } -size_t copy_oldmem_iter(struct iov_iter *iter, unsigned long src, size_t count) +static size_t copy_oldmem_iter(struct iov_iter *iter, unsigned long src, size_t count) { size_t len, copied, res = 0; @@ -146,6 +146,19 @@ size_t copy_oldmem_iter(struct iov_iter *iter, unsigned long src, size_t count) return res; } +int copy_oldmem_kernel(void *dst, unsigned long src, size_t count) +{ + struct iov_iter iter; + struct kvec kvec; + + kvec.iov_base = dst; + kvec.iov_len = count; + iov_iter_kvec(&iter, WRITE, &kvec, 1, count); + if (copy_oldmem_iter(&iter, src, count) < count) + return -EFAULT; + return 0; +} + /* * Copy one page from "oldmem" */ diff --git a/arch/s390/kernel/os_info.c b/arch/s390/kernel/os_info.c index 506ccb74d2d0..ec0bd9457e90 100644 --- a/arch/s390/kernel/os_info.c +++ b/arch/s390/kernel/os_info.c @@ -15,6 +15,7 @@ #include #include #include +#include #include /* diff --git a/arch/s390/kernel/smp.c b/arch/s390/kernel/smp.c index 14601648914e..0031325ce4bc 100644 --- a/arch/s390/kernel/smp.c +++ b/arch/s390/kernel/smp.c @@ -55,6 +55,7 @@ #include #include #include +#include #include "entry.h" enum { From 9267bdd8194f8166ddfddd3d5577b6ba1632a165 Mon Sep 17 00:00:00 2001 From: Alexander Gordeev Date: Fri, 16 Sep 2022 11:08:20 +0200 Subject: [PATCH 14/17] s390/mm: fix no previous prototype warnings in maccess.c Fix -Wmissing-prototypes warnings caused by missing maccess.h include. 
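To illustrate the class of warning being fixed (hypothetical file names, not taken from the tree): with -Wmissing-prototypes, the compiler warns whenever a non-static function is defined without a prototype in scope at the definition site.

	/* foo.h */
	int foo(void);

	/*
	 * foo.c: defining foo() without including foo.h triggers
	 * "warning: no previous prototype for 'foo'"; the include below
	 * makes the prototype visible and silences the warning, which is
	 * exactly what including maccess.h does for maccess.c here.
	 */
	#include "foo.h"

	int foo(void)
	{
		return 0;
	}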
Reported-by: kernel test robot Fixes: 2f0e8aae26a2 ("s390/mm: rework memcpy_real() to avoid DAT-off mode") Signed-off-by: Alexander Gordeev Signed-off-by: Vasily Gorbik --- arch/s390/mm/maccess.c | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/s390/mm/maccess.c b/arch/s390/mm/maccess.c index bd1bcbb02938..1571cdcb0c50 100644 --- a/arch/s390/mm/maccess.c +++ b/arch/s390/mm/maccess.c @@ -18,6 +18,7 @@ #include #include #include +#include unsigned long __bootdata_preserved(__memcpy_real_area); static __ro_after_init pte_t *memcpy_real_ptep; From c432fefe8e6262bf3d288ab82d006cfafa78a139 Mon Sep 17 00:00:00 2001 From: Thomas Richter Date: Thu, 30 Jun 2022 11:53:48 +0200 Subject: [PATCH 15/17] s390/pai: Add support for PAI Extension 1 NNPA counters PMU device driver perf_paiext supports Processor Activity Instrumentation Extension (PAIE1), available with IBM z16: - maps a 512 byte block to lowcore address 0x1508 called PAIE1 control block. - maps a 1024 byte block at PAIE1 control block entry with index 2. - uses control register bit 14 to enable PAIE1 control block lookup. - turns PAIE1 nnpa counting on and off by setting bit 63 in PAIE1 control block entry with index 2. - creates a sample with raw data on each context switch out when some mapped counters are nonzero. This device driver only supports CPU wide context; no task context is allowed. Support for counting: - one or more counters can be specified using perf stat -e pai_ext/xxx/ where xxx stands for the counter event name. Multiple invocations of this command are possible. The counter names are listed in the /sys/devices/pai_ext/events directory. - one special counter can be specified using perf stat -e pai_ext/NNPA_ALL/ which returns the sum of all incremented nnpa counters. - multiple counting events can run in parallel. Support for Sampling: - one event pai_ext/NNPA_ALL/ is reserved for sampling. The event collects data at context switch out and saves it in the ring buffer. - no multiple invocations are possible. The PAIE1 nnpa counter events are system wide. No task context is supported. Therefore some restrictions documented in function paiext_busy() apply. Extend the qpaci assembly instruction to query supported memory mapped nnpa counters. It returns the number of counters (no holes allowed in that range). PAIE1 nnpa counter events cannot be created when a CPU hot plug add is processed. This means a CPU hot plug add does not get the necessary PAIE1 event to record PAIE1 nnpa counter increments on the newly added CPU. CPU hot plug remove removes the event and terminates the counting of PAIE1 counters immediately.
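A hypothetical session against this interface (counter names and availability depend on the machine generation):

	# list the counter names exported by the driver
	ls /sys/devices/pai_ext/events
	# count all NNPA operations system wide for one second
	perf stat -e pai_ext/NNPA_ALL/ -a -- sleep 1
	# several counting events may run in parallel
	perf stat -e pai_ext/NNPA_ADD/ -e pai_ext/NNPA_MUL/ -a -- sleep 1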
Signed-off-by: Thomas Richter Reviewed-by: Sumanth Korikkar Reviewed-by: Sven Schnelle Signed-off-by: Vasily Gorbik --- arch/s390/include/asm/ctl_reg.h | 3 +- arch/s390/include/asm/lowcore.h | 4 +- arch/s390/include/asm/pai.h | 6 +- arch/s390/kernel/Makefile | 2 +- arch/s390/kernel/perf_pai_ext.c | 671 ++++++++++++++++++++++++++++++++ 5 files changed, 682 insertions(+), 4 deletions(-) create mode 100644 arch/s390/kernel/perf_pai_ext.c diff --git a/arch/s390/include/asm/ctl_reg.h b/arch/s390/include/asm/ctl_reg.h index 267a8f88e143..adf7d8cdac7e 100644 --- a/arch/s390/include/asm/ctl_reg.h +++ b/arch/s390/include/asm/ctl_reg.h @@ -95,7 +95,8 @@ union ctlreg0 { Interruption-Filtering Override */ unsigned long : 3; unsigned long ccc : 1; /* Cryptography counter control */ - unsigned long : 18; + unsigned long pec : 1; /* PAI extension control */ + unsigned long : 17; unsigned long : 3; unsigned long lap : 1; /* Low-address-protection control */ unsigned long : 4; diff --git a/arch/s390/include/asm/lowcore.h b/arch/s390/include/asm/lowcore.h index 26fe5e535728..8aa1f6530a3e 100644 --- a/arch/s390/include/asm/lowcore.h +++ b/arch/s390/include/asm/lowcore.h @@ -203,7 +203,9 @@ struct lowcore { __u8 pad_0x1400[0x1500-0x1400]; /* 0x1400 */ /* Cryptography-counter designation */ __u64 ccd; /* 0x1500 */ - __u8 pad_0x1508[0x1800-0x1508]; /* 0x1508 */ + /* AI-extension counter designation */ + __u64 aicd; /* 0x1508 */ + __u8 pad_0x1510[0x1800-0x1510]; /* 0x1510 */ /* Transaction abort diagnostic block */ struct pgm_tdb pgm_tdb; /* 0x1800 */ diff --git a/arch/s390/include/asm/pai.h b/arch/s390/include/asm/pai.h index 5b7e33ac6f0b..1a8a6b15d121 100644 --- a/arch/s390/include/asm/pai.h +++ b/arch/s390/include/asm/pai.h @@ -17,7 +17,9 @@ struct qpaci_info_block { struct { u64 : 8; u64 num_cc : 8; /* # of supported crypto counters */ - u64 : 48; + u64 : 9; + u64 num_nnpa : 7; /* # of supported NNPA counters */ + u64 : 32; }; }; @@ -42,6 +44,8 @@ static inline int qpaci(struct qpaci_info_block *info) #define PAI_CRYPTO_BASE 0x1000 /* First event number */ #define PAI_CRYPTO_MAXCTR 256 /* Max # of event counters */ #define PAI_CRYPTO_KERNEL_OFFSET 2048 +#define PAI_NNPA_BASE 0x1800 /* First event number */ +#define PAI_NNPA_MAXCTR 128 /* Max # of event counters */ DECLARE_STATIC_KEY_FALSE(pai_key); diff --git a/arch/s390/kernel/Makefile b/arch/s390/kernel/Makefile index 45e4b2f41e05..70b776863f84 100644 --- a/arch/s390/kernel/Makefile +++ b/arch/s390/kernel/Makefile @@ -72,7 +72,7 @@ obj-$(CONFIG_IMA_SECURE_AND_OR_TRUSTED_BOOT) += ima_arch.o obj-$(CONFIG_PERF_EVENTS) += perf_event.o perf_cpum_cf_common.o obj-$(CONFIG_PERF_EVENTS) += perf_cpum_cf.o perf_cpum_sf.o obj-$(CONFIG_PERF_EVENTS) += perf_cpum_cf_events.o perf_regs.o -obj-$(CONFIG_PERF_EVENTS) += perf_pai_crypto.o +obj-$(CONFIG_PERF_EVENTS) += perf_pai_crypto.o perf_pai_ext.o obj-$(CONFIG_TRACEPOINTS) += trace.o obj-$(findstring y, $(CONFIG_PROTECTED_VIRTUALIZATION_GUEST) $(CONFIG_PGSTE)) += uv.o diff --git a/arch/s390/kernel/perf_pai_ext.c b/arch/s390/kernel/perf_pai_ext.c new file mode 100644 index 000000000000..d5c7c1e30c17 --- /dev/null +++ b/arch/s390/kernel/perf_pai_ext.c @@ -0,0 +1,671 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Performance event support - Processor Activity Instrumentation Extension + * Facility + * + * Copyright IBM Corp. 
2022 + * Author(s): Thomas Richter + */ +#define KMSG_COMPONENT "pai_ext" +#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt + +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +#define PAIE1_CB_SZ 0x200 /* Size of PAIE1 control block */ +#define PAIE1_CTRBLOCK_SZ 0x400 /* Size of PAIE1 counter blocks */ + +static debug_info_t *paiext_dbg; +static unsigned int paiext_cnt; /* Extracted with QPACI instruction */ + +enum paiext_mode { + PAI_MODE_NONE, + PAI_MODE_SAMPLING, + PAI_MODE_COUNTER, +}; + +struct pai_userdata { + u16 num; + u64 value; +} __packed; + +/* Create the PAI extension 1 control block area. + * The PAI extension control block 1 is pointed to by lowcore + * address 0x1508 for each CPU. This control block is 512 bytes in size + * and requires a 512 byte boundary alignment. + */ +struct paiext_cb { /* PAI extension 1 control block */ + u64 header; /* Not used */ + u64 reserved1; + u64 acc; /* Addr to analytics counter control block */ + u8 reserved2[488]; +} __packed; + +struct paiext_map { + unsigned long *area; /* Area for CPU to store counters */ + struct pai_userdata *save; /* Area to store non-zero counters */ + enum paiext_mode mode; /* Type of event */ + unsigned int active_events; /* # of PAI Extension users */ + unsigned int refcnt; + struct perf_event *event; /* Perf event for sampling */ + struct paiext_cb *paiext_cb; /* PAI extension control block area */ +}; + +struct paiext_mapptr { + struct paiext_map *mapptr; +}; + +static struct paiext_root { /* Anchor to per CPU data */ + int refcnt; /* Overall active events */ + struct paiext_mapptr __percpu *mapptr; +} paiext_root; + +/* Free per CPU data when the last event is removed. */ +static void paiext_root_free(void) +{ + if (!--paiext_root.refcnt) { + free_percpu(paiext_root.mapptr); + paiext_root.mapptr = NULL; + } +} + +/* On initialization of first event also allocate per CPU data dynamically. + * Start with an array of pointers, the array size is the maximum number of + * CPUs possible, which might be larger than the number of CPUs currently + * online. + */ +static int paiext_root_alloc(void) +{ + if (++paiext_root.refcnt == 1) { + /* The memory is already zeroed. */ + paiext_root.mapptr = alloc_percpu(struct paiext_mapptr); + if (!paiext_root.mapptr) { + /* Returning without refcnt adjustment is ok. The + * error code is handled by paiext_alloc() which + * decrements refcnt when an event cannot be + * created. + */ + return -ENOMEM; + } + } + return 0; +} + +/* Protects against concurrent increments of the sampler and counter + * members at the same time and prohibits concurrent execution of + * counting and sampling events. + * Ensures that analytics counter block is deallocated only when the + * sampling and counting on that cpu is zero. + * For details see paiext_alloc. + */ +static DEFINE_MUTEX(paiext_reserve_mutex); + +/* Free all memory allocated for event counting/sampling setup */ +static void paiext_free(struct paiext_mapptr *mp) +{ + kfree(mp->mapptr->area); + kfree(mp->mapptr->paiext_cb); + kvfree(mp->mapptr->save); + kfree(mp->mapptr); + mp->mapptr = NULL; +} + +/* Release the PMU if event is the last perf event */ +static void paiext_event_destroy(struct perf_event *event) +{ + struct paiext_mapptr *mp = per_cpu_ptr(paiext_root.mapptr, event->cpu); + struct paiext_map *cpump = mp->mapptr; + + mutex_lock(&paiext_reserve_mutex); + cpump->event = NULL; + if (!--cpump->refcnt) /* Last reference gone */ + paiext_free(mp); + paiext_root_free(); + mutex_unlock(&paiext_reserve_mutex); + debug_sprintf_event(paiext_dbg, 4, "%s cpu %d mapptr %p\n", __func__, + event->cpu, mp->mapptr); + +} + +/* Used to avoid races in checking concurrent access of counting and + * sampling for pai_extension events. + * + * Only one instance of event pai_ext/NNPA_ALL/ for sampling is + * allowed and when this event is running, no counting event is allowed. + * Several counting events are allowed in parallel, but no sampling event + * is allowed while one (or more) counting events are running. + * + * This function is called in process context and it is safe to block. + * When the event initialization function fails, no other callback will + * be invoked. + * + * Allocate the memory for the event. + */ +static int paiext_alloc(struct perf_event_attr *a, struct perf_event *event) +{ + struct paiext_mapptr *mp; + struct paiext_map *cpump; + int rc; + + mutex_lock(&paiext_reserve_mutex); + + rc = paiext_root_alloc(); + if (rc) + goto unlock; + + mp = per_cpu_ptr(paiext_root.mapptr, event->cpu); + cpump = mp->mapptr; + if (!cpump) { /* Paiext_map allocated? */ + rc = -ENOMEM; + cpump = kzalloc(sizeof(*cpump), GFP_KERNEL); + if (!cpump) + goto unlock; + + /* Allocate memory for counter area and counter extraction. + * These are + * - a 512 byte block and requires 512 byte boundary alignment. + * - a 1KB block and requires 1KB boundary alignment. + * Only the first counting event has to allocate the area. + * + * Note: This works with commit 59bb47985c1d by default. + * Backporting this to kernels without this commit might + * need adjustment. + */ + mp->mapptr = cpump; + cpump->area = kzalloc(PAIE1_CTRBLOCK_SZ, GFP_KERNEL); + cpump->paiext_cb = kzalloc(PAIE1_CB_SZ, GFP_KERNEL); + cpump->save = kvmalloc_array(paiext_cnt + 1, + sizeof(struct pai_userdata), + GFP_KERNEL); + if (!cpump->save || !cpump->area || !cpump->paiext_cb) { + paiext_free(mp); + goto unlock; + } + cpump->mode = a->sample_period ? PAI_MODE_SAMPLING + : PAI_MODE_COUNTER; + } else { + /* Multiple invocation, check what's active. + * Supported are multiple counter events or only one sampling + * event concurrently at any one time. + */ + if (cpump->mode == PAI_MODE_SAMPLING || + (cpump->mode == PAI_MODE_COUNTER && a->sample_period)) { + rc = -EBUSY; + goto unlock; + } + } + + rc = 0; + cpump->event = event; + ++cpump->refcnt; + +unlock: + if (rc) { + /* Error in allocation of event, decrement anchor. Since + * the event is not created, its destroy() function is never + * invoked. Adjust the reference counter for the anchor. + */ + paiext_root_free(); + } + mutex_unlock(&paiext_reserve_mutex); + /* If rc is non-zero, no increment of counter/sampler was done. */ + return rc; +} + +/* The PAI extension 1 control block supports up to 128 entries. Return + * the index within PAIE1_CB given the event number.
Also validate the event + * number. + */ +static int paiext_event_valid(struct perf_event *event) +{ + u64 cfg = event->attr.config; + + if (cfg >= PAI_NNPA_BASE && cfg <= PAI_NNPA_BASE + paiext_cnt) { + /* Offset NNPA in paiext_cb */ + event->hw.config_base = offsetof(struct paiext_cb, acc); + return 0; + } + return -EINVAL; +} + +/* Might be called on a different CPU than the one the event is intended for. */ +static int paiext_event_init(struct perf_event *event) +{ + struct perf_event_attr *a = &event->attr; + int rc; + + /* PMU pai_ext registered as PERF_TYPE_RAW, check event type */ + if (a->type != PERF_TYPE_RAW && event->pmu->type != a->type) + return -ENOENT; + /* PAI extension event must be valid and in supported range */ + rc = paiext_event_valid(event); + if (rc) + return rc; + /* Allow only CPU wide operation, no process context for now. */ + if (event->hw.target || event->cpu == -1) + return -ENOENT; + /* Allow only event NNPA_ALL for sampling. */ + if (a->sample_period && a->config != PAI_NNPA_BASE) + return -EINVAL; + /* Prohibit exclude_user event selection */ + if (a->exclude_user) + return -EINVAL; + + rc = paiext_alloc(a, event); + if (rc) + return rc; + event->hw.last_tag = 0; + event->destroy = paiext_event_destroy; + + if (a->sample_period) { + a->sample_period = 1; + a->freq = 0; + /* Register for paiext_sched_task() to be called */ + event->attach_state |= PERF_ATTACH_SCHED_CB; + /* Add raw data which are the memory mapped counters */ + a->sample_type |= PERF_SAMPLE_RAW; + /* Turn off inheritance */ + a->inherit = 0; + } + + return 0; +} + +static u64 paiext_getctr(struct paiext_map *cpump, int nr) +{ + return cpump->area[nr]; +} + +/* Read the counter values. Return value from location in buffer. For event + * NNPA_ALL sum up all events. + */ +static u64 paiext_getdata(struct perf_event *event) +{ + struct paiext_mapptr *mp = this_cpu_ptr(paiext_root.mapptr); + struct paiext_map *cpump = mp->mapptr; + u64 sum = 0; + int i; + + if (event->attr.config != PAI_NNPA_BASE) + return paiext_getctr(cpump, event->attr.config - PAI_NNPA_BASE); + + for (i = 1; i <= paiext_cnt; i++) + sum += paiext_getctr(cpump, i); + + return sum; +} + +static u64 paiext_getall(struct perf_event *event) +{ + return paiext_getdata(event); +} + +static void paiext_read(struct perf_event *event) +{ + u64 prev, new, delta; + + prev = local64_read(&event->hw.prev_count); + new = paiext_getall(event); + local64_set(&event->hw.prev_count, new); + delta = new - prev; + local64_add(delta, &event->count); +} + +static void paiext_start(struct perf_event *event, int flags) +{ + u64 sum; + + if (event->hw.last_tag) + return; + event->hw.last_tag = 1; + sum = paiext_getall(event); /* Get current value */ + local64_set(&event->hw.prev_count, sum); + local64_set(&event->count, 0); +} + +static int paiext_add(struct perf_event *event, int flags) +{ + struct paiext_mapptr *mp = this_cpu_ptr(paiext_root.mapptr); + struct paiext_map *cpump = mp->mapptr; + struct paiext_cb *pcb = cpump->paiext_cb; + + if (++cpump->active_events == 1) { + S390_lowcore.aicd = virt_to_phys(cpump->paiext_cb); + pcb->acc = virt_to_phys(cpump->area) | 0x1; + /* Enable CPU instruction lookup for PAIE1 control block */ + __ctl_set_bit(0, 49); + debug_sprintf_event(paiext_dbg, 4, "%s 1508 %llx acc %llx\n", + __func__, S390_lowcore.aicd, pcb->acc); + } + if (flags & PERF_EF_START && !event->attr.sample_period) { + /* Only counting needs initial counter value */ + paiext_start(event, PERF_EF_RELOAD); + } + event->hw.state = 0; + if (event->attr.sample_period) { + cpump->event = event; + perf_sched_cb_inc(event->pmu); + } + return 0; +} + +static void paiext_stop(struct perf_event *event, int flags) +{ + paiext_read(event); + event->hw.state = PERF_HES_STOPPED; +} + +static void paiext_del(struct perf_event *event, int flags) +{ + struct paiext_mapptr *mp = this_cpu_ptr(paiext_root.mapptr); + struct paiext_map *cpump = mp->mapptr; + struct paiext_cb *pcb = cpump->paiext_cb; + + if (event->attr.sample_period) + perf_sched_cb_dec(event->pmu); + if (!event->attr.sample_period) { + /* Only counting needs to read counter */ + paiext_stop(event, PERF_EF_UPDATE); + } + if (--cpump->active_events == 0) { + /* Disable CPU instruction lookup for PAIE1 control block */ + __ctl_clear_bit(0, 49); + pcb->acc = 0; + S390_lowcore.aicd = 0; + debug_sprintf_event(paiext_dbg, 4, "%s 1508 %llx acc %llx\n", + __func__, S390_lowcore.aicd, pcb->acc); + } +} + +/* Create raw data and save it in buffer. Returns number of bytes copied. + * Saves only positive counter entries of the form + * 2 bytes: Number of counter + * 8 bytes: Value of counter + */ +static size_t paiext_copy(struct paiext_map *cpump) +{ + struct pai_userdata *userdata = cpump->save; + int i, outidx = 0; + + for (i = 1; i <= paiext_cnt; i++) { + u64 val = paiext_getctr(cpump, i); + + if (val) { + userdata[outidx].num = i; + userdata[outidx].value = val; + outidx++; + } + } + return outidx * sizeof(*userdata); +} + +/* Write a sample when one or more counter values are nonzero. + * + * Note: The functions paiext_sched_task() and paiext_push_sample() are not + * invoked after function paiext_del() has been called because of function + * perf_sched_cb_dec(). The functions paiext_sched_task() and paiext_push_sample() are only + * called when sampling is active. Function perf_sched_cb_inc() + * has been invoked to install function paiext_sched_task() as callback + * to run at context switch time (see paiext_add()). + * + * This causes function perf_event_context_sched_out() and + * perf_event_context_sched_in() to check whether the PMU has installed a + * sched_task() callback. That callback is not active after paiext_del() + * returns and has deleted the event on that CPU. + */ +static int paiext_push_sample(void) +{ + struct paiext_mapptr *mp = this_cpu_ptr(paiext_root.mapptr); + struct paiext_map *cpump = mp->mapptr; + struct perf_event *event = cpump->event; + struct perf_sample_data data; + struct perf_raw_record raw; + struct pt_regs regs; + size_t rawsize; + int overflow; + + rawsize = paiext_copy(cpump); + if (!rawsize) /* No incremented counters */ + return 0; + + /* Setup perf sample */ + memset(&regs, 0, sizeof(regs)); + memset(&raw, 0, sizeof(raw)); + memset(&data, 0, sizeof(data)); + perf_sample_data_init(&data, 0, event->hw.last_period); + if (event->attr.sample_type & PERF_SAMPLE_TID) { + data.tid_entry.pid = task_tgid_nr(current); + data.tid_entry.tid = task_pid_nr(current); + } + if (event->attr.sample_type & PERF_SAMPLE_TIME) + data.time = event->clock(); + if (event->attr.sample_type & (PERF_SAMPLE_ID | PERF_SAMPLE_IDENTIFIER)) + data.id = event->id; + if (event->attr.sample_type & PERF_SAMPLE_CPU) + data.cpu_entry.cpu = smp_processor_id(); + if (event->attr.sample_type & PERF_SAMPLE_RAW) { + raw.frag.size = rawsize; + raw.frag.data = cpump->save; + raw.size = raw.frag.size; + data.raw = &raw; + } + + overflow = perf_event_overflow(event, &data, &regs); + perf_event_update_userpage(event); + /* Clear lowcore area after read */ + memset(cpump->area, 0, PAIE1_CTRBLOCK_SZ); + return overflow; +} + +/* Called on schedule-in and schedule-out. No access to event structure, + * but for sampling only event NNPA_ALL is allowed. + */ +static void paiext_sched_task(struct perf_event_context *ctx, bool sched_in) +{ + /* We started with a clean page on event installation. So read out + * results on schedule_out and if page was dirty, clear values. + */ + if (!sched_in) + paiext_push_sample(); +} + +/* Attribute definitions for pai extension 1 interface. As with other CPU + * Measurement Facilities, there is one attribute per mapped counter. + * The number of mapped counters may vary per machine generation. Use + * the QUERY PROCESSOR ACTIVITY COUNTER INFORMATION (QPACI) instruction + * to determine the number of mapped counters. The instruction returns + * a positive number, which is the highest number of supported counters. + * All counters less than this number are also supported; there are no + * holes. A returned number of zero means no support for mapped counters. + * + * The identification of the counter is a unique number. The chosen range + * is 0x1800 + offset in mapped kernel page. + * All CPU Measurement Facility counter identifiers must be unique and + * the numbers from 0 to 496 are already used for the CPU Measurement + * Counter facility. Numbers 0x1000 to 0x103e are used for PAI cryptography + * counters. + * Numbers 0xb0000, 0xbc000 and 0xbd000 are already + * used for the CPU Measurement Sampling facility.
+ */ +PMU_FORMAT_ATTR(event, "config:0-63"); + +static struct attribute *paiext_format_attr[] = { + &format_attr_event.attr, + NULL, +}; + +static struct attribute_group paiext_events_group = { + .name = "events", + .attrs = NULL, /* Filled in attr_event_init() */ +}; + +static struct attribute_group paiext_format_group = { + .name = "format", + .attrs = paiext_format_attr, +}; + +static const struct attribute_group *paiext_attr_groups[] = { + &paiext_events_group, + &paiext_format_group, + NULL, +}; + +/* Performance monitoring unit for mapped counters */ +static struct pmu paiext = { + .task_ctx_nr = perf_invalid_context, + .event_init = paiext_event_init, + .add = paiext_add, + .del = paiext_del, + .start = paiext_start, + .stop = paiext_stop, + .read = paiext_read, + .sched_task = paiext_sched_task, + .attr_groups = paiext_attr_groups, +}; + +/* List of symbolic PAI extension 1 NNPA counter names. */ +static const char * const paiext_ctrnames[] = { + [0] = "NNPA_ALL", + [1] = "NNPA_ADD", + [2] = "NNPA_SUB", + [3] = "NNPA_MUL", + [4] = "NNPA_DIV", + [5] = "NNPA_MIN", + [6] = "NNPA_MAX", + [7] = "NNPA_LOG", + [8] = "NNPA_EXP", + [9] = "NNPA_IBM_RESERVED_9", + [10] = "NNPA_RELU", + [11] = "NNPA_TANH", + [12] = "NNPA_SIGMOID", + [13] = "NNPA_SOFTMAX", + [14] = "NNPA_BATCHNORM", + [15] = "NNPA_MAXPOOL2D", + [16] = "NNPA_AVGPOOL2D", + [17] = "NNPA_LSTMACT", + [18] = "NNPA_GRUACT", + [19] = "NNPA_CONVOLUTION", + [20] = "NNPA_MATMUL_OP", + [21] = "NNPA_MATMUL_OP_BCAST23", + [22] = "NNPA_SMALLBATCH", + [23] = "NNPA_LARGEDIM", + [24] = "NNPA_SMALLTENSOR", + [25] = "NNPA_1MFRAME", + [26] = "NNPA_2GFRAME", + [27] = "NNPA_ACCESSEXCEPT", +}; + +static void __init attr_event_free(struct attribute **attrs, int num) +{ + struct perf_pmu_events_attr *pa; + struct device_attribute *dap; + int i; + + for (i = 0; i < num; i++) { + dap = container_of(attrs[i], struct device_attribute, attr); + pa = container_of(dap, struct perf_pmu_events_attr, attr); + kfree(pa); + } + kfree(attrs); +} + +static int __init attr_event_init_one(struct attribute **attrs, int num) +{ + struct perf_pmu_events_attr *pa; + + pa = kzalloc(sizeof(*pa), GFP_KERNEL); + if (!pa) + return -ENOMEM; + + sysfs_attr_init(&pa->attr.attr); + pa->id = PAI_NNPA_BASE + num; + pa->attr.attr.name = paiext_ctrnames[num]; + pa->attr.attr.mode = 0444; + pa->attr.show = cpumf_events_sysfs_show; + pa->attr.store = NULL; + attrs[num] = &pa->attr.attr; + return 0; +} + +/* Create PMU sysfs event attributes on the fly. 
*/ +static int __init attr_event_init(void) +{ + struct attribute **attrs; + int ret, i; + + attrs = kmalloc_array(ARRAY_SIZE(paiext_ctrnames) + 1, sizeof(*attrs), + GFP_KERNEL); + if (!attrs) + return -ENOMEM; + for (i = 0; i < ARRAY_SIZE(paiext_ctrnames); i++) { + ret = attr_event_init_one(attrs, i); + if (ret) { + attr_event_free(attrs, i); + return ret; + } + } + attrs[i] = NULL; + paiext_events_group.attrs = attrs; + return 0; +} + +static int __init paiext_init(void) +{ + struct qpaci_info_block ib; + int rc = -ENOMEM; + + if (!test_facility(197)) + return 0; + + qpaci(&ib); + paiext_cnt = ib.num_nnpa; + if (paiext_cnt >= PAI_NNPA_MAXCTR) + paiext_cnt = PAI_NNPA_MAXCTR; + if (!paiext_cnt) + return 0; + + rc = attr_event_init(); + if (rc) { + pr_err("Creation of PMU " KMSG_COMPONENT " /sysfs failed\n"); + return rc; + } + + /* Setup s390dbf facility */ + paiext_dbg = debug_register(KMSG_COMPONENT, 2, 256, 128); + if (!paiext_dbg) { + pr_err("Registration of s390dbf " KMSG_COMPONENT " failed\n"); + rc = -ENOMEM; + goto out_init; + } + debug_register_view(paiext_dbg, &debug_sprintf_view); + + rc = perf_pmu_register(&paiext, KMSG_COMPONENT, -1); + if (rc) { + pr_err("Registration of " KMSG_COMPONENT " PMU failed with " + "rc=%i\n", rc); + goto out_pmu; + } + + return 0; + +out_pmu: + debug_unregister_view(paiext_dbg, &debug_sprintf_view); + debug_unregister(paiext_dbg); +out_init: + attr_event_free(paiext_events_group.attrs, + ARRAY_SIZE(paiext_ctrnames) + 1); + return rc; +} + +device_initcall(paiext_init); From 4b39d40ea1a076ac643c7383cdb1cb66617fe860 Mon Sep 17 00:00:00 2001 From: Gaosheng Cui Date: Wed, 14 Sep 2022 12:52:33 +0000 Subject: [PATCH 16/17] s390/cio: remove unused ccw_device_force_console() declaration ccw_device_force_console() has been removed by commit 8cc0dcfdc1c0 ("s390/cio: remove pm support from ccw bus driver"), so remove the declaration, too. Signed-off-by: Gaosheng Cui Acked-by: Vineeth Vijayan Signed-off-by: Vasily Gorbik --- arch/s390/include/asm/ccwdev.h | 1 - 1 file changed, 1 deletion(-) diff --git a/arch/s390/include/asm/ccwdev.h b/arch/s390/include/asm/ccwdev.h index d4e90f2ba77e..bd1596810cc1 100644 --- a/arch/s390/include/asm/ccwdev.h +++ b/arch/s390/include/asm/ccwdev.h @@ -214,7 +214,6 @@ extern struct ccw_device *ccw_device_create_console(struct ccw_driver *); extern void ccw_device_destroy_console(struct ccw_device *); extern int ccw_device_enable_console(struct ccw_device *); extern void ccw_device_wait_idle(struct ccw_device *); -extern int ccw_device_force_console(struct ccw_device *); extern void *ccw_device_dma_zalloc(struct ccw_device *cdev, size_t size); extern void ccw_device_dma_free(struct ccw_device *cdev, From 8fb65e05bd60058e15842e511b3ee5299ac51829 Mon Sep 17 00:00:00 2001 From: Niklas Schnelle Date: Fri, 9 Sep 2022 14:34:41 +0200 Subject: [PATCH 17/17] s390/pci: remove unused bus_next field from struct zpci_dev This field was added in commit 44510d6fa0c0 ("s390/pci: Handling multifunctions") but is an unused remnant of an earlier version where the devices on the virtual bus were connected in a linked list instead of a fixed 256 entry array of pointers. It is also not used for the list of busses as that is threaded through struct zpci_bus, not through struct zpci_dev.
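Schematically (an abbreviated sketch; apart from the removed bus_next field, the surrounding field and macro names are recalled from asm/pci.h and may differ):

	struct zpci_bus {
		/* ... */
		struct zpci_dev *function[ZPCI_FUNCTIONS_PER_BUS]; /* fixed 256-entry array */
	};

	struct zpci_dev {
		struct zpci_bus *zbus;
		struct list_head entry;    /* global list of zpci_devices */
		struct list_head bus_next; /* leftover of the abandoned per-bus linked list */
		/* ... */
	};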
Reviewed-by: Pierre Morel Signed-off-by: Niklas Schnelle Signed-off-by: Vasily Gorbik --- arch/s390/include/asm/pci.h | 1 - 1 file changed, 1 deletion(-) diff --git a/arch/s390/include/asm/pci.h b/arch/s390/include/asm/pci.h index 7b4cdadbc023..108e732d7b14 100644 --- a/arch/s390/include/asm/pci.h +++ b/arch/s390/include/asm/pci.h @@ -117,7 +117,6 @@ struct zpci_bus { struct zpci_dev { struct zpci_bus *zbus; struct list_head entry; /* list of all zpci_devices, needed for hotplug, etc. */ - struct list_head bus_next; struct kref kref; struct hotplug_slot hotplug_slot;