diff --git a/Documentation/devicetree/bindings/interrupt-controller/al,alpine-msix.txt b/Documentation/devicetree/bindings/interrupt-controller/al,alpine-msix.txt new file mode 100644 index 000000000000..f6f1c14bf99b --- /dev/null +++ b/Documentation/devicetree/bindings/interrupt-controller/al,alpine-msix.txt @@ -0,0 +1,26 @@ +Alpine MSIX controller + +See arm,gic-v3.txt for SPI and MSI definitions. + +Required properties: + +- compatible: should be "al,alpine-msix" +- reg: physical base address and size of the registers +- interrupt-parent: specifies the parent interrupt controller. +- interrupt-controller: identifies the node as an interrupt controller +- msi-controller: identifies the node as a PCI Message Signaled Interrupt + controller +- al,msi-base-spi: SPI base of the MSI frame +- al,msi-num-spis: number of SPIs assigned to the MSI frame, relative to SPI0 + +Example: + +msix: msix { + compatible = "al,alpine-msix"; + reg = <0x0 0xfbe00000 0x0 0x100000>; + interrupt-parent = <&gic>; + interrupt-controller; + msi-controller; + al,msi-base-spi = <160>; + al,msi-num-spis = <160>; +}; diff --git a/Documentation/devicetree/bindings/interrupt-controller/arm,gic.txt b/Documentation/devicetree/bindings/interrupt-controller/arm,gic.txt index 5a1cb4bc3dfe..793c20ff8fcc 100644 --- a/Documentation/devicetree/bindings/interrupt-controller/arm,gic.txt +++ b/Documentation/devicetree/bindings/interrupt-controller/arm,gic.txt @@ -16,6 +16,7 @@ Main node required properties: "arm,cortex-a15-gic" "arm,cortex-a7-gic" "arm,cortex-a9-gic" + "arm,eb11mp-gic" "arm,gic-400" "arm,pl390" "arm,tc11mp-gic" diff --git a/Documentation/devicetree/bindings/interrupt-controller/marvell,odmi-controller.txt b/Documentation/devicetree/bindings/interrupt-controller/marvell,odmi-controller.txt new file mode 100644 index 000000000000..8af0a8e613ab --- /dev/null +++ b/Documentation/devicetree/bindings/interrupt-controller/marvell,odmi-controller.txt @@ -0,0 +1,44 @@ + +* Marvell ODMI for MSI support + +Some Marvell SoCs have an On-Die Message Interrupt (ODMI) controller +which can be used by on-board peripherals for MSI interrupts. + +Required properties: + +- compatible : The value here should contain: + + "marvell,ap806-odmi-controller", "marvell,odmi-controller". + +- interrupt-controller : Identifies the node as an interrupt controller. + +- msi-controller : Identifies the node as an MSI controller. + +- marvell,odmi-frames : Number of ODMI frames available. Each frame + provides a number of events. + +- reg : List of register definitions, one for each + ODMI frame. + +- marvell,spi-base : List of GIC base SPI interrupts, one for each + ODMI frame. Those SPI interrupts are 0-based, + i.e. marvell,spi-base = <128> will use SPI #96. + See Documentation/devicetree/bindings/interrupt-controller/arm,gic.txt + for details about the GIC Device Tree binding. + +- interrupt-parent : Reference to the parent interrupt controller.
+ +Example: + + odmi: odmi@300000 { + compatible = "marvell,ap806-odmi-controller", + "marvell,odmi-controller"; + interrupt-controller; + msi-controller; + marvell,odmi-frames = <4>; + reg = <0x300000 0x4000>, + <0x304000 0x4000>, + <0x308000 0x4000>, + <0x30C000 0x4000>; + marvell,spi-base = <128>, <136>, <144>, <152>; + }; diff --git a/Documentation/devicetree/bindings/interrupt-controller/mips-gic.txt b/Documentation/devicetree/bindings/interrupt-controller/mips-gic.txt index aae4c384ee1f..173595305e26 100644 --- a/Documentation/devicetree/bindings/interrupt-controller/mips-gic.txt +++ b/Documentation/devicetree/bindings/interrupt-controller/mips-gic.txt @@ -23,6 +23,12 @@ Optional properties: - mti,reserved-cpu-vectors : Specifies the list of CPU interrupt vectors to which the GIC may not route interrupts. Valid values are 2 - 7. This property is ignored if the CPU is started in EIC mode. +- mti,reserved-ipi-vectors : Specifies the range of GIC interrupts that are + reserved for IPIs. + It accepts 2 values: the first is the starting interrupt and the second is the size + of the reserved range. + If not specified, the driver will allocate the last (2 * number of VPEs in the + system) interrupts. Required properties for timer sub-node: - compatible : Should be "mti,gic-timer". @@ -44,6 +50,7 @@ Example: #interrupt-cells = <3>; mti,reserved-cpu-vectors = <7>; + mti,reserved-ipi-vectors = <40 8>; timer { compatible = "mti,gic-timer"; diff --git a/Documentation/devicetree/bindings/interrupt-controller/sigma,smp8642-intc.txt b/Documentation/devicetree/bindings/interrupt-controller/sigma,smp8642-intc.txt new file mode 100644 index 000000000000..1f441fa0ad40 --- /dev/null +++ b/Documentation/devicetree/bindings/interrupt-controller/sigma,smp8642-intc.txt @@ -0,0 +1,49 @@ +Sigma Designs SMP86xx/SMP87xx secondary interrupt controller + +Required properties: +- compatible: should be "sigma,smp8642-intc" +- reg: physical address of MMIO region +- ranges: address space mapping of child nodes +- interrupt-parent: phandle of parent interrupt controller +- interrupt-controller: boolean +- #address-cells: should be <1> +- #size-cells: should be <1> + +One child node per control block with properties: +- reg: address of registers for this control block +- interrupt-controller: boolean +- #interrupt-cells: should be <2>, interrupt index and flags per interrupts.txt +- interrupts: interrupt spec of primary interrupt controller + +Example: + +interrupt-controller@6e000 { + compatible = "sigma,smp8642-intc"; + reg = <0x6e000 0x400>; + ranges = <0x0 0x6e000 0x400>; + interrupt-parent = <&gic>; + interrupt-controller; + #address-cells = <1>; + #size-cells = <1>; + + irq0: interrupt-controller@0 { + reg = <0x000 0x100>; + interrupt-controller; + #interrupt-cells = <2>; + interrupts = ; + }; + + irq1: interrupt-controller@100 { + reg = <0x100 0x100>; + interrupt-controller; + #interrupt-cells = <2>; + interrupts = ; + }; + + irq2: interrupt-controller@300 { + reg = <0x300 0x100>; + interrupt-controller; + #interrupt-cells = <2>; + interrupts = ; + }; +}; diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt index 8ae47a7b4923..4d9ca7d92a20 100644 --- a/Documentation/kernel-parameters.txt +++ b/Documentation/kernel-parameters.txt @@ -666,7 +666,7 @@ bytes respectively. Such letter suffixes can also be entirely omitted. clearcpuid=BITNUM [X86] Disable CPUID feature X for the kernel.
See - arch/x86/include/asm/cpufeature.h for the valid bit + arch/x86/include/asm/cpufeatures.h for the valid bit numbers. Note the Linux specific bits are not necessarily stable over kernel options, but the vendor specific ones should be. @@ -1687,6 +1687,15 @@ bytes respectively. Such letter suffixes can also be entirely omitted. ip= [IP_PNP] See Documentation/filesystems/nfs/nfsroot.txt. + irqaffinity= [SMP] Set the default irq affinity mask + Format: + <cpu number>,...,<cpu number> + or + <cpu number>-<cpu number> + (must be a positive range in ascending order) + or a mixture + <cpu number>,...,<cpu number>-<cpu number> + irqfixup [HW] When an interrupt is not handled search all handlers for it. Intended to get systems with badly broken @@ -2566,6 +2575,8 @@ bytes respectively. Such letter suffixes can also be entirely omitted. nointroute [IA-64] + noinvpcid [X86] Disable the INVPCID cpu feature. + nojitter [IA-64] Disables jitter checking for ITC timers. no-kvmclock [X86,KVM] Disable paravirtualized KVM clock driver diff --git a/Documentation/ptp/testptp.c b/Documentation/ptp/testptp.c index 6c6247aaa7b9..d99012f41602 100644 --- a/Documentation/ptp/testptp.c +++ b/Documentation/ptp/testptp.c @@ -277,13 +277,15 @@ int main(int argc, char *argv[]) " %d external time stamp channels\n" " %d programmable periodic signals\n" " %d pulse per second\n" - " %d programmable pins\n", + " %d programmable pins\n" + " %d cross timestamping\n", caps.max_adj, caps.n_alarm, caps.n_ext_ts, caps.n_per_out, caps.pps, - caps.n_pins); + caps.n_pins, + caps.cross_timestamping); } } diff --git a/Documentation/x86/early-microcode.txt b/Documentation/x86/early-microcode.txt index d62bea6796da..c956d99cf1de 100644 --- a/Documentation/x86/early-microcode.txt +++ b/Documentation/x86/early-microcode.txt @@ -40,3 +40,28 @@ cp ../microcode.bin kernel/x86/microcode/GenuineIntel.bin (or AuthenticAMD.bin) find . | cpio -o -H newc >../ucode.cpio cd .. cat ucode.cpio /boot/initrd-3.5.0.img >/boot/initrd-3.5.0.ucode.img + +Builtin microcode +================= + +We can also load builtin microcode supplied through the regular firmware +builtin method CONFIG_FIRMWARE_IN_KERNEL. Here's an example: + +CONFIG_FIRMWARE_IN_KERNEL=y +CONFIG_EXTRA_FIRMWARE="intel-ucode/06-3a-09 amd-ucode/microcode_amd_fam15h.bin" +CONFIG_EXTRA_FIRMWARE_DIR="/lib/firmware" + +This basically means you have the following tree structure locally: + +/lib/firmware/ +|-- amd-ucode +... +| |-- microcode_amd_fam15h.bin +... +|-- intel-ucode +... +| |-- 06-3a-09 +... + +so that the build system can find those files and integrate them into +the final kernel image. The early loader finds them and applies them. diff --git a/Documentation/x86/x86_64/boot-options.txt b/Documentation/x86/x86_64/boot-options.txt index 68ed3114c363..0965a71f9942 100644 --- a/Documentation/x86/x86_64/boot-options.txt +++ b/Documentation/x86/x86_64/boot-options.txt @@ -60,6 +60,8 @@ Machine check threshold to 1. Enabling this may make memory predictive failure analysis less effective if the bios sets thresholds for memory errors since we will not see details for all errors.
+ mce=recovery + Force-enable recoverable machine check code paths nomce (for compatibility with i386): same as mce=off diff --git a/MAINTAINERS b/MAINTAINERS index 2061ea77667c..57adf395a61f 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -2422,6 +2422,7 @@ F: arch/mips/bmips/* F: arch/mips/include/asm/mach-bmips/* F: arch/mips/kernel/*bmips* F: arch/mips/boot/dts/brcm/bcm*.dts* +F: drivers/irqchip/irq-bcm63* F: drivers/irqchip/irq-bcm7* F: drivers/irqchip/irq-brcmstb* F: include/linux/bcm963xx_nvram.h diff --git a/arch/alpha/kernel/smp.c b/arch/alpha/kernel/smp.c index 2f24447fef92..46bf263c3153 100644 --- a/arch/alpha/kernel/smp.c +++ b/arch/alpha/kernel/smp.c @@ -168,7 +168,7 @@ smp_callin(void) cpuid, current, current->active_mm)); preempt_disable(); - cpu_startup_entry(CPUHP_ONLINE); + cpu_startup_entry(CPUHP_AP_ONLINE_IDLE); } /* Wait until hwrpb->txrdy is clear for cpu. Return -1 on timeout. */ diff --git a/arch/arc/kernel/smp.c b/arch/arc/kernel/smp.c index 424e937da5c8..4cb3add77c75 100644 --- a/arch/arc/kernel/smp.c +++ b/arch/arc/kernel/smp.c @@ -142,7 +142,7 @@ void start_kernel_secondary(void) local_irq_enable(); preempt_disable(); - cpu_startup_entry(CPUHP_ONLINE); + cpu_startup_entry(CPUHP_AP_ONLINE_IDLE); } /* diff --git a/arch/arm/kernel/smp.c b/arch/arm/kernel/smp.c index 37312f6749f3..baee70267f29 100644 --- a/arch/arm/kernel/smp.c +++ b/arch/arm/kernel/smp.c @@ -409,7 +409,7 @@ asmlinkage void secondary_start_kernel(void) /* * OK, it's off to the idle thread for us */ - cpu_startup_entry(CPUHP_ONLINE); + cpu_startup_entry(CPUHP_AP_ONLINE_IDLE); } void __init smp_cpus_done(unsigned int max_cpus) diff --git a/arch/arm/mach-mvebu/Kconfig b/arch/arm/mach-mvebu/Kconfig index 64e3d2ce9a07..b003e3afd693 100644 --- a/arch/arm/mach-mvebu/Kconfig +++ b/arch/arm/mach-mvebu/Kconfig @@ -3,7 +3,6 @@ menuconfig ARCH_MVEBU depends on ARCH_MULTI_V7 || ARCH_MULTI_V5 select ARCH_SUPPORTS_BIG_ENDIAN select CLKSRC_MMIO - select GENERIC_IRQ_CHIP select PINCTRL select PLAT_ORION select SOC_BUS @@ -29,6 +28,7 @@ config MACH_ARMADA_370 bool "Marvell Armada 370 boards" depends on ARCH_MULTI_V7 select ARMADA_370_CLK + select ARMADA_370_XP_IRQ select CPU_PJ4B select MACH_MVEBU_V7 select PINCTRL_ARMADA_370 @@ -39,6 +39,7 @@ config MACH_ARMADA_370 config MACH_ARMADA_375 bool "Marvell Armada 375 boards" depends on ARCH_MULTI_V7 + select ARMADA_370_XP_IRQ select ARM_ERRATA_720789 select ARM_ERRATA_753970 select ARM_GIC @@ -58,6 +59,7 @@ config MACH_ARMADA_38X select ARM_ERRATA_720789 select ARM_ERRATA_753970 select ARM_GIC + select ARMADA_370_XP_IRQ select ARMADA_38X_CLK select HAVE_ARM_SCU select HAVE_ARM_TWD if SMP @@ -72,6 +74,7 @@ config MACH_ARMADA_39X bool "Marvell Armada 39x boards" depends on ARCH_MULTI_V7 select ARM_GIC + select ARMADA_370_XP_IRQ select ARMADA_39X_CLK select CACHE_L2X0 select HAVE_ARM_SCU @@ -86,6 +89,7 @@ config MACH_ARMADA_39X config MACH_ARMADA_XP bool "Marvell Armada XP boards" depends on ARCH_MULTI_V7 + select ARMADA_370_XP_IRQ select ARMADA_XP_CLK select CPU_PJ4B select MACH_MVEBU_V7 diff --git a/arch/arm64/kernel/smp.c b/arch/arm64/kernel/smp.c index b1adc51b2c2e..460765799c64 100644 --- a/arch/arm64/kernel/smp.c +++ b/arch/arm64/kernel/smp.c @@ -195,7 +195,7 @@ asmlinkage void secondary_start_kernel(void) /* * OK, it's off to the idle thread for us */ - cpu_startup_entry(CPUHP_ONLINE); + cpu_startup_entry(CPUHP_AP_ONLINE_IDLE); } #ifdef CONFIG_HOTPLUG_CPU diff --git a/arch/blackfin/mach-common/smp.c b/arch/blackfin/mach-common/smp.c index 0030e21cfceb..23c4ef5f8bdc 
100644 --- a/arch/blackfin/mach-common/smp.c +++ b/arch/blackfin/mach-common/smp.c @@ -333,7 +333,7 @@ void secondary_start_kernel(void) /* We are done with local CPU inits, unblock the boot CPU. */ set_cpu_online(cpu, true); - cpu_startup_entry(CPUHP_ONLINE); + cpu_startup_entry(CPUHP_AP_ONLINE_IDLE); } void __init smp_prepare_boot_cpu(void) diff --git a/arch/hexagon/kernel/smp.c b/arch/hexagon/kernel/smp.c index ff759f26b96a..983bae7d2665 100644 --- a/arch/hexagon/kernel/smp.c +++ b/arch/hexagon/kernel/smp.c @@ -180,7 +180,7 @@ void start_secondary(void) local_irq_enable(); - cpu_startup_entry(CPUHP_ONLINE); + cpu_startup_entry(CPUHP_AP_ONLINE_IDLE); } diff --git a/arch/ia64/kernel/smpboot.c b/arch/ia64/kernel/smpboot.c index 0e76fad27975..74fe317477e6 100644 --- a/arch/ia64/kernel/smpboot.c +++ b/arch/ia64/kernel/smpboot.c @@ -454,7 +454,7 @@ start_secondary (void *unused) preempt_disable(); smp_callin(); - cpu_startup_entry(CPUHP_ONLINE); + cpu_startup_entry(CPUHP_AP_ONLINE_IDLE); return 0; } diff --git a/arch/m32r/kernel/smpboot.c b/arch/m32r/kernel/smpboot.c index a468467542f4..f98d2f6519d6 100644 --- a/arch/m32r/kernel/smpboot.c +++ b/arch/m32r/kernel/smpboot.c @@ -432,7 +432,7 @@ int __init start_secondary(void *unused) */ local_flush_tlb_all(); - cpu_startup_entry(CPUHP_ONLINE); + cpu_startup_entry(CPUHP_AP_ONLINE_IDLE); return 0; } diff --git a/arch/metag/kernel/smp.c b/arch/metag/kernel/smp.c index c3c6f0864881..bad13232de51 100644 --- a/arch/metag/kernel/smp.c +++ b/arch/metag/kernel/smp.c @@ -396,7 +396,7 @@ asmlinkage void secondary_start_kernel(void) /* * OK, it's off to the idle thread for us */ - cpu_startup_entry(CPUHP_ONLINE); + cpu_startup_entry(CPUHP_AP_ONLINE_IDLE); } void __init smp_cpus_done(unsigned int max_cpus) diff --git a/arch/mips/Kconfig b/arch/mips/Kconfig index d3da79dda629..a65eacf59918 100644 --- a/arch/mips/Kconfig +++ b/arch/mips/Kconfig @@ -151,6 +151,7 @@ config BMIPS_GENERIC select CSRC_R4K select SYNC_R4K select COMMON_CLK + select BCM6345_L1_IRQ select BCM7038_L1_IRQ select BCM7120_L2_IRQ select BRCMSTB_L2_IRQ @@ -2169,7 +2170,6 @@ config MIPS_MT_SMP select CPU_MIPSR2_IRQ_VI select CPU_MIPSR2_IRQ_EI select SYNC_R4K - select MIPS_GIC_IPI if MIPS_GIC select MIPS_MT select SMP select SMP_UP @@ -2267,7 +2267,6 @@ config MIPS_VPE_APSP_API_MT config MIPS_CMP bool "MIPS CMP framework support (DEPRECATED)" depends on SYS_SUPPORTS_MIPS_CMP && !CPU_MIPSR6 - select MIPS_GIC_IPI if MIPS_GIC select SMP select SYNC_R4K select SYS_SUPPORTS_SMP @@ -2287,7 +2286,6 @@ config MIPS_CPS select MIPS_CM select MIPS_CPC select MIPS_CPS_PM if HOTPLUG_CPU - select MIPS_GIC_IPI if MIPS_GIC select SMP select SYNC_R4K if (CEVT_R4K || CSRC_R4K) select SYS_SUPPORTS_HOTPLUG_CPU @@ -2305,10 +2303,6 @@ config MIPS_CPS_PM select MIPS_CPC bool -config MIPS_GIC_IPI - depends on MIPS_GIC - bool - config MIPS_CM bool diff --git a/arch/mips/ath79/irq.c b/arch/mips/ath79/irq.c index 511c06560dc1..2dfff1f19004 100644 --- a/arch/mips/ath79/irq.c +++ b/arch/mips/ath79/irq.c @@ -26,90 +26,6 @@ #include "common.h" #include "machtypes.h" -static void __init ath79_misc_intc_domain_init( - struct device_node *node, int irq); - -static void ath79_misc_irq_handler(struct irq_desc *desc) -{ - struct irq_domain *domain = irq_desc_get_handler_data(desc); - void __iomem *base = domain->host_data; - u32 pending; - - pending = __raw_readl(base + AR71XX_RESET_REG_MISC_INT_STATUS) & - __raw_readl(base + AR71XX_RESET_REG_MISC_INT_ENABLE); - - if (!pending) { - spurious_interrupt(); - return; - } - - while 
(pending) { - int bit = __ffs(pending); - - generic_handle_irq(irq_linear_revmap(domain, bit)); - pending &= ~BIT(bit); - } -} - -static void ar71xx_misc_irq_unmask(struct irq_data *d) -{ - void __iomem *base = irq_data_get_irq_chip_data(d); - unsigned int irq = d->hwirq; - u32 t; - - t = __raw_readl(base + AR71XX_RESET_REG_MISC_INT_ENABLE); - __raw_writel(t | (1 << irq), base + AR71XX_RESET_REG_MISC_INT_ENABLE); - - /* flush write */ - __raw_readl(base + AR71XX_RESET_REG_MISC_INT_ENABLE); -} - -static void ar71xx_misc_irq_mask(struct irq_data *d) -{ - void __iomem *base = irq_data_get_irq_chip_data(d); - unsigned int irq = d->hwirq; - u32 t; - - t = __raw_readl(base + AR71XX_RESET_REG_MISC_INT_ENABLE); - __raw_writel(t & ~(1 << irq), base + AR71XX_RESET_REG_MISC_INT_ENABLE); - - /* flush write */ - __raw_readl(base + AR71XX_RESET_REG_MISC_INT_ENABLE); -} - -static void ar724x_misc_irq_ack(struct irq_data *d) -{ - void __iomem *base = irq_data_get_irq_chip_data(d); - unsigned int irq = d->hwirq; - u32 t; - - t = __raw_readl(base + AR71XX_RESET_REG_MISC_INT_STATUS); - __raw_writel(t & ~(1 << irq), base + AR71XX_RESET_REG_MISC_INT_STATUS); - - /* flush write */ - __raw_readl(base + AR71XX_RESET_REG_MISC_INT_STATUS); -} - -static struct irq_chip ath79_misc_irq_chip = { - .name = "MISC", - .irq_unmask = ar71xx_misc_irq_unmask, - .irq_mask = ar71xx_misc_irq_mask, -}; - -static void __init ath79_misc_irq_init(void) -{ - if (soc_is_ar71xx() || soc_is_ar913x()) - ath79_misc_irq_chip.irq_mask_ack = ar71xx_misc_irq_mask; - else if (soc_is_ar724x() || - soc_is_ar933x() || - soc_is_ar934x() || - soc_is_qca955x()) - ath79_misc_irq_chip.irq_ack = ar724x_misc_irq_ack; - else - BUG(); - - ath79_misc_intc_domain_init(NULL, ATH79_CPU_IRQ(6)); -} static void ar934x_ip2_irq_dispatch(struct irq_desc *desc) { @@ -212,142 +128,12 @@ static void qca955x_irq_init(void) irq_set_chained_handler(ATH79_CPU_IRQ(3), qca955x_ip3_irq_dispatch); } -/* - * The IP2/IP3 lines are tied to a PCI/WMAC/USB device. Drivers for - * these devices typically allocate coherent DMA memory, however the - * DMA controller may still have some unsynchronized data in the FIFO. - * Issue a flush in the handlers to ensure that the driver sees - * the update. - * - * This array map the interrupt lines to the DDR write buffer channels. 
- */ - -static unsigned irq_wb_chan[8] = { - -1, -1, -1, -1, -1, -1, -1, -1, -}; - -asmlinkage void plat_irq_dispatch(void) -{ - unsigned long pending; - int irq; - - pending = read_c0_status() & read_c0_cause() & ST0_IM; - - if (!pending) { - spurious_interrupt(); - return; - } - - pending >>= CAUSEB_IP; - while (pending) { - irq = fls(pending) - 1; - if (irq < ARRAY_SIZE(irq_wb_chan) && irq_wb_chan[irq] != -1) - ath79_ddr_wb_flush(irq_wb_chan[irq]); - do_IRQ(MIPS_CPU_IRQ_BASE + irq); - pending &= ~BIT(irq); - } -} - -static int misc_map(struct irq_domain *d, unsigned int irq, irq_hw_number_t hw) -{ - irq_set_chip_and_handler(irq, &ath79_misc_irq_chip, handle_level_irq); - irq_set_chip_data(irq, d->host_data); - return 0; -} - -static const struct irq_domain_ops misc_irq_domain_ops = { - .xlate = irq_domain_xlate_onecell, - .map = misc_map, -}; - -static void __init ath79_misc_intc_domain_init( - struct device_node *node, int irq) -{ - void __iomem *base = ath79_reset_base; - struct irq_domain *domain; - - domain = irq_domain_add_legacy(node, ATH79_MISC_IRQ_COUNT, - ATH79_MISC_IRQ_BASE, 0, &misc_irq_domain_ops, base); - if (!domain) - panic("Failed to add MISC irqdomain"); - - /* Disable and clear all interrupts */ - __raw_writel(0, base + AR71XX_RESET_REG_MISC_INT_ENABLE); - __raw_writel(0, base + AR71XX_RESET_REG_MISC_INT_STATUS); - - irq_set_chained_handler_and_data(irq, ath79_misc_irq_handler, domain); -} - -static int __init ath79_misc_intc_of_init( - struct device_node *node, struct device_node *parent) -{ - int irq; - - irq = irq_of_parse_and_map(node, 0); - if (!irq) - panic("Failed to get MISC IRQ"); - - ath79_misc_intc_domain_init(node, irq); - return 0; -} - -static int __init ar7100_misc_intc_of_init( - struct device_node *node, struct device_node *parent) -{ - ath79_misc_irq_chip.irq_mask_ack = ar71xx_misc_irq_mask; - return ath79_misc_intc_of_init(node, parent); -} - -IRQCHIP_DECLARE(ar7100_misc_intc, "qca,ar7100-misc-intc", - ar7100_misc_intc_of_init); - -static int __init ar7240_misc_intc_of_init( - struct device_node *node, struct device_node *parent) -{ - ath79_misc_irq_chip.irq_ack = ar724x_misc_irq_ack; - return ath79_misc_intc_of_init(node, parent); -} - -IRQCHIP_DECLARE(ar7240_misc_intc, "qca,ar7240-misc-intc", - ar7240_misc_intc_of_init); - -static int __init ar79_cpu_intc_of_init( - struct device_node *node, struct device_node *parent) -{ - int err, i, count; - - /* Fill the irq_wb_chan table */ - count = of_count_phandle_with_args( - node, "qca,ddr-wb-channels", "#qca,ddr-wb-channel-cells"); - - for (i = 0; i < count; i++) { - struct of_phandle_args args; - u32 irq = i; - - of_property_read_u32_index( - node, "qca,ddr-wb-channel-interrupts", i, &irq); - if (irq >= ARRAY_SIZE(irq_wb_chan)) - continue; - - err = of_parse_phandle_with_args( - node, "qca,ddr-wb-channels", - "#qca,ddr-wb-channel-cells", - i, &args); - if (err) - return err; - - irq_wb_chan[irq] = args.args[0]; - pr_info("IRQ: Set flush channel of IRQ%d to %d\n", - irq, args.args[0]); - } - - return mips_cpu_irq_of_init(node, parent); -} -IRQCHIP_DECLARE(ar79_cpu_intc, "qca,ar7100-cpu-intc", - ar79_cpu_intc_of_init); - void __init arch_init_irq(void) { + unsigned irq_wb_chan2 = -1; + unsigned irq_wb_chan3 = -1; + bool misc_is_ar71xx; + if (mips_machtype == ATH79_MACH_GENERIC_OF) { irqchip_init(); return; @@ -355,14 +141,26 @@ void __init arch_init_irq(void) if (soc_is_ar71xx() || soc_is_ar724x() || soc_is_ar913x() || soc_is_ar933x()) { - irq_wb_chan[2] = 3; - irq_wb_chan[3] = 2; + irq_wb_chan2 = 3; + 
irq_wb_chan3 = 2; } else if (soc_is_ar934x()) { - irq_wb_chan[3] = 2; + irq_wb_chan3 = 2; } - mips_cpu_irq_init(); - ath79_misc_irq_init(); + ath79_cpu_irq_init(irq_wb_chan2, irq_wb_chan3); + + if (soc_is_ar71xx() || soc_is_ar913x()) + misc_is_ar71xx = true; + else if (soc_is_ar724x() || + soc_is_ar933x() || + soc_is_ar934x() || + soc_is_qca955x()) + misc_is_ar71xx = false; + else + BUG(); + ath79_misc_irq_init( + ath79_reset_base + AR71XX_RESET_REG_MISC_INT_STATUS, + ATH79_CPU_IRQ(6), ATH79_MISC_IRQ_BASE, misc_is_ar71xx); if (soc_is_ar934x()) ar934x_ip2_irq_init(); diff --git a/arch/mips/bmips/irq.c b/arch/mips/bmips/irq.c index e7fc6f9348ba..7efefcf44033 100644 --- a/arch/mips/bmips/irq.c +++ b/arch/mips/bmips/irq.c @@ -15,6 +15,12 @@ #include #include +static const struct of_device_id smp_intc_dt_match[] = { + { .compatible = "brcm,bcm7038-l1-intc" }, + { .compatible = "brcm,bcm6345-l1-intc" }, + {} +}; + unsigned int get_c0_compare_int(void) { return CP0_LEGACY_COMPARE_IRQ; @@ -24,8 +30,8 @@ void __init arch_init_irq(void) { struct device_node *dn; - /* Only the STB (bcm7038) controller supports SMP IRQ affinity */ - dn = of_find_compatible_node(NULL, NULL, "brcm,bcm7038-l1-intc"); + /* Only these controllers support SMP IRQ affinity */ + dn = of_find_matching_node(NULL, smp_intc_dt_match); if (dn) of_node_put(dn); else diff --git a/arch/mips/include/asm/mach-ath79/ath79.h b/arch/mips/include/asm/mach-ath79/ath79.h index 2b3487213d1e..441faa92c3cd 100644 --- a/arch/mips/include/asm/mach-ath79/ath79.h +++ b/arch/mips/include/asm/mach-ath79/ath79.h @@ -144,4 +144,8 @@ static inline u32 ath79_reset_rr(unsigned reg) void ath79_device_reset_set(u32 mask); void ath79_device_reset_clear(u32 mask); +void ath79_cpu_irq_init(unsigned irq_wb_chan2, unsigned irq_wb_chan3); +void ath79_misc_irq_init(void __iomem *regs, int irq, + int irq_base, bool is_ar71xx); + #endif /* __ASM_MACH_ATH79_H */ diff --git a/arch/mips/include/asm/smp-ops.h b/arch/mips/include/asm/smp-ops.h index 6ba1fb8b11e2..db7c322f057f 100644 --- a/arch/mips/include/asm/smp-ops.h +++ b/arch/mips/include/asm/smp-ops.h @@ -44,8 +44,9 @@ static inline void plat_smp_setup(void) mp_ops->smp_setup(); } -extern void gic_send_ipi_single(int cpu, unsigned int action); -extern void gic_send_ipi_mask(const struct cpumask *mask, unsigned int action); +extern void mips_smp_send_ipi_single(int cpu, unsigned int action); +extern void mips_smp_send_ipi_mask(const struct cpumask *mask, + unsigned int action); #else /* !CONFIG_SMP */ diff --git a/arch/mips/kernel/Makefile b/arch/mips/kernel/Makefile index 68e2b7db9348..b0988fd62fcc 100644 --- a/arch/mips/kernel/Makefile +++ b/arch/mips/kernel/Makefile @@ -52,7 +52,6 @@ obj-$(CONFIG_MIPS_MT_SMP) += smp-mt.o obj-$(CONFIG_MIPS_CMP) += smp-cmp.o obj-$(CONFIG_MIPS_CPS) += smp-cps.o cps-vec.o obj-$(CONFIG_MIPS_CPS_NS16550) += cps-vec-ns16550.o -obj-$(CONFIG_MIPS_GIC_IPI) += smp-gic.o obj-$(CONFIG_MIPS_SPRAM) += spram.o obj-$(CONFIG_MIPS_VPE_LOADER) += vpe.o diff --git a/arch/mips/kernel/smp-cmp.c b/arch/mips/kernel/smp-cmp.c index d5e0f949dc48..76923349b4fe 100644 --- a/arch/mips/kernel/smp-cmp.c +++ b/arch/mips/kernel/smp-cmp.c @@ -149,8 +149,8 @@ void __init cmp_prepare_cpus(unsigned int max_cpus) } struct plat_smp_ops cmp_smp_ops = { - .send_ipi_single = gic_send_ipi_single, - .send_ipi_mask = gic_send_ipi_mask, + .send_ipi_single = mips_smp_send_ipi_single, + .send_ipi_mask = mips_smp_send_ipi_mask, .init_secondary = cmp_init_secondary, .smp_finish = cmp_smp_finish, .boot_secondary = 
cmp_boot_secondary, diff --git a/arch/mips/kernel/smp-cps.c b/arch/mips/kernel/smp-cps.c index 2ad4e4c96d61..253e1409338c 100644 --- a/arch/mips/kernel/smp-cps.c +++ b/arch/mips/kernel/smp-cps.c @@ -472,8 +472,8 @@ static struct plat_smp_ops cps_smp_ops = { .boot_secondary = cps_boot_secondary, .init_secondary = cps_init_secondary, .smp_finish = cps_smp_finish, - .send_ipi_single = gic_send_ipi_single, - .send_ipi_mask = gic_send_ipi_mask, + .send_ipi_single = mips_smp_send_ipi_single, + .send_ipi_mask = mips_smp_send_ipi_mask, #ifdef CONFIG_HOTPLUG_CPU .cpu_disable = cps_cpu_disable, .cpu_die = cps_cpu_die, diff --git a/arch/mips/kernel/smp-mt.c b/arch/mips/kernel/smp-mt.c index 86311a164ef1..4f9570a57e8d 100644 --- a/arch/mips/kernel/smp-mt.c +++ b/arch/mips/kernel/smp-mt.c @@ -121,7 +121,7 @@ static void vsmp_send_ipi_single(int cpu, unsigned int action) #ifdef CONFIG_MIPS_GIC if (gic_present) { - gic_send_ipi_single(cpu, action); + mips_smp_send_ipi_single(cpu, action); return; } #endif diff --git a/arch/mips/kernel/smp.c b/arch/mips/kernel/smp.c index 2b521e07b860..37708d9af638 100644 --- a/arch/mips/kernel/smp.c +++ b/arch/mips/kernel/smp.c @@ -33,12 +33,16 @@ #include #include #include +#include +#include +#include #include #include #include #include #include +#include #include #include #include @@ -79,6 +83,11 @@ static cpumask_t cpu_core_setup_map; cpumask_t cpu_coherent_mask; +#ifdef CONFIG_GENERIC_IRQ_IPI +static struct irq_desc *call_desc; +static struct irq_desc *sched_desc; +#endif + static inline void set_cpu_sibling_map(int cpu) { int i; @@ -146,6 +155,133 @@ void register_smp_ops(struct plat_smp_ops *ops) mp_ops = ops; } +#ifdef CONFIG_GENERIC_IRQ_IPI +void mips_smp_send_ipi_single(int cpu, unsigned int action) +{ + mips_smp_send_ipi_mask(cpumask_of(cpu), action); +} + +void mips_smp_send_ipi_mask(const struct cpumask *mask, unsigned int action) +{ + unsigned long flags; + unsigned int core; + int cpu; + + local_irq_save(flags); + + switch (action) { + case SMP_CALL_FUNCTION: + __ipi_send_mask(call_desc, mask); + break; + + case SMP_RESCHEDULE_YOURSELF: + __ipi_send_mask(sched_desc, mask); + break; + + default: + BUG(); + } + + if (mips_cpc_present()) { + for_each_cpu(cpu, mask) { + core = cpu_data[cpu].core; + + if (core == current_cpu_data.core) + continue; + + while (!cpumask_test_cpu(cpu, &cpu_coherent_mask)) { + mips_cpc_lock_other(core); + write_cpc_co_cmd(CPC_Cx_CMD_PWRUP); + mips_cpc_unlock_other(); + } + } + } + + local_irq_restore(flags); +} + + +static irqreturn_t ipi_resched_interrupt(int irq, void *dev_id) +{ + scheduler_ipi(); + + return IRQ_HANDLED; +} + +static irqreturn_t ipi_call_interrupt(int irq, void *dev_id) +{ + generic_smp_call_function_interrupt(); + + return IRQ_HANDLED; +} + +static struct irqaction irq_resched = { + .handler = ipi_resched_interrupt, + .flags = IRQF_PERCPU, + .name = "IPI resched" +}; + +static struct irqaction irq_call = { + .handler = ipi_call_interrupt, + .flags = IRQF_PERCPU, + .name = "IPI call" +}; + +static __init void smp_ipi_init_one(unsigned int virq, + struct irqaction *action) +{ + int ret; + + irq_set_handler(virq, handle_percpu_irq); + ret = setup_irq(virq, action); + BUG_ON(ret); +} + +static int __init mips_smp_ipi_init(void) +{ + unsigned int call_virq, sched_virq; + struct irq_domain *ipidomain; + struct device_node *node; + + node = of_irq_find_parent(of_root); + ipidomain = irq_find_matching_host(node, DOMAIN_BUS_IPI); + + /* + * Some platforms have half DT setup. 
So if we found irq node but + * didn't find an ipidomain, try to search for one that is not in the + * DT. + */ + if (node && !ipidomain) + ipidomain = irq_find_matching_host(NULL, DOMAIN_BUS_IPI); + + BUG_ON(!ipidomain); + + call_virq = irq_reserve_ipi(ipidomain, cpu_possible_mask); + BUG_ON(!call_virq); + + sched_virq = irq_reserve_ipi(ipidomain, cpu_possible_mask); + BUG_ON(!sched_virq); + + if (irq_domain_is_ipi_per_cpu(ipidomain)) { + int cpu; + + for_each_cpu(cpu, cpu_possible_mask) { + smp_ipi_init_one(call_virq + cpu, &irq_call); + smp_ipi_init_one(sched_virq + cpu, &irq_resched); + } + } else { + smp_ipi_init_one(call_virq, &irq_call); + smp_ipi_init_one(sched_virq, &irq_resched); + } + + call_desc = irq_to_desc(call_virq); + sched_desc = irq_to_desc(sched_virq); + + return 0; +} +early_initcall(mips_smp_ipi_init); +#endif + /* * First C code run on the secondary CPUs after being started up by * the master. @@ -192,7 +328,7 @@ asmlinkage void start_secondary(void) WARN_ON_ONCE(!irqs_disabled()); mp_ops->smp_finish(); - cpu_startup_entry(CPUHP_ONLINE); + cpu_startup_entry(CPUHP_AP_ONLINE_IDLE); } static void stop_this_cpu(void *dummy) diff --git a/arch/mn10300/kernel/smp.c b/arch/mn10300/kernel/smp.c index f984193718b1..426173c4b0b9 100644 --- a/arch/mn10300/kernel/smp.c +++ b/arch/mn10300/kernel/smp.c @@ -675,7 +675,7 @@ int __init start_secondary(void *unused) #ifdef CONFIG_GENERIC_CLOCKEVENTS init_clockevents(); #endif - cpu_startup_entry(CPUHP_ONLINE); + cpu_startup_entry(CPUHP_AP_ONLINE_IDLE); return 0; } diff --git a/arch/parisc/kernel/smp.c b/arch/parisc/kernel/smp.c index 52e85973a283..c2a9cc55a62f 100644 --- a/arch/parisc/kernel/smp.c +++ b/arch/parisc/kernel/smp.c @@ -305,7 +305,7 @@ void __init smp_callin(void) local_irq_enable(); /* Interrupts have been off until now */ - cpu_startup_entry(CPUHP_ONLINE); + cpu_startup_entry(CPUHP_AP_ONLINE_IDLE); /* NOTREACHED */ panic("smp_callin() AAAAaaaaahhhh....\n"); diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c index ec9ec2058d2d..cc13d4c83291 100644 --- a/arch/powerpc/kernel/smp.c +++ b/arch/powerpc/kernel/smp.c @@ -727,7 +727,7 @@ void start_secondary(void *unused) local_irq_enable(); - cpu_startup_entry(CPUHP_ONLINE); + cpu_startup_entry(CPUHP_AP_ONLINE_IDLE); BUG(); } diff --git a/arch/s390/kernel/smp.c b/arch/s390/kernel/smp.c index 3c65a8eae34d..40a6b4f9c36c 100644 --- a/arch/s390/kernel/smp.c +++ b/arch/s390/kernel/smp.c @@ -798,7 +798,7 @@ static void smp_start_secondary(void *cpuvoid) set_cpu_online(smp_processor_id(), true); inc_irq_stat(CPU_RST); local_irq_enable(); - cpu_startup_entry(CPUHP_ONLINE); + cpu_startup_entry(CPUHP_AP_ONLINE_IDLE); } /* Upping and downing of CPUs */ diff --git a/arch/sh/kernel/smp.c b/arch/sh/kernel/smp.c index de6be008fc01..13f633add29a 100644 --- a/arch/sh/kernel/smp.c +++ b/arch/sh/kernel/smp.c @@ -203,7 +203,7 @@ asmlinkage void start_secondary(void) set_cpu_online(cpu, true); per_cpu(cpu_state, cpu) = CPU_ONLINE; - cpu_startup_entry(CPUHP_ONLINE); + cpu_startup_entry(CPUHP_AP_ONLINE_IDLE); } extern struct { diff --git a/arch/sparc/kernel/smp_32.c b/arch/sparc/kernel/smp_32.c index b3a5d81b20f0..fb30e7c6a5b1 100644 --- a/arch/sparc/kernel/smp_32.c +++ b/arch/sparc/kernel/smp_32.c @@ -364,7 +364,7 @@ static void sparc_start_secondary(void *arg) local_irq_enable(); wmb(); - cpu_startup_entry(CPUHP_ONLINE); + cpu_startup_entry(CPUHP_AP_ONLINE_IDLE); /* We should never reach here! 
*/ BUG(); diff --git a/arch/sparc/kernel/smp_64.c b/arch/sparc/kernel/smp_64.c index 19cd08d18672..8a6151a628ce 100644 --- a/arch/sparc/kernel/smp_64.c +++ b/arch/sparc/kernel/smp_64.c @@ -134,7 +134,7 @@ void smp_callin(void) local_irq_enable(); - cpu_startup_entry(CPUHP_ONLINE); + cpu_startup_entry(CPUHP_AP_ONLINE_IDLE); } void cpu_panic(void) diff --git a/arch/tile/kernel/smpboot.c b/arch/tile/kernel/smpboot.c index 20d52a98e171..6c0abaacec33 100644 --- a/arch/tile/kernel/smpboot.c +++ b/arch/tile/kernel/smpboot.c @@ -208,7 +208,7 @@ void online_secondary(void) /* Set up tile-timer clock-event device on this cpu */ setup_tile_timer(); - cpu_startup_entry(CPUHP_ONLINE); + cpu_startup_entry(CPUHP_AP_ONLINE_IDLE); } int __cpu_up(unsigned int cpu, struct task_struct *tidle) diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index b1051057e5b0..8f2e6659281b 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -1163,22 +1163,23 @@ config MICROCODE bool "CPU microcode loading support" default y depends on CPU_SUP_AMD || CPU_SUP_INTEL - depends on BLK_DEV_INITRD select FW_LOADER ---help--- - If you say Y here, you will be able to update the microcode on - certain Intel and AMD processors. The Intel support is for the - IA32 family, e.g. Pentium Pro, Pentium II, Pentium III, Pentium 4, - Xeon etc. The AMD support is for families 0x10 and later. You will - obviously need the actual microcode binary data itself which is not - shipped with the Linux kernel. + Intel and AMD processors. The Intel support is for the IA32 family, + e.g. Pentium Pro, Pentium II, Pentium III, Pentium 4, Xeon etc. The + AMD support is for families 0x10 and later. You will obviously need + the actual microcode binary data itself which is not shipped with + the Linux kernel. - This option selects the general module only, you need to select - at least one vendor specific module as well. + The preferred method to load microcode from a detached initrd is described + in Documentation/x86/early-microcode.txt. For that you need to enable + CONFIG_BLK_DEV_INITRD in order for the loader to be able to scan the + initrd for microcode blobs. - To compile this driver as a module, choose M here: the module - will be called microcode. + In addition, you can build-in the microcode into the kernel. For that you + need to enable FIRMWARE_IN_KERNEL and add the vendor-supplied microcode + to the CONFIG_EXTRA_FIRMWARE config option. config MICROCODE_INTEL bool "Intel microcode loading support" diff --git a/arch/x86/Kconfig.debug b/arch/x86/Kconfig.debug index 7816b7b276f4..67eec55093a5 100644 --- a/arch/x86/Kconfig.debug +++ b/arch/x86/Kconfig.debug @@ -338,16 +338,6 @@ config DEBUG_IMR_SELFTEST If unsure say N here. -config X86_DEBUG_STATIC_CPU_HAS - bool "Debug alternatives" - depends on DEBUG_KERNEL - ---help--- - This option causes additional code to be generated which - fails if static_cpu_has() is used before alternatives have - run. - - If unsure, say N. 
- config X86_DEBUG_FPU bool "Debug the x86 FPU code" depends on DEBUG_KERNEL diff --git a/arch/x86/boot/cpuflags.h b/arch/x86/boot/cpuflags.h index ea97697e51e4..4cb404fd45ce 100644 --- a/arch/x86/boot/cpuflags.h +++ b/arch/x86/boot/cpuflags.h @@ -1,7 +1,7 @@ #ifndef BOOT_CPUFLAGS_H #define BOOT_CPUFLAGS_H -#include +#include #include struct cpu_features { diff --git a/arch/x86/boot/mkcpustr.c b/arch/x86/boot/mkcpustr.c index 637097e66a62..f72498dc90d2 100644 --- a/arch/x86/boot/mkcpustr.c +++ b/arch/x86/boot/mkcpustr.c @@ -17,7 +17,7 @@ #include "../include/asm/required-features.h" #include "../include/asm/disabled-features.h" -#include "../include/asm/cpufeature.h" +#include "../include/asm/cpufeatures.h" #include "../kernel/cpu/capflags.c" int main(void) diff --git a/arch/x86/boot/tools/build.c b/arch/x86/boot/tools/build.c index a7661c430cd9..0702d2531bc7 100644 --- a/arch/x86/boot/tools/build.c +++ b/arch/x86/boot/tools/build.c @@ -49,7 +49,6 @@ typedef unsigned int u32; /* This must be large enough to hold the entire setup */ u8 buf[SETUP_SECT_MAX*512]; -int is_big_kernel; #define PECOFF_RELOC_RESERVE 0x20 diff --git a/arch/x86/configs/i386_defconfig b/arch/x86/configs/i386_defconfig index 028be48c8839..e25a1630320c 100644 --- a/arch/x86/configs/i386_defconfig +++ b/arch/x86/configs/i386_defconfig @@ -288,7 +288,7 @@ CONFIG_NLS_ISO8859_1=y CONFIG_NLS_UTF8=y CONFIG_PRINTK_TIME=y # CONFIG_ENABLE_WARN_DEPRECATED is not set -CONFIG_FRAME_WARN=2048 +CONFIG_FRAME_WARN=1024 CONFIG_MAGIC_SYSRQ=y # CONFIG_UNUSED_SYMBOLS is not set CONFIG_DEBUG_KERNEL=y diff --git a/arch/x86/crypto/crc32-pclmul_glue.c b/arch/x86/crypto/crc32-pclmul_glue.c index 07d2c6c86a54..27226df3f7d8 100644 --- a/arch/x86/crypto/crc32-pclmul_glue.c +++ b/arch/x86/crypto/crc32-pclmul_glue.c @@ -33,7 +33,7 @@ #include #include -#include +#include #include #include diff --git a/arch/x86/crypto/crc32c-intel_glue.c b/arch/x86/crypto/crc32c-intel_glue.c index 0e9871693f24..0857b1a1de3b 100644 --- a/arch/x86/crypto/crc32c-intel_glue.c +++ b/arch/x86/crypto/crc32c-intel_glue.c @@ -30,7 +30,7 @@ #include #include -#include +#include #include #include diff --git a/arch/x86/crypto/crct10dif-pclmul_glue.c b/arch/x86/crypto/crct10dif-pclmul_glue.c index a3fcfc97a311..cd4df9322501 100644 --- a/arch/x86/crypto/crct10dif-pclmul_glue.c +++ b/arch/x86/crypto/crct10dif-pclmul_glue.c @@ -30,7 +30,7 @@ #include #include #include -#include +#include #include asmlinkage __u16 crc_t10dif_pcl(__u16 crc, const unsigned char *buf, diff --git a/arch/x86/entry/calling.h b/arch/x86/entry/calling.h index e32206e09868..9a9e5884066c 100644 --- a/arch/x86/entry/calling.h +++ b/arch/x86/entry/calling.h @@ -201,37 +201,6 @@ For 32-bit we have the following conventions - kernel is built with .byte 0xf1 .endm -#else /* CONFIG_X86_64 */ - -/* - * For 32bit only simplified versions of SAVE_ALL/RESTORE_ALL. These - * are different from the entry_32.S versions in not changing the segment - * registers. So only suitable for in kernel use, not when transitioning - * from or to user space. The resulting stack frame is not a standard - * pt_regs frame. The main use case is calling C code from assembler - * when all the registers need to be preserved. 
- */ - - .macro SAVE_ALL - pushl %eax - pushl %ebp - pushl %edi - pushl %esi - pushl %edx - pushl %ecx - pushl %ebx - .endm - - .macro RESTORE_ALL - popl %ebx - popl %ecx - popl %edx - popl %esi - popl %edi - popl %ebp - popl %eax - .endm - #endif /* CONFIG_X86_64 */ /* diff --git a/arch/x86/entry/common.c b/arch/x86/entry/common.c index 03663740c866..e79d93d44ecd 100644 --- a/arch/x86/entry/common.c +++ b/arch/x86/entry/common.c @@ -26,6 +26,7 @@ #include #include #include +#include #define CREATE_TRACE_POINTS #include @@ -44,6 +45,8 @@ __visible void enter_from_user_mode(void) CT_WARN_ON(ct_state() != CONTEXT_USER); user_exit(); } +#else +static inline void enter_from_user_mode(void) {} #endif static void do_audit_syscall_entry(struct pt_regs *regs, u32 arch) @@ -84,17 +87,6 @@ unsigned long syscall_trace_enter_phase1(struct pt_regs *regs, u32 arch) work = ACCESS_ONCE(ti->flags) & _TIF_WORK_SYSCALL_ENTRY; -#ifdef CONFIG_CONTEXT_TRACKING - /* - * If TIF_NOHZ is set, we are required to call user_exit() before - * doing anything that could touch RCU. - */ - if (work & _TIF_NOHZ) { - enter_from_user_mode(); - work &= ~_TIF_NOHZ; - } -#endif - #ifdef CONFIG_SECCOMP /* * Do seccomp first -- it should minimize exposure of other @@ -171,16 +163,6 @@ long syscall_trace_enter_phase2(struct pt_regs *regs, u32 arch, if (IS_ENABLED(CONFIG_DEBUG_ENTRY)) BUG_ON(regs != task_pt_regs(current)); - /* - * If we stepped into a sysenter/syscall insn, it trapped in - * kernel mode; do_debug() cleared TF and set TIF_SINGLESTEP. - * If user-mode had set TF itself, then it's still clear from - * do_debug() and we need to set it again to restore the user - * state. If we entered on the slow path, TF was already set. - */ - if (work & _TIF_SINGLESTEP) - regs->flags |= X86_EFLAGS_TF; - #ifdef CONFIG_SECCOMP /* * Call seccomp_phase2 before running the other hooks so that @@ -268,6 +250,7 @@ static void exit_to_usermode_loop(struct pt_regs *regs, u32 cached_flags) /* Called with IRQs disabled. */ __visible inline void prepare_exit_to_usermode(struct pt_regs *regs) { + struct thread_info *ti = pt_regs_to_thread_info(regs); u32 cached_flags; if (IS_ENABLED(CONFIG_PROVE_LOCKING) && WARN_ON(!irqs_disabled())) @@ -275,12 +258,22 @@ __visible inline void prepare_exit_to_usermode(struct pt_regs *regs) lockdep_sys_exit(); - cached_flags = - READ_ONCE(pt_regs_to_thread_info(regs)->flags); + cached_flags = READ_ONCE(ti->flags); if (unlikely(cached_flags & EXIT_TO_USERMODE_LOOP_FLAGS)) exit_to_usermode_loop(regs, cached_flags); +#ifdef CONFIG_COMPAT + /* + * Compat syscalls set TS_COMPAT. Make sure we clear it before + * returning to user mode. We need to clear it *after* signal + * handling, because syscall restart has a fixup for compat + * syscalls. The fixup is exercised by the ptrace_syscall_32 + * selftest. + */ + ti->status &= ~TS_COMPAT; +#endif + user_enter(); } @@ -332,33 +325,45 @@ __visible inline void syscall_return_slowpath(struct pt_regs *regs) if (unlikely(cached_flags & SYSCALL_EXIT_WORK_FLAGS)) syscall_slow_exit_work(regs, cached_flags); -#ifdef CONFIG_COMPAT - /* - * Compat syscalls set TS_COMPAT. Make sure we clear it before - * returning to user mode. 
- */ - ti->status &= ~TS_COMPAT; -#endif - local_irq_disable(); prepare_exit_to_usermode(regs); } +#ifdef CONFIG_X86_64 +__visible void do_syscall_64(struct pt_regs *regs) +{ + struct thread_info *ti = pt_regs_to_thread_info(regs); + unsigned long nr = regs->orig_ax; + + enter_from_user_mode(); + local_irq_enable(); + + if (READ_ONCE(ti->flags) & _TIF_WORK_SYSCALL_ENTRY) + nr = syscall_trace_enter(regs); + + /* + * NB: Native and x32 syscalls are dispatched from the same + * table. The only functional difference is the x32 bit in + * regs->orig_ax, which changes the behavior of some syscalls. + */ + if (likely((nr & __SYSCALL_MASK) < NR_syscalls)) { + regs->ax = sys_call_table[nr & __SYSCALL_MASK]( + regs->di, regs->si, regs->dx, + regs->r10, regs->r8, regs->r9); + } + + syscall_return_slowpath(regs); +} +#endif + #if defined(CONFIG_X86_32) || defined(CONFIG_IA32_EMULATION) /* - * Does a 32-bit syscall. Called with IRQs on and does all entry and - * exit work and returns with IRQs off. This function is extremely hot - * in workloads that use it, and it's usually called from + * Does a 32-bit syscall. Called with IRQs on in CONTEXT_KERNEL. Does + * all entry and exit work and returns with IRQs off. This function is + * extremely hot in workloads that use it, and it's usually called from * do_fast_syscall_32, so forcibly inline it to improve performance. */ -#ifdef CONFIG_X86_32 -/* 32-bit kernels use a trap gate for INT80, and the asm code calls here. */ -__visible -#else -/* 64-bit kernels use do_syscall_32_irqs_off() instead. */ -static -#endif -__always_inline void do_syscall_32_irqs_on(struct pt_regs *regs) +static __always_inline void do_syscall_32_irqs_on(struct pt_regs *regs) { struct thread_info *ti = pt_regs_to_thread_info(regs); unsigned int nr = (unsigned int)regs->orig_ax; @@ -393,14 +398,13 @@ __always_inline void do_syscall_32_irqs_on(struct pt_regs *regs) syscall_return_slowpath(regs); } -#ifdef CONFIG_X86_64 -/* Handles INT80 on 64-bit kernels */ -__visible void do_syscall_32_irqs_off(struct pt_regs *regs) +/* Handles int $0x80 */ +__visible void do_int80_syscall_32(struct pt_regs *regs) { + enter_from_user_mode(); local_irq_enable(); do_syscall_32_irqs_on(regs); } -#endif /* Returns 0 to return using IRET or 1 to return using SYSEXIT/SYSRETL. */ __visible long do_fast_syscall_32(struct pt_regs *regs) @@ -420,12 +424,11 @@ __visible long do_fast_syscall_32(struct pt_regs *regs) */ regs->ip = landing_pad; - /* - * Fetch EBP from where the vDSO stashed it. - * - * WARNING: We are in CONTEXT_USER and RCU isn't paying attention! - */ + enter_from_user_mode(); + local_irq_enable(); + + /* Fetch EBP from where the vDSO stashed it. */ if ( #ifdef CONFIG_X86_64 /* @@ -443,9 +446,6 @@ __visible long do_fast_syscall_32(struct pt_regs *regs) /* User code screwed up. */ local_irq_disable(); regs->ax = -EFAULT; -#ifdef CONFIG_CONTEXT_TRACKING - enter_from_user_mode(); -#endif prepare_exit_to_usermode(regs); return 0; /* Keep it simple: use IRET. 
*/ } diff --git a/arch/x86/entry/entry_32.S b/arch/x86/entry/entry_32.S index bb3e376d0f33..10868aa734dc 100644 --- a/arch/x86/entry/entry_32.S +++ b/arch/x86/entry/entry_32.S @@ -40,7 +40,7 @@ #include #include #include -#include +#include #include #include #include @@ -287,20 +287,93 @@ need_resched: END(resume_kernel) #endif - # SYSENTER call handler stub +GLOBAL(__begin_SYSENTER_singlestep_region) +/* + * All code from here through __end_SYSENTER_singlestep_region is subject + * to being single-stepped if a user program sets TF and executes SYSENTER. + * There is absolutely nothing that we can do to prevent this from happening + * (thanks Intel!). To keep our handling of this situation as simple as + * possible, we handle TF just like AC and NT, except that our #DB handler + * will ignore all of the single-step traps generated in this range. + */ + +#ifdef CONFIG_XEN +/* + * Xen doesn't set %esp to be precisely what the normal SYSENTER + * entry point expects, so fix it up before using the normal path. + */ +ENTRY(xen_sysenter_target) + addl $5*4, %esp /* remove xen-provided frame */ + jmp sysenter_past_esp +#endif + +/* + * 32-bit SYSENTER entry. + * + * 32-bit system calls through the vDSO's __kernel_vsyscall enter here + * if X86_FEATURE_SEP is available. This is the preferred system call + * entry on 32-bit systems. + * + * The SYSENTER instruction, in principle, should *only* occur in the + * vDSO. In practice, a small number of Android devices were shipped + * with a copy of Bionic that inlined a SYSENTER instruction. This + * never happened in any of Google's Bionic versions -- it only happened + * in a narrow range of Intel-provided versions. + * + * SYSENTER loads SS, ESP, CS, and EIP from previously programmed MSRs. + * IF and VM in RFLAGS are cleared (IOW: interrupts are off). + * SYSENTER does not save anything on the stack, + * and does not save old EIP (!!!), ESP, or EFLAGS. + * + * To avoid losing track of EFLAGS.VM (and thus potentially corrupting + * user and/or vm86 state), we explicitly disable the SYSENTER + * instruction in vm86 mode by reprogramming the MSRs. + * + * Arguments: + * eax system call number + * ebx arg1 + * ecx arg2 + * edx arg3 + * esi arg4 + * edi arg5 + * ebp user stack + * 0(%ebp) arg6 + */ ENTRY(entry_SYSENTER_32) movl TSS_sysenter_sp0(%esp), %esp sysenter_past_esp: pushl $__USER_DS /* pt_regs->ss */ pushl %ebp /* pt_regs->sp (stashed in bp) */ pushfl /* pt_regs->flags (except IF = 0) */ - ASM_CLAC /* Clear AC after saving FLAGS */ orl $X86_EFLAGS_IF, (%esp) /* Fix IF */ pushl $__USER_CS /* pt_regs->cs */ pushl $0 /* pt_regs->ip = 0 (placeholder) */ pushl %eax /* pt_regs->orig_ax */ SAVE_ALL pt_regs_ax=$-ENOSYS /* save rest */ + /* + * SYSENTER doesn't filter flags, so we need to clear NT, AC + * and TF ourselves. To save a few cycles, we can check whether + * either was set instead of doing an unconditional popfq. + * This needs to happen before enabling interrupts so that + * we don't get preempted with NT set. + * + * If TF is set, we will single-step all the way to here -- do_debug + * will ignore all the traps. (Yes, this is slow, but so is + * single-stepping in general. This allows us to avoid having + * a more complicated code to handle the case where a user program + * forces us to single-step through the SYSENTER entry code.) 
+ * + * NB.: .Lsysenter_fix_flags is a label with the code under it moved + * out-of-line as an optimization: NT is unlikely to be set in the + * majority of the cases and instead of polluting the I$ unnecessarily, + * we're keeping that code behind a branch which will predict as + * not-taken and therefore its instructions won't be fetched. + */ + testl $X86_EFLAGS_NT|X86_EFLAGS_AC|X86_EFLAGS_TF, PT_EFLAGS(%esp) + jnz .Lsysenter_fix_flags +.Lsysenter_flags_fixed: + /* * User mode is traced as though IRQs are on, and SYSENTER * turned them off. @@ -326,6 +399,15 @@ sysenter_past_esp: popl %ebp /* pt_regs->bp */ popl %eax /* pt_regs->ax */ + /* + * Restore all flags except IF. (We restore IF separately because + * STI gives a one-instruction window in which we won't be interrupted, + * whereas POPF does not.) + */ + addl $PT_EFLAGS-PT_DS, %esp /* point esp at pt_regs->flags */ + btr $X86_EFLAGS_IF_BIT, (%esp) + popfl + /* * Return back to the vDSO, which will pop ecx and edx. * Don't bother with DS and ES (they already contain __USER_DS). @@ -339,28 +421,63 @@ sysenter_past_esp: .popsection _ASM_EXTABLE(1b, 2b) PTGS_TO_GS_EX + +.Lsysenter_fix_flags: + pushl $X86_EFLAGS_FIXED + popfl + jmp .Lsysenter_flags_fixed +GLOBAL(__end_SYSENTER_singlestep_region) ENDPROC(entry_SYSENTER_32) - # system call handler stub +/* + * 32-bit legacy system call entry. + * + * 32-bit x86 Linux system calls traditionally used the INT $0x80 + * instruction. INT $0x80 lands here. + * + * This entry point can be used by any 32-bit perform system calls. + * Instances of INT $0x80 can be found inline in various programs and + * libraries. It is also used by the vDSO's __kernel_vsyscall + * fallback for hardware that doesn't support a faster entry method. + * Restarted 32-bit system calls also fall back to INT $0x80 + * regardless of what instruction was originally used to do the system + * call. (64-bit programs can use INT $0x80 as well, but they can + * only run on 64-bit kernels and therefore land in + * entry_INT80_compat.) + * + * This is considered a slow path. It is not used by most libc + * implementations on modern hardware except during process startup. + * + * Arguments: + * eax system call number + * ebx arg1 + * ecx arg2 + * edx arg3 + * esi arg4 + * edi arg5 + * ebp arg6 + */ ENTRY(entry_INT80_32) ASM_CLAC pushl %eax /* pt_regs->orig_ax */ SAVE_ALL pt_regs_ax=$-ENOSYS /* save rest */ /* - * User mode is traced as though IRQs are on. Unlike the 64-bit - * case, INT80 is a trap gate on 32-bit kernels, so interrupts - * are already on (unless user code is messing around with iopl). + * User mode is traced as though IRQs are on, and the interrupt gate + * turned them off. */ + TRACE_IRQS_OFF movl %esp, %eax - call do_syscall_32_irqs_on + call do_int80_syscall_32 .Lsyscall_32_done: restore_all: TRACE_IRQS_IRET restore_all_notrace: #ifdef CONFIG_X86_ESPFIX32 + ALTERNATIVE "jmp restore_nocheck", "", X86_BUG_ESPFIX + movl PT_EFLAGS(%esp), %eax # mix EFLAGS, SS and CS /* * Warning: PT_OLDSS(%esp) contains the wrong/random values if we @@ -387,19 +504,6 @@ ENTRY(iret_exc ) #ifdef CONFIG_X86_ESPFIX32 ldt_ss: -#ifdef CONFIG_PARAVIRT - /* - * The kernel can't run on a non-flat stack if paravirt mode - * is active. Rather than try to fixup the high bits of - * ESP, bypass this code entirely. This may break DOSemu - * and/or Wine support in a paravirt VM, although the option - * is still available to implement the setting of the high - * 16-bits in the INTERRUPT_RETURN paravirt-op. 
- */ - cmpl $0, pv_info+PARAVIRT_enabled - jne restore_nocheck -#endif - /* * Setup and switch to ESPFIX stack * @@ -632,14 +736,6 @@ ENTRY(spurious_interrupt_bug) END(spurious_interrupt_bug) #ifdef CONFIG_XEN -/* - * Xen doesn't set %esp to be precisely what the normal SYSENTER - * entry point expects, so fix it up before using the normal path. - */ -ENTRY(xen_sysenter_target) - addl $5*4, %esp /* remove xen-provided frame */ - jmp sysenter_past_esp - ENTRY(xen_hypervisor_callback) pushl $-1 /* orig_ax = -1 => not a system call */ SAVE_ALL @@ -939,51 +1035,48 @@ error_code: jmp ret_from_exception END(page_fault) -/* - * Debug traps and NMI can happen at the one SYSENTER instruction - * that sets up the real kernel stack. Check here, since we can't - * allow the wrong stack to be used. - * - * "TSS_sysenter_sp0+12" is because the NMI/debug handler will have - * already pushed 3 words if it hits on the sysenter instruction: - * eflags, cs and eip. - * - * We just load the right stack, and push the three (known) values - * by hand onto the new stack - while updating the return eip past - * the instruction that would have done it for sysenter. - */ -.macro FIX_STACK offset ok label - cmpw $__KERNEL_CS, 4(%esp) - jne \ok -\label: - movl TSS_sysenter_sp0 + \offset(%esp), %esp - pushfl - pushl $__KERNEL_CS - pushl $sysenter_past_esp -.endm - ENTRY(debug) + /* + * #DB can happen at the first instruction of + * entry_SYSENTER_32 or in Xen's SYSENTER prologue. If this + * happens, then we will be running on a very small stack. We + * need to detect this condition and switch to the thread + * stack before calling any C code at all. + * + * If you edit this code, keep in mind that NMIs can happen in here. + */ ASM_CLAC - cmpl $entry_SYSENTER_32, (%esp) - jne debug_stack_correct - FIX_STACK 12, debug_stack_correct, debug_esp_fix_insn -debug_stack_correct: pushl $-1 # mark this as an int SAVE_ALL - TRACE_IRQS_OFF xorl %edx, %edx # error code 0 movl %esp, %eax # pt_regs pointer + + /* Are we currently on the SYSENTER stack? */ + PER_CPU(cpu_tss + CPU_TSS_SYSENTER_stack + SIZEOF_SYSENTER_stack, %ecx) + subl %eax, %ecx /* ecx = (end of SYSENTER_stack) - esp */ + cmpl $SIZEOF_SYSENTER_stack, %ecx + jb .Ldebug_from_sysenter_stack + + TRACE_IRQS_OFF call do_debug jmp ret_from_exception + +.Ldebug_from_sysenter_stack: + /* We're on the SYSENTER stack. Switch off. */ + movl %esp, %ebp + movl PER_CPU_VAR(cpu_current_top_of_stack), %esp + TRACE_IRQS_OFF + call do_debug + movl %ebp, %esp + jmp ret_from_exception END(debug) /* - * NMI is doubly nasty. It can happen _while_ we're handling - * a debug fault, and the debug fault hasn't yet been able to - * clear up the stack. So we first check whether we got an - * NMI on the sysenter entry path, but after that we need to - * check whether we got an NMI on the debug path where the debug - * fault happened on the sysenter path. + * NMI is doubly nasty. It can happen on the first instruction of + * entry_SYSENTER_32 (just like #DB), but it can also interrupt the beginning + * of the #DB handler even if that #DB in turn hit before entry_SYSENTER_32 + * switched stacks. We handle both conditions by simply checking whether we + * interrupted kernel code running on the SYSENTER stack. */ ENTRY(nmi) ASM_CLAC @@ -994,41 +1087,32 @@ ENTRY(nmi) popl %eax je nmi_espfix_stack #endif - cmpl $entry_SYSENTER_32, (%esp) - je nmi_stack_fixup - pushl %eax - movl %esp, %eax - /* - * Do not access memory above the end of our stack page, - * it might not exist. 
- */ - andl $(THREAD_SIZE-1), %eax - cmpl $(THREAD_SIZE-20), %eax - popl %eax - jae nmi_stack_correct - cmpl $entry_SYSENTER_32, 12(%esp) - je nmi_debug_stack_check -nmi_stack_correct: - pushl %eax + + pushl %eax # pt_regs->orig_ax SAVE_ALL xorl %edx, %edx # zero error code movl %esp, %eax # pt_regs pointer + + /* Are we currently on the SYSENTER stack? */ + PER_CPU(cpu_tss + CPU_TSS_SYSENTER_stack + SIZEOF_SYSENTER_stack, %ecx) + subl %eax, %ecx /* ecx = (end of SYSENTER_stack) - esp */ + cmpl $SIZEOF_SYSENTER_stack, %ecx + jb .Lnmi_from_sysenter_stack + + /* Not on SYSENTER stack. */ call do_nmi jmp restore_all_notrace -nmi_stack_fixup: - FIX_STACK 12, nmi_stack_correct, 1 - jmp nmi_stack_correct - -nmi_debug_stack_check: - cmpw $__KERNEL_CS, 16(%esp) - jne nmi_stack_correct - cmpl $debug, (%esp) - jb nmi_stack_correct - cmpl $debug_esp_fix_insn, (%esp) - ja nmi_stack_correct - FIX_STACK 24, nmi_stack_correct, 1 - jmp nmi_stack_correct +.Lnmi_from_sysenter_stack: + /* + * We're on the SYSENTER stack. Switch off. No one (not even debug) + * is using the thread stack right now, so it's safe for us to use it. + */ + movl %esp, %ebp + movl PER_CPU_VAR(cpu_current_top_of_stack), %esp + call do_nmi + movl %ebp, %esp + jmp restore_all_notrace #ifdef CONFIG_X86_ESPFIX32 nmi_espfix_stack: diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S index 9d34d3cfceb6..858b555e274b 100644 --- a/arch/x86/entry/entry_64.S +++ b/arch/x86/entry/entry_64.S @@ -103,6 +103,16 @@ ENDPROC(native_usergs_sysret64) /* * 64-bit SYSCALL instruction entry. Up to 6 arguments in registers. * + * This is the only entry point used for 64-bit system calls. The + * hardware interface is reasonably well designed and the register to + * argument mapping Linux uses fits well with the registers that are + * available when SYSCALL is used. + * + * SYSCALL instructions can be found inlined in libc implementations as + * well as some other programs and libraries. There are also a handful + * of SYSCALL instructions in the vDSO used, for example, as a + * clock_gettimeofday fallback. + * * 64-bit SYSCALL saves rip to rcx, clears rflags.RF, then saves rflags to r11, * then loads new ss, cs, and rip from previously programmed MSRs. * rflags gets masked by a value from another MSR (so CLD and CLAC @@ -145,17 +155,11 @@ GLOBAL(entry_SYSCALL_64_after_swapgs) movq %rsp, PER_CPU_VAR(rsp_scratch) movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp + TRACE_IRQS_OFF + /* Construct struct pt_regs on stack */ pushq $__USER_DS /* pt_regs->ss */ pushq PER_CPU_VAR(rsp_scratch) /* pt_regs->sp */ - /* - * Re-enable interrupts. - * We use 'rsp_scratch' as a scratch space, hence irq-off block above - * must execute atomically in the face of possible interrupt-driven - * task preemption. We must enable interrupts only after we're done - * with using rsp_scratch: - */ - ENABLE_INTERRUPTS(CLBR_NONE) pushq %r11 /* pt_regs->flags */ pushq $__USER_CS /* pt_regs->cs */ pushq %rcx /* pt_regs->ip */ @@ -171,9 +175,21 @@ GLOBAL(entry_SYSCALL_64_after_swapgs) pushq %r11 /* pt_regs->r11 */ sub $(6*8), %rsp /* pt_regs->bp, bx, r12-15 not saved */ - testl $_TIF_WORK_SYSCALL_ENTRY, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS) - jnz tracesys + /* + * If we need to do entry work or if we guess we'll need to do + * exit work, go straight to the slow path. 
+ */ + testl $_TIF_WORK_SYSCALL_ENTRY|_TIF_ALLWORK_MASK, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS) + jnz entry_SYSCALL64_slow_path + entry_SYSCALL_64_fastpath: + /* + * Easy case: enable interrupts and issue the syscall. If the syscall + * needs pt_regs, we'll call a stub that disables interrupts again + * and jumps to the slow path. + */ + TRACE_IRQS_ON + ENABLE_INTERRUPTS(CLBR_NONE) #if __SYSCALL_MASK == ~0 cmpq $__NR_syscall_max, %rax #else @@ -182,103 +198,56 @@ entry_SYSCALL_64_fastpath: #endif ja 1f /* return -ENOSYS (already in pt_regs->ax) */ movq %r10, %rcx + + /* + * This call instruction is handled specially in stub_ptregs_64. + * It might end up jumping to the slow path. If it jumps, RAX + * and all argument registers are clobbered. + */ call *sys_call_table(, %rax, 8) +.Lentry_SYSCALL_64_after_fastpath_call: + movq %rax, RAX(%rsp) 1: -/* - * Syscall return path ending with SYSRET (fast path). - * Has incompletely filled pt_regs. - */ - LOCKDEP_SYS_EXIT + /* - * We do not frame this tiny irq-off block with TRACE_IRQS_OFF/ON, - * it is too small to ever cause noticeable irq latency. + * If we get here, then we know that pt_regs is clean for SYSRET64. + * If we see that no exit work is required (which we are required + * to check with IRQs off), then we can go straight to SYSRET64. */ DISABLE_INTERRUPTS(CLBR_NONE) - - /* - * We must check ti flags with interrupts (or at least preemption) - * off because we must *never* return to userspace without - * processing exit work that is enqueued if we're preempted here. - * In particular, returning to userspace with any of the one-shot - * flags (TIF_NOTIFY_RESUME, TIF_USER_RETURN_NOTIFY, etc) set is - * very bad. - */ + TRACE_IRQS_OFF testl $_TIF_ALLWORK_MASK, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS) - jnz int_ret_from_sys_call_irqs_off /* Go to the slow path */ + jnz 1f - RESTORE_C_REGS_EXCEPT_RCX_R11 + LOCKDEP_SYS_EXIT + TRACE_IRQS_ON /* user mode is traced as IRQs on */ movq RIP(%rsp), %rcx movq EFLAGS(%rsp), %r11 + RESTORE_C_REGS_EXCEPT_RCX_R11 movq RSP(%rsp), %rsp - /* - * 64-bit SYSRET restores rip from rcx, - * rflags from r11 (but RF and VM bits are forced to 0), - * cs and ss are loaded from MSRs. - * Restoration of rflags re-enables interrupts. - * - * NB: On AMD CPUs with the X86_BUG_SYSRET_SS_ATTRS bug, the ss - * descriptor is not reinitialized. This means that we should - * avoid SYSRET with SS == NULL, which could happen if we schedule, - * exit the kernel, and re-enter using an interrupt vector. (All - * interrupt entries on x86_64 set SS to NULL.) We prevent that - * from happening by reloading SS in __switch_to. (Actually - * detecting the failure in 64-bit userspace is tricky but can be - * done.) - */ USERGS_SYSRET64 -GLOBAL(int_ret_from_sys_call_irqs_off) +1: + /* + * The fast path looked good when we started, but something changed + * along the way and we need to switch to the slow path. Calling + * raise(3) will trigger this, for example. IRQs are off. 
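The fast-path return decision described in the comment above amounts to a single flag test made with interrupts disabled; a minimal C sketch (the function name is a placeholder, the mask and flags are the ones the testl checks):

	static bool can_exit_via_sysret64(unsigned long ti_flags)
	{
		/*
		 * Must be evaluated with IRQs off: if exit work could be
		 * queued between this check and the SYSRET, we might return
		 * to user mode with one-shot work flags still set.
		 */
		return (ti_flags & _TIF_ALLWORK_MASK) == 0;
	}

If the test fails, the code branches to local label 1 below, re-enables interrupts, and finishes the syscall through syscall_return_slowpath() on the IRET-based path.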
+ */ TRACE_IRQS_ON ENABLE_INTERRUPTS(CLBR_NONE) - jmp int_ret_from_sys_call - - /* Do syscall entry tracing */ -tracesys: - movq %rsp, %rdi - movl $AUDIT_ARCH_X86_64, %esi - call syscall_trace_enter_phase1 - test %rax, %rax - jnz tracesys_phase2 /* if needed, run the slow path */ - RESTORE_C_REGS_EXCEPT_RAX /* else restore clobbered regs */ - movq ORIG_RAX(%rsp), %rax - jmp entry_SYSCALL_64_fastpath /* and return to the fast path */ - -tracesys_phase2: - SAVE_EXTRA_REGS - movq %rsp, %rdi - movl $AUDIT_ARCH_X86_64, %esi - movq %rax, %rdx - call syscall_trace_enter_phase2 - - /* - * Reload registers from stack in case ptrace changed them. - * We don't reload %rax because syscall_trace_entry_phase2() returned - * the value it wants us to use in the table lookup. - */ - RESTORE_C_REGS_EXCEPT_RAX - RESTORE_EXTRA_REGS -#if __SYSCALL_MASK == ~0 - cmpq $__NR_syscall_max, %rax -#else - andl $__SYSCALL_MASK, %eax - cmpl $__NR_syscall_max, %eax -#endif - ja 1f /* return -ENOSYS (already in pt_regs->ax) */ - movq %r10, %rcx /* fixup for C */ - call *sys_call_table(, %rax, 8) - movq %rax, RAX(%rsp) -1: - /* Use IRET because user could have changed pt_regs->foo */ - -/* - * Syscall return path ending with IRET. - * Has correct iret frame. - */ -GLOBAL(int_ret_from_sys_call) SAVE_EXTRA_REGS movq %rsp, %rdi call syscall_return_slowpath /* returns with IRQs disabled */ + jmp return_from_SYSCALL_64 + +entry_SYSCALL64_slow_path: + /* IRQs are off. */ + SAVE_EXTRA_REGS + movq %rsp, %rdi + call do_syscall_64 /* returns with IRQs disabled */ + +return_from_SYSCALL_64: RESTORE_EXTRA_REGS TRACE_IRQS_IRETQ /* we're about to change IF */ @@ -355,83 +324,45 @@ opportunistic_sysret_failed: jmp restore_c_regs_and_iret END(entry_SYSCALL_64) - - .macro FORK_LIKE func -ENTRY(stub_\func) - SAVE_EXTRA_REGS 8 - jmp sys_\func -END(stub_\func) - .endm - - FORK_LIKE clone - FORK_LIKE fork - FORK_LIKE vfork - -ENTRY(stub_execve) - call sys_execve -return_from_execve: - testl %eax, %eax - jz 1f - /* exec failed, can use fast SYSRET code path in this case */ - ret -1: - /* must use IRET code path (pt_regs->cs may have changed) */ - addq $8, %rsp - ZERO_EXTRA_REGS - movq %rax, RAX(%rsp) - jmp int_ret_from_sys_call -END(stub_execve) -/* - * Remaining execve stubs are only 7 bytes long. - * ENTRY() often aligns to 16 bytes, which in this case has no benefits. - */ - .align 8 -GLOBAL(stub_execveat) - call sys_execveat - jmp return_from_execve -END(stub_execveat) - -#if defined(CONFIG_X86_X32_ABI) - .align 8 -GLOBAL(stub_x32_execve) - call compat_sys_execve - jmp return_from_execve -END(stub_x32_execve) - .align 8 -GLOBAL(stub_x32_execveat) - call compat_sys_execveat - jmp return_from_execve -END(stub_x32_execveat) -#endif - -/* - * sigreturn is special because it needs to restore all registers on return. - * This cannot be done with SYSRET, so use the IRET return path instead. - */ -ENTRY(stub_rt_sigreturn) +ENTRY(stub_ptregs_64) /* - * SAVE_EXTRA_REGS result is not normally needed: - * sigreturn overwrites all pt_regs->GPREGS. - * But sigreturn can fail (!), and there is no easy way to detect that. - * To make sure RESTORE_EXTRA_REGS doesn't restore garbage on error, - * we SAVE_EXTRA_REGS here. + * Syscalls marked as needing ptregs land here. + * If we are on the fast path, we need to save the extra regs, + * which we achieve by trying again on the slow path. If we are on + * the slow path, the extra regs are already saved. + * + * RAX stores a pointer to the C function implementing the syscall. + * IRQs are on. 
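The comment above describes stub_ptregs_64; the instructions that follow implement that decision by comparing the return address the call pushed against the fast-path call site. In C-like pseudocode (every name below is a placeholder for the assembly construct named in the comment, not code from this series):

	extern unsigned long fastpath_call_site;	/* .Lentry_SYSCALL_64_after_fastpath_call */
	extern void retry_syscall_on_slow_path(void);	/* disables IRQs, pops the return address,
							   jumps to entry_SYSCALL64_slow_path */

	static void stub_ptregs_64_decision(unsigned long return_address,
					    long (*real_handler)(void))
	{
		if (return_address == fastpath_call_site) {
			/* Fast path: pt_regs is still incomplete, so discard
			 * this call and redo the syscall on the slow path,
			 * which saves the extra registers first. */
			retry_syscall_on_slow_path();
		} else {
			/* Called from do_syscall_64: extra registers are
			 * already saved, so just run the handler whose
			 * address arrived in RAX. */
			real_handler();
		}
	}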
*/ - SAVE_EXTRA_REGS 8 - call sys_rt_sigreturn -return_from_stub: - addq $8, %rsp - RESTORE_EXTRA_REGS - movq %rax, RAX(%rsp) - jmp int_ret_from_sys_call -END(stub_rt_sigreturn) + cmpq $.Lentry_SYSCALL_64_after_fastpath_call, (%rsp) + jne 1f -#ifdef CONFIG_X86_X32_ABI -ENTRY(stub_x32_rt_sigreturn) - SAVE_EXTRA_REGS 8 - call sys32_x32_rt_sigreturn - jmp return_from_stub -END(stub_x32_rt_sigreturn) -#endif + /* + * Called from fast path -- disable IRQs again, pop return address + * and jump to slow path + */ + DISABLE_INTERRUPTS(CLBR_NONE) + TRACE_IRQS_OFF + popq %rax + jmp entry_SYSCALL64_slow_path + +1: + /* Called from C */ + jmp *%rax /* called from C */ +END(stub_ptregs_64) + +.macro ptregs_stub func +ENTRY(ptregs_\func) + leaq \func(%rip), %rax + jmp stub_ptregs_64 +END(ptregs_\func) +.endm + +/* Instantiate ptregs_stub for each ptregs-using syscall */ +#define __SYSCALL_64_QUAL_(sym) +#define __SYSCALL_64_QUAL_ptregs(sym) ptregs_stub sym +#define __SYSCALL_64(nr, sym, qual) __SYSCALL_64_QUAL_##qual(sym) +#include /* * A newly forked process directly context switches into this address. @@ -439,7 +370,6 @@ END(stub_x32_rt_sigreturn) * rdi: prev task we switched from */ ENTRY(ret_from_fork) - LOCK ; btr $TIF_FORK, TI_flags(%r8) pushq $0x0002 @@ -447,28 +377,32 @@ ENTRY(ret_from_fork) call schedule_tail /* rdi: 'prev' task parameter */ - RESTORE_EXTRA_REGS - testb $3, CS(%rsp) /* from kernel_thread? */ + jnz 1f /* - * By the time we get here, we have no idea whether our pt_regs, - * ti flags, and ti status came from the 64-bit SYSCALL fast path, - * the slow path, or one of the 32-bit compat paths. - * Use IRET code path to return, since it can safely handle - * all of the above. + * We came from kernel_thread. This code path is quite twisted, and + * someone should clean it up. + * + * copy_thread_tls stashes the function pointer in RBX and the + * parameter to be passed in RBP. The called function is permitted + * to call do_execve and thereby jump to user mode. */ - jnz int_ret_from_sys_call - - /* - * We came from kernel_thread - * nb: we depend on RESTORE_EXTRA_REGS above - */ - movq %rbp, %rdi - call *%rbx + movq RBP(%rsp), %rdi + call *RBX(%rsp) movl $0, RAX(%rsp) - RESTORE_EXTRA_REGS - jmp int_ret_from_sys_call + + /* + * Fall through as though we're exiting a syscall. This makes a + * twisted sort of sense if we just called do_execve. + */ + +1: + movq %rsp, %rdi + call syscall_return_slowpath /* returns with IRQs disabled */ + TRACE_IRQS_ON /* user mode is traced as IRQS on */ + SWAPGS + jmp restore_regs_and_iret END(ret_from_fork) /* diff --git a/arch/x86/entry/entry_64_compat.S b/arch/x86/entry/entry_64_compat.S index 3c990eeee40b..847f2f0c31e5 100644 --- a/arch/x86/entry/entry_64_compat.S +++ b/arch/x86/entry/entry_64_compat.S @@ -19,12 +19,21 @@ .section .entry.text, "ax" /* - * 32-bit SYSENTER instruction entry. + * 32-bit SYSENTER entry. * - * SYSENTER loads ss, rsp, cs, and rip from previously programmed MSRs. - * IF and VM in rflags are cleared (IOW: interrupts are off). + * 32-bit system calls through the vDSO's __kernel_vsyscall enter here + * on 64-bit kernels running on Intel CPUs. + * + * The SYSENTER instruction, in principle, should *only* occur in the + * vDSO. In practice, a small number of Android devices were shipped + * with a copy of Bionic that inlined a SYSENTER instruction. This + * never happened in any of Google's Bionic versions -- it only happened + * in a narrow range of Intel-provided versions. 
+ * + * SYSENTER loads SS, RSP, CS, and RIP from previously programmed MSRs. + * IF and VM in RFLAGS are cleared (IOW: interrupts are off). * SYSENTER does not save anything on the stack, - * and does not save old rip (!!!) and rflags. + * and does not save old RIP (!!!), RSP, or RFLAGS. * * Arguments: * eax system call number @@ -35,10 +44,6 @@ * edi arg5 * ebp user stack * 0(%ebp) arg6 - * - * This is purely a fast path. For anything complicated we use the int 0x80 - * path below. We set up a complete hardware stack frame to share code - * with the int 0x80 path. */ ENTRY(entry_SYSENTER_compat) /* Interrupts are off on entry. */ @@ -66,8 +71,6 @@ ENTRY(entry_SYSENTER_compat) */ pushfq /* pt_regs->flags (except IF = 0) */ orl $X86_EFLAGS_IF, (%rsp) /* Fix saved flags */ - ASM_CLAC /* Clear AC after saving FLAGS */ - pushq $__USER32_CS /* pt_regs->cs */ xorq %r8,%r8 pushq %r8 /* pt_regs->ip = 0 (placeholder) */ @@ -90,19 +93,25 @@ ENTRY(entry_SYSENTER_compat) cld /* - * Sysenter doesn't filter flags, so we need to clear NT + * SYSENTER doesn't filter flags, so we need to clear NT and AC * ourselves. To save a few cycles, we can check whether - * NT was set instead of doing an unconditional popfq. + * either was set instead of doing an unconditional popfq. * This needs to happen before enabling interrupts so that * we don't get preempted with NT set. * + * If TF is set, we will single-step all the way to here -- do_debug + * will ignore all the traps. (Yes, this is slow, but so is + * single-stepping in general. This allows us to avoid having + * a more complicated code to handle the case where a user program + * forces us to single-step through the SYSENTER entry code.) + * * NB.: .Lsysenter_fix_flags is a label with the code under it moved * out-of-line as an optimization: NT is unlikely to be set in the * majority of the cases and instead of polluting the I$ unnecessarily, * we're keeping that code behind a branch which will predict as * not-taken and therefore its instructions won't be fetched. */ - testl $X86_EFLAGS_NT, EFLAGS(%rsp) + testl $X86_EFLAGS_NT|X86_EFLAGS_AC|X86_EFLAGS_TF, EFLAGS(%rsp) jnz .Lsysenter_fix_flags .Lsysenter_flags_fixed: @@ -123,20 +132,42 @@ ENTRY(entry_SYSENTER_compat) pushq $X86_EFLAGS_FIXED popfq jmp .Lsysenter_flags_fixed +GLOBAL(__end_entry_SYSENTER_compat) ENDPROC(entry_SYSENTER_compat) /* - * 32-bit SYSCALL instruction entry. + * 32-bit SYSCALL entry. * - * 32-bit SYSCALL saves rip to rcx, clears rflags.RF, then saves rflags to r11, - * then loads new ss, cs, and rip from previously programmed MSRs. - * rflags gets masked by a value from another MSR (so CLD and CLAC - * are not needed). SYSCALL does not save anything on the stack - * and does not change rsp. + * 32-bit system calls through the vDSO's __kernel_vsyscall enter here + * on 64-bit kernels running on AMD CPUs. * - * Note: rflags saving+masking-with-MSR happens only in Long mode + * The SYSCALL instruction, in principle, should *only* occur in the + * vDSO. In practice, it appears that this really is the case. + * As evidence: + * + * - The calling convention for SYSCALL has changed several times without + * anyone noticing. + * + * - Prior to the in-kernel X86_BUG_SYSRET_SS_ATTRS fixup, anything + * user task that did SYSCALL without immediately reloading SS + * would randomly crash. + * + * - Most programmers do not directly target AMD CPUs, and the 32-bit + * SYSCALL instruction does not exist on Intel CPUs. 
Even on AMD + * CPUs, Linux disables the SYSCALL instruction on 32-bit kernels + * because the SYSCALL instruction in legacy/native 32-bit mode (as + * opposed to compat mode) is sufficiently poorly designed as to be + * essentially unusable. + * + * 32-bit SYSCALL saves RIP to RCX, clears RFLAGS.RF, then saves + * RFLAGS to R11, then loads new SS, CS, and RIP from previously + * programmed MSRs. RFLAGS gets masked by a value from another MSR + * (so CLD and CLAC are not needed). SYSCALL does not save anything on + * the stack and does not change RSP. + * + * Note: RFLAGS saving+masking-with-MSR happens only in Long mode * (in legacy 32-bit mode, IF, RF and VM bits are cleared and that's it). - * Don't get confused: rflags saving+masking depends on Long Mode Active bit + * Don't get confused: RFLAGS saving+masking depends on Long Mode Active bit * (EFER.LMA=1), NOT on bitness of userspace where SYSCALL executes * or target CS descriptor's L bit (SYSCALL does not read segment descriptors). * @@ -236,7 +267,21 @@ sysret32_from_system_call: END(entry_SYSCALL_compat) /* - * Emulated IA32 system calls via int 0x80. + * 32-bit legacy system call entry. + * + * 32-bit x86 Linux system calls traditionally used the INT $0x80 + * instruction. INT $0x80 lands here. + * + * This entry point can be used by 32-bit and 64-bit programs to perform + * 32-bit system calls. Instances of INT $0x80 can be found inline in + * various programs and libraries. It is also used by the vDSO's + * __kernel_vsyscall fallback for hardware that doesn't support a faster + * entry method. Restarted 32-bit system calls also fall back to INT + * $0x80 regardless of what instruction was originally used to do the + * system call. + * + * This is considered a slow path. It is not used by most libc + * implementations on modern hardware except during process startup. * * Arguments: * eax system call number @@ -245,17 +290,8 @@ END(entry_SYSCALL_compat) * edx arg3 * esi arg4 * edi arg5 - * ebp arg6 (note: not saved in the stack frame, should not be touched) - * - * Notes: - * Uses the same stack frame as the x86-64 version. - * All registers except eax must be saved (but ptrace may violate that). - * Arguments are zero extended. For system calls that want sign extension and - * take long arguments a wrapper is needed. Most calls can just be called - * directly. - * Assumes it is only called from user space and entered with interrupts off. + * ebp arg6 */ - ENTRY(entry_INT80_compat) /* * Interrupts are off on entry. @@ -300,7 +336,7 @@ ENTRY(entry_INT80_compat) TRACE_IRQS_OFF movq %rsp, %rdi - call do_syscall_32_irqs_off + call do_int80_syscall_32 .Lsyscall_32_done: /* Go back to user mode. 
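Because the comment above notes that INT $0x80 still appears inlined in various programs and libraries, a concrete example of such a call site may help; this is purely illustrative and not taken from the series. __NR_getpid is 20 on i386; the syscall number goes in EAX and the result comes back in EAX:

	static inline long int80_getpid(void)
	{
		long ret;

		asm volatile("int $0x80" : "=a" (ret) : "a" (20) : "memory");
		return ret;
	}

Both 32-bit and 64-bit programs can reach entry_INT80_compat this way, which is why it remains the universal fallback described above.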
*/ diff --git a/arch/x86/entry/syscall_32.c b/arch/x86/entry/syscall_32.c index 9a6649857106..8f895ee13a1c 100644 --- a/arch/x86/entry/syscall_32.c +++ b/arch/x86/entry/syscall_32.c @@ -6,17 +6,11 @@ #include #include -#ifdef CONFIG_IA32_EMULATION -#define SYM(sym, compat) compat -#else -#define SYM(sym, compat) sym -#endif - -#define __SYSCALL_I386(nr, sym, compat) extern asmlinkage long SYM(sym, compat)(unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long) ; +#define __SYSCALL_I386(nr, sym, qual) extern asmlinkage long sym(unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long) ; #include #undef __SYSCALL_I386 -#define __SYSCALL_I386(nr, sym, compat) [nr] = SYM(sym, compat), +#define __SYSCALL_I386(nr, sym, qual) [nr] = sym, extern asmlinkage long sys_ni_syscall(unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long); diff --git a/arch/x86/entry/syscall_64.c b/arch/x86/entry/syscall_64.c index 41283d22be7a..9dbc5abb6162 100644 --- a/arch/x86/entry/syscall_64.c +++ b/arch/x86/entry/syscall_64.c @@ -6,19 +6,14 @@ #include #include -#define __SYSCALL_COMMON(nr, sym, compat) __SYSCALL_64(nr, sym, compat) +#define __SYSCALL_64_QUAL_(sym) sym +#define __SYSCALL_64_QUAL_ptregs(sym) ptregs_##sym -#ifdef CONFIG_X86_X32_ABI -# define __SYSCALL_X32(nr, sym, compat) __SYSCALL_64(nr, sym, compat) -#else -# define __SYSCALL_X32(nr, sym, compat) /* nothing */ -#endif - -#define __SYSCALL_64(nr, sym, compat) extern asmlinkage long sym(unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long) ; +#define __SYSCALL_64(nr, sym, qual) extern asmlinkage long __SYSCALL_64_QUAL_##qual(sym)(unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long); #include #undef __SYSCALL_64 -#define __SYSCALL_64(nr, sym, compat) [nr] = sym, +#define __SYSCALL_64(nr, sym, qual) [nr] = __SYSCALL_64_QUAL_##qual(sym), extern long sys_ni_syscall(unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long); diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl index dc1040a50bdc..2e5b565adacc 100644 --- a/arch/x86/entry/syscalls/syscall_64.tbl +++ b/arch/x86/entry/syscalls/syscall_64.tbl @@ -21,7 +21,7 @@ 12 common brk sys_brk 13 64 rt_sigaction sys_rt_sigaction 14 common rt_sigprocmask sys_rt_sigprocmask -15 64 rt_sigreturn stub_rt_sigreturn +15 64 rt_sigreturn sys_rt_sigreturn/ptregs 16 64 ioctl sys_ioctl 17 common pread64 sys_pread64 18 common pwrite64 sys_pwrite64 @@ -62,10 +62,10 @@ 53 common socketpair sys_socketpair 54 64 setsockopt sys_setsockopt 55 64 getsockopt sys_getsockopt -56 common clone stub_clone -57 common fork stub_fork -58 common vfork stub_vfork -59 64 execve stub_execve +56 common clone sys_clone/ptregs +57 common fork sys_fork/ptregs +58 common vfork sys_vfork/ptregs +59 64 execve sys_execve/ptregs 60 common exit sys_exit 61 common wait4 sys_wait4 62 common kill sys_kill @@ -178,7 +178,7 @@ 169 common reboot sys_reboot 170 common sethostname sys_sethostname 171 common setdomainname sys_setdomainname -172 common iopl sys_iopl +172 common iopl sys_iopl/ptregs 173 common ioperm sys_ioperm 174 64 create_module 175 common init_module sys_init_module @@ -328,7 +328,7 @@ 319 common memfd_create sys_memfd_create 320 common kexec_file_load sys_kexec_file_load 321 common bpf sys_bpf -322 64 execveat stub_execveat +322 64 execveat sys_execveat/ptregs 323 common userfaultfd sys_userfaultfd 
324 common membarrier sys_membarrier 325 common mlock2 sys_mlock2 @@ -339,14 +339,14 @@ # for native 64-bit operation. # 512 x32 rt_sigaction compat_sys_rt_sigaction -513 x32 rt_sigreturn stub_x32_rt_sigreturn +513 x32 rt_sigreturn sys32_x32_rt_sigreturn 514 x32 ioctl compat_sys_ioctl 515 x32 readv compat_sys_readv 516 x32 writev compat_sys_writev 517 x32 recvfrom compat_sys_recvfrom 518 x32 sendmsg compat_sys_sendmsg 519 x32 recvmsg compat_sys_recvmsg -520 x32 execve stub_x32_execve +520 x32 execve compat_sys_execve/ptregs 521 x32 ptrace compat_sys_ptrace 522 x32 rt_sigpending compat_sys_rt_sigpending 523 x32 rt_sigtimedwait compat_sys_rt_sigtimedwait @@ -371,4 +371,4 @@ 542 x32 getsockopt compat_sys_getsockopt 543 x32 io_setup compat_sys_io_setup 544 x32 io_submit compat_sys_io_submit -545 x32 execveat stub_x32_execveat +545 x32 execveat compat_sys_execveat/ptregs diff --git a/arch/x86/entry/syscalls/syscalltbl.sh b/arch/x86/entry/syscalls/syscalltbl.sh index 0e7f8ec071e7..cd3d3015d7df 100644 --- a/arch/x86/entry/syscalls/syscalltbl.sh +++ b/arch/x86/entry/syscalls/syscalltbl.sh @@ -3,13 +3,63 @@ in="$1" out="$2" +syscall_macro() { + abi="$1" + nr="$2" + entry="$3" + + # Entry can be either just a function name or "function/qualifier" + real_entry="${entry%%/*}" + qualifier="${entry:${#real_entry}}" # Strip the function name + qualifier="${qualifier:1}" # Strip the slash, if any + + echo "__SYSCALL_${abi}($nr, $real_entry, $qualifier)" +} + +emit() { + abi="$1" + nr="$2" + entry="$3" + compat="$4" + + if [ "$abi" == "64" -a -n "$compat" ]; then + echo "a compat entry for a 64-bit syscall makes no sense" >&2 + exit 1 + fi + + if [ -z "$compat" ]; then + if [ -n "$entry" ]; then + syscall_macro "$abi" "$nr" "$entry" + fi + else + echo "#ifdef CONFIG_X86_32" + if [ -n "$entry" ]; then + syscall_macro "$abi" "$nr" "$entry" + fi + echo "#else" + syscall_macro "$abi" "$nr" "$compat" + echo "#endif" + fi +} + grep '^[0-9]' "$in" | sort -n | ( while read nr abi name entry compat; do abi=`echo "$abi" | tr '[a-z]' '[A-Z]'` - if [ -n "$compat" ]; then - echo "__SYSCALL_${abi}($nr, $entry, $compat)" - elif [ -n "$entry" ]; then - echo "__SYSCALL_${abi}($nr, $entry, $entry)" + if [ "$abi" == "COMMON" -o "$abi" == "64" ]; then + # COMMON is the same as 64, except that we don't expect X32 + # programs to use it. Our expectation has nothing to do with + # any generated code, so treat them the same. + emit 64 "$nr" "$entry" "$compat" + elif [ "$abi" == "X32" ]; then + # X32 is equivalent to 64 on an X32-compatible kernel. 
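To make the new name/qualifier syntax concrete, here is a rough sketch (not captured build output) of what the script above emits for two 64-bit entries: the read entry, which is not visible in the hunks above and carries no qualifier, and the execve entry changed to sys_execve/ptregs:

	/*
	 * "0   common  read    sys_read"           ->  __SYSCALL_64(0, sys_read, )
	 * "59  64      execve  sys_execve/ptregs"  ->  __SYSCALL_64(59, sys_execve, ptregs)
	 *
	 * With the __SYSCALL_64_QUAL_* definitions from syscall_64.c these
	 * expand to table slots [0] = sys_read and [59] = ptregs_sys_execve;
	 * in entry_64.S the same qualifier instead instantiates the
	 * ptregs_stub macro, producing the ptregs_sys_execve trampoline that
	 * routes through stub_ptregs_64.
	 */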
+ echo "#ifdef CONFIG_X86_X32_ABI" + emit 64 "$nr" "$entry" "$compat" + echo "#endif" + elif [ "$abi" == "I386" ]; then + emit "$abi" "$nr" "$entry" "$compat" + else + echo "Unknown abi $abi" >&2 + exit 1 fi done ) > "$out" diff --git a/arch/x86/entry/vdso/vdso2c.h b/arch/x86/entry/vdso/vdso2c.h index 3f69326ed545..63a03bb91497 100644 --- a/arch/x86/entry/vdso/vdso2c.h +++ b/arch/x86/entry/vdso/vdso2c.h @@ -150,16 +150,9 @@ static void BITSFUNC(go)(void *raw_addr, size_t raw_len, } fprintf(outfile, "\n};\n\n"); - fprintf(outfile, "static struct page *pages[%lu];\n\n", - mapping_size / 4096); - fprintf(outfile, "const struct vdso_image %s = {\n", name); fprintf(outfile, "\t.data = raw_data,\n"); fprintf(outfile, "\t.size = %lu,\n", mapping_size); - fprintf(outfile, "\t.text_mapping = {\n"); - fprintf(outfile, "\t\t.name = \"[vdso]\",\n"); - fprintf(outfile, "\t\t.pages = pages,\n"); - fprintf(outfile, "\t},\n"); if (alt_sec) { fprintf(outfile, "\t.alt = %lu,\n", (unsigned long)GET_LE(&alt_sec->sh_offset)); diff --git a/arch/x86/entry/vdso/vdso32-setup.c b/arch/x86/entry/vdso/vdso32-setup.c index 08a317a9ae4b..7853b53959cd 100644 --- a/arch/x86/entry/vdso/vdso32-setup.c +++ b/arch/x86/entry/vdso/vdso32-setup.c @@ -11,7 +11,6 @@ #include #include -#include #include #include diff --git a/arch/x86/entry/vdso/vdso32/system_call.S b/arch/x86/entry/vdso/vdso32/system_call.S index 3a1d9297074b..0109ac6cb79c 100644 --- a/arch/x86/entry/vdso/vdso32/system_call.S +++ b/arch/x86/entry/vdso/vdso32/system_call.S @@ -3,7 +3,7 @@ */ #include -#include +#include #include /* diff --git a/arch/x86/entry/vdso/vma.c b/arch/x86/entry/vdso/vma.c index b8f69e264ac4..10f704584922 100644 --- a/arch/x86/entry/vdso/vma.c +++ b/arch/x86/entry/vdso/vma.c @@ -20,6 +20,7 @@ #include #include #include +#include #if defined(CONFIG_X86_64) unsigned int __read_mostly vdso64_enabled = 1; @@ -27,13 +28,7 @@ unsigned int __read_mostly vdso64_enabled = 1; void __init init_vdso_image(const struct vdso_image *image) { - int i; - int npages = (image->size) / PAGE_SIZE; - BUG_ON(image->size % PAGE_SIZE != 0); - for (i = 0; i < npages; i++) - image->text_mapping.pages[i] = - virt_to_page(image->data + i*PAGE_SIZE); apply_alternatives((struct alt_instr *)(image->data + image->alt), (struct alt_instr *)(image->data + image->alt + @@ -90,18 +85,87 @@ static unsigned long vdso_addr(unsigned long start, unsigned len) #endif } +static int vdso_fault(const struct vm_special_mapping *sm, + struct vm_area_struct *vma, struct vm_fault *vmf) +{ + const struct vdso_image *image = vma->vm_mm->context.vdso_image; + + if (!image || (vmf->pgoff << PAGE_SHIFT) >= image->size) + return VM_FAULT_SIGBUS; + + vmf->page = virt_to_page(image->data + (vmf->pgoff << PAGE_SHIFT)); + get_page(vmf->page); + return 0; +} + +static const struct vm_special_mapping text_mapping = { + .name = "[vdso]", + .fault = vdso_fault, +}; + +static int vvar_fault(const struct vm_special_mapping *sm, + struct vm_area_struct *vma, struct vm_fault *vmf) +{ + const struct vdso_image *image = vma->vm_mm->context.vdso_image; + long sym_offset; + int ret = -EFAULT; + + if (!image) + return VM_FAULT_SIGBUS; + + sym_offset = (long)(vmf->pgoff << PAGE_SHIFT) + + image->sym_vvar_start; + + /* + * Sanity check: a symbol offset of zero means that the page + * does not exist for this vdso image, not that the page is at + * offset zero relative to the text mapping. This should be + * impossible here, because sym_offset should only be zero for + * the page past the end of the vvar mapping. 
+ */ + if (sym_offset == 0) + return VM_FAULT_SIGBUS; + + if (sym_offset == image->sym_vvar_page) { + ret = vm_insert_pfn(vma, (unsigned long)vmf->virtual_address, + __pa_symbol(&__vvar_page) >> PAGE_SHIFT); + } else if (sym_offset == image->sym_hpet_page) { +#ifdef CONFIG_HPET_TIMER + if (hpet_address && vclock_was_used(VCLOCK_HPET)) { + ret = vm_insert_pfn_prot( + vma, + (unsigned long)vmf->virtual_address, + hpet_address >> PAGE_SHIFT, + pgprot_noncached(PAGE_READONLY)); + } +#endif + } else if (sym_offset == image->sym_pvclock_page) { + struct pvclock_vsyscall_time_info *pvti = + pvclock_pvti_cpu0_va(); + if (pvti && vclock_was_used(VCLOCK_PVCLOCK)) { + ret = vm_insert_pfn( + vma, + (unsigned long)vmf->virtual_address, + __pa(pvti) >> PAGE_SHIFT); + } + } + + if (ret == 0 || ret == -EBUSY) + return VM_FAULT_NOPAGE; + + return VM_FAULT_SIGBUS; +} + static int map_vdso(const struct vdso_image *image, bool calculate_addr) { struct mm_struct *mm = current->mm; struct vm_area_struct *vma; unsigned long addr, text_start; int ret = 0; - static struct page *no_pages[] = {NULL}; - static struct vm_special_mapping vvar_mapping = { + static const struct vm_special_mapping vvar_mapping = { .name = "[vvar]", - .pages = no_pages, + .fault = vvar_fault, }; - struct pvclock_vsyscall_time_info *pvti; if (calculate_addr) { addr = vdso_addr(current->mm->start_stack, @@ -121,6 +185,7 @@ static int map_vdso(const struct vdso_image *image, bool calculate_addr) text_start = addr - image->sym_vvar_start; current->mm->context.vdso = (void __user *)text_start; + current->mm->context.vdso_image = image; /* * MAYWRITE to allow gdb to COW and set breakpoints @@ -130,7 +195,7 @@ static int map_vdso(const struct vdso_image *image, bool calculate_addr) image->size, VM_READ|VM_EXEC| VM_MAYREAD|VM_MAYWRITE|VM_MAYEXEC, - &image->text_mapping); + &text_mapping); if (IS_ERR(vma)) { ret = PTR_ERR(vma); @@ -140,7 +205,8 @@ static int map_vdso(const struct vdso_image *image, bool calculate_addr) vma = _install_special_mapping(mm, addr, -image->sym_vvar_start, - VM_READ|VM_MAYREAD, + VM_READ|VM_MAYREAD|VM_IO|VM_DONTDUMP| + VM_PFNMAP, &vvar_mapping); if (IS_ERR(vma)) { @@ -148,41 +214,6 @@ static int map_vdso(const struct vdso_image *image, bool calculate_addr) goto up_fail; } - if (image->sym_vvar_page) - ret = remap_pfn_range(vma, - text_start + image->sym_vvar_page, - __pa_symbol(&__vvar_page) >> PAGE_SHIFT, - PAGE_SIZE, - PAGE_READONLY); - - if (ret) - goto up_fail; - -#ifdef CONFIG_HPET_TIMER - if (hpet_address && image->sym_hpet_page) { - ret = io_remap_pfn_range(vma, - text_start + image->sym_hpet_page, - hpet_address >> PAGE_SHIFT, - PAGE_SIZE, - pgprot_noncached(PAGE_READONLY)); - - if (ret) - goto up_fail; - } -#endif - - pvti = pvclock_pvti_cpu0_va(); - if (pvti && image->sym_pvclock_page) { - ret = remap_pfn_range(vma, - text_start + image->sym_pvclock_page, - __pa(pvti) >> PAGE_SHIFT, - PAGE_SIZE, - PAGE_READONLY); - - if (ret) - goto up_fail; - } - up_fail: if (ret) current->mm->context.vdso = NULL; @@ -254,7 +285,7 @@ static void vgetcpu_cpu_init(void *arg) #ifdef CONFIG_NUMA node = cpu_to_node(cpu); #endif - if (cpu_has(&cpu_data(cpu), X86_FEATURE_RDTSCP)) + if (static_cpu_has(X86_FEATURE_RDTSCP)) write_rdtscp_aux((node << 12) | cpu); /* diff --git a/arch/x86/entry/vsyscall/vsyscall_gtod.c b/arch/x86/entry/vsyscall/vsyscall_gtod.c index 51e330416995..0fb3a104ac62 100644 --- a/arch/x86/entry/vsyscall/vsyscall_gtod.c +++ b/arch/x86/entry/vsyscall/vsyscall_gtod.c @@ -16,6 +16,8 @@ #include #include +int 
vclocks_used __read_mostly; + DEFINE_VVAR(struct vsyscall_gtod_data, vsyscall_gtod_data); void update_vsyscall_tz(void) @@ -26,12 +28,17 @@ void update_vsyscall_tz(void) void update_vsyscall(struct timekeeper *tk) { + int vclock_mode = tk->tkr_mono.clock->archdata.vclock_mode; struct vsyscall_gtod_data *vdata = &vsyscall_gtod_data; + /* Mark the new vclock used. */ + BUILD_BUG_ON(VCLOCK_MAX >= 32); + WRITE_ONCE(vclocks_used, READ_ONCE(vclocks_used) | (1 << vclock_mode)); + gtod_write_begin(vdata); /* copy vsyscall data */ - vdata->vclock_mode = tk->tkr_mono.clock->archdata.vclock_mode; + vdata->vclock_mode = vclock_mode; vdata->cycle_last = tk->tkr_mono.cycle_last; vdata->mask = tk->tkr_mono.mask; vdata->mult = tk->tkr_mono.mult; diff --git a/arch/x86/include/asm/alternative.h b/arch/x86/include/asm/alternative.h index 7bfc85bbb8ff..99afb665a004 100644 --- a/arch/x86/include/asm/alternative.h +++ b/arch/x86/include/asm/alternative.h @@ -151,12 +151,6 @@ static inline int alternatives_text_reserved(void *start, void *end) ALTINSTR_REPLACEMENT(newinstr2, feature2, 2) \ ".popsection" -/* - * This must be included *after* the definition of ALTERNATIVE due to - * - */ -#include - /* * Alternative instructions for different CPU types or capabilities. * diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h index c80f6b6f3da2..0899cfc8dfe8 100644 --- a/arch/x86/include/asm/apic.h +++ b/arch/x86/include/asm/apic.h @@ -6,7 +6,6 @@ #include #include -#include #include #include #include diff --git a/arch/x86/include/asm/arch_hweight.h b/arch/x86/include/asm/arch_hweight.h index 259a7c1ef709..02e799fa43d1 100644 --- a/arch/x86/include/asm/arch_hweight.h +++ b/arch/x86/include/asm/arch_hweight.h @@ -1,6 +1,8 @@ #ifndef _ASM_X86_HWEIGHT_H #define _ASM_X86_HWEIGHT_H +#include + #ifdef CONFIG_64BIT /* popcnt %edi, %eax -- redundant REX prefix for alignment */ #define POPCNT32 ".byte 0xf3,0x40,0x0f,0xb8,0xc7" diff --git a/arch/x86/include/asm/bitops.h b/arch/x86/include/asm/bitops.h index cfe3b954d5e4..7766d1cf096e 100644 --- a/arch/x86/include/asm/bitops.h +++ b/arch/x86/include/asm/bitops.h @@ -91,7 +91,7 @@ set_bit(long nr, volatile unsigned long *addr) * If it's called on the same region of memory simultaneously, the effect * may be that only one operation succeeds. */ -static inline void __set_bit(long nr, volatile unsigned long *addr) +static __always_inline void __set_bit(long nr, volatile unsigned long *addr) { asm volatile("bts %1,%0" : ADDR : "Ir" (nr) : "memory"); } @@ -128,13 +128,13 @@ clear_bit(long nr, volatile unsigned long *addr) * clear_bit() is atomic and implies release semantics before the memory * operation. It can be used for an unlock. */ -static inline void clear_bit_unlock(long nr, volatile unsigned long *addr) +static __always_inline void clear_bit_unlock(long nr, volatile unsigned long *addr) { barrier(); clear_bit(nr, addr); } -static inline void __clear_bit(long nr, volatile unsigned long *addr) +static __always_inline void __clear_bit(long nr, volatile unsigned long *addr) { asm volatile("btr %1,%0" : ADDR : "Ir" (nr)); } @@ -151,7 +151,7 @@ static inline void __clear_bit(long nr, volatile unsigned long *addr) * No memory barrier is required here, because x86 cannot reorder stores past * older loads. Same principle as spin_unlock. 
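Returning to the vclocks_used bitmask added in vsyscall_gtod.c above: the vclock_was_used() helper that the new vvar fault handler calls is not part of this excerpt, but given the bit set by update_vsyscall() it presumably reduces to a test of that same bit. A guess at its shape, for orientation only:

	static inline bool vclock_was_used(int vclock)
	{
		return vclocks_used & (1 << vclock);
	}

The point of tracking this is visible in vvar_fault(): the HPET and pvclock pages are only inserted if the corresponding vclock mode has ever been active.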
*/ -static inline void __clear_bit_unlock(long nr, volatile unsigned long *addr) +static __always_inline void __clear_bit_unlock(long nr, volatile unsigned long *addr) { barrier(); __clear_bit(nr, addr); @@ -166,7 +166,7 @@ static inline void __clear_bit_unlock(long nr, volatile unsigned long *addr) * If it's called on the same region of memory simultaneously, the effect * may be that only one operation succeeds. */ -static inline void __change_bit(long nr, volatile unsigned long *addr) +static __always_inline void __change_bit(long nr, volatile unsigned long *addr) { asm volatile("btc %1,%0" : ADDR : "Ir" (nr)); } @@ -180,7 +180,7 @@ static inline void __change_bit(long nr, volatile unsigned long *addr) * Note that @nr may be almost arbitrarily large; this function is not * restricted to acting on a single-word quantity. */ -static inline void change_bit(long nr, volatile unsigned long *addr) +static __always_inline void change_bit(long nr, volatile unsigned long *addr) { if (IS_IMMEDIATE(nr)) { asm volatile(LOCK_PREFIX "xorb %1,%0" @@ -201,7 +201,7 @@ static inline void change_bit(long nr, volatile unsigned long *addr) * This operation is atomic and cannot be reordered. * It also implies a memory barrier. */ -static inline int test_and_set_bit(long nr, volatile unsigned long *addr) +static __always_inline int test_and_set_bit(long nr, volatile unsigned long *addr) { GEN_BINARY_RMWcc(LOCK_PREFIX "bts", *addr, "Ir", nr, "%0", "c"); } @@ -228,7 +228,7 @@ test_and_set_bit_lock(long nr, volatile unsigned long *addr) * If two examples of this operation race, one can appear to succeed * but actually fail. You must protect multiple accesses with a lock. */ -static inline int __test_and_set_bit(long nr, volatile unsigned long *addr) +static __always_inline int __test_and_set_bit(long nr, volatile unsigned long *addr) { int oldbit; @@ -247,7 +247,7 @@ static inline int __test_and_set_bit(long nr, volatile unsigned long *addr) * This operation is atomic and cannot be reordered. * It also implies a memory barrier. */ -static inline int test_and_clear_bit(long nr, volatile unsigned long *addr) +static __always_inline int test_and_clear_bit(long nr, volatile unsigned long *addr) { GEN_BINARY_RMWcc(LOCK_PREFIX "btr", *addr, "Ir", nr, "%0", "c"); } @@ -268,7 +268,7 @@ static inline int test_and_clear_bit(long nr, volatile unsigned long *addr) * accessed from a hypervisor on the same CPU if running in a VM: don't change * this without also updating arch/x86/kernel/kvm.c */ -static inline int __test_and_clear_bit(long nr, volatile unsigned long *addr) +static __always_inline int __test_and_clear_bit(long nr, volatile unsigned long *addr) { int oldbit; @@ -280,7 +280,7 @@ static inline int __test_and_clear_bit(long nr, volatile unsigned long *addr) } /* WARNING: non atomic and it can be reordered! */ -static inline int __test_and_change_bit(long nr, volatile unsigned long *addr) +static __always_inline int __test_and_change_bit(long nr, volatile unsigned long *addr) { int oldbit; @@ -300,7 +300,7 @@ static inline int __test_and_change_bit(long nr, volatile unsigned long *addr) * This operation is atomic and cannot be reordered. * It also implies a memory barrier. 
*/ -static inline int test_and_change_bit(long nr, volatile unsigned long *addr) +static __always_inline int test_and_change_bit(long nr, volatile unsigned long *addr) { GEN_BINARY_RMWcc(LOCK_PREFIX "btc", *addr, "Ir", nr, "%0", "c"); } @@ -311,7 +311,7 @@ static __always_inline int constant_test_bit(long nr, const volatile unsigned lo (addr[nr >> _BITOPS_LONG_SHIFT])) != 0; } -static inline int variable_test_bit(long nr, volatile const unsigned long *addr) +static __always_inline int variable_test_bit(long nr, volatile const unsigned long *addr) { int oldbit; @@ -343,7 +343,7 @@ static int test_bit(int nr, const volatile unsigned long *addr); * * Undefined if no bit exists, so code should check against 0 first. */ -static inline unsigned long __ffs(unsigned long word) +static __always_inline unsigned long __ffs(unsigned long word) { asm("rep; bsf %1,%0" : "=r" (word) @@ -357,7 +357,7 @@ static inline unsigned long __ffs(unsigned long word) * * Undefined if no zero exists, so code should check against ~0UL first. */ -static inline unsigned long ffz(unsigned long word) +static __always_inline unsigned long ffz(unsigned long word) { asm("rep; bsf %1,%0" : "=r" (word) @@ -371,7 +371,7 @@ static inline unsigned long ffz(unsigned long word) * * Undefined if no set bit exists, so code should check against 0 first. */ -static inline unsigned long __fls(unsigned long word) +static __always_inline unsigned long __fls(unsigned long word) { asm("bsr %1,%0" : "=r" (word) @@ -393,7 +393,7 @@ static inline unsigned long __fls(unsigned long word) * set bit if value is nonzero. The first (least significant) bit * is at position 1. */ -static inline int ffs(int x) +static __always_inline int ffs(int x) { int r; @@ -434,7 +434,7 @@ static inline int ffs(int x) * set bit if value is nonzero. The last (most significant) bit is * at position 32. */ -static inline int fls(int x) +static __always_inline int fls(int x) { int r; diff --git a/arch/x86/include/asm/clocksource.h b/arch/x86/include/asm/clocksource.h index eda81dc0f4ae..d194266acb28 100644 --- a/arch/x86/include/asm/clocksource.h +++ b/arch/x86/include/asm/clocksource.h @@ -3,10 +3,11 @@ #ifndef _ASM_X86_CLOCKSOURCE_H #define _ASM_X86_CLOCKSOURCE_H -#define VCLOCK_NONE 0 /* No vDSO clock available. */ -#define VCLOCK_TSC 1 /* vDSO should use vread_tsc. */ -#define VCLOCK_HPET 2 /* vDSO should use vread_hpet. */ -#define VCLOCK_PVCLOCK 3 /* vDSO should use vread_pvclock. */ +#define VCLOCK_NONE 0 /* No vDSO clock available. */ +#define VCLOCK_TSC 1 /* vDSO should use vread_tsc. */ +#define VCLOCK_HPET 2 /* vDSO should use vread_hpet. */ +#define VCLOCK_PVCLOCK 3 /* vDSO should use vread_pvclock. 
*/ +#define VCLOCK_MAX 3 struct arch_clocksource_data { int vclock_mode; diff --git a/arch/x86/include/asm/cmpxchg.h b/arch/x86/include/asm/cmpxchg.h index ad19841eddfe..9733361fed6f 100644 --- a/arch/x86/include/asm/cmpxchg.h +++ b/arch/x86/include/asm/cmpxchg.h @@ -2,6 +2,7 @@ #define ASM_X86_CMPXCHG_H #include +#include #include /* Provides LOCK_PREFIX */ /* diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h index 7ad8c9464297..68e4e8258b84 100644 --- a/arch/x86/include/asm/cpufeature.h +++ b/arch/x86/include/asm/cpufeature.h @@ -1,288 +1,7 @@ -/* - * Defines x86 CPU feature bits - */ #ifndef _ASM_X86_CPUFEATURE_H #define _ASM_X86_CPUFEATURE_H -#ifndef _ASM_X86_REQUIRED_FEATURES_H -#include -#endif - -#ifndef _ASM_X86_DISABLED_FEATURES_H -#include -#endif - -#define NCAPINTS 16 /* N 32-bit words worth of info */ -#define NBUGINTS 1 /* N 32-bit bug flags */ - -/* - * Note: If the comment begins with a quoted string, that string is used - * in /proc/cpuinfo instead of the macro name. If the string is "", - * this feature bit is not displayed in /proc/cpuinfo at all. - */ - -/* Intel-defined CPU features, CPUID level 0x00000001 (edx), word 0 */ -#define X86_FEATURE_FPU ( 0*32+ 0) /* Onboard FPU */ -#define X86_FEATURE_VME ( 0*32+ 1) /* Virtual Mode Extensions */ -#define X86_FEATURE_DE ( 0*32+ 2) /* Debugging Extensions */ -#define X86_FEATURE_PSE ( 0*32+ 3) /* Page Size Extensions */ -#define X86_FEATURE_TSC ( 0*32+ 4) /* Time Stamp Counter */ -#define X86_FEATURE_MSR ( 0*32+ 5) /* Model-Specific Registers */ -#define X86_FEATURE_PAE ( 0*32+ 6) /* Physical Address Extensions */ -#define X86_FEATURE_MCE ( 0*32+ 7) /* Machine Check Exception */ -#define X86_FEATURE_CX8 ( 0*32+ 8) /* CMPXCHG8 instruction */ -#define X86_FEATURE_APIC ( 0*32+ 9) /* Onboard APIC */ -#define X86_FEATURE_SEP ( 0*32+11) /* SYSENTER/SYSEXIT */ -#define X86_FEATURE_MTRR ( 0*32+12) /* Memory Type Range Registers */ -#define X86_FEATURE_PGE ( 0*32+13) /* Page Global Enable */ -#define X86_FEATURE_MCA ( 0*32+14) /* Machine Check Architecture */ -#define X86_FEATURE_CMOV ( 0*32+15) /* CMOV instructions */ - /* (plus FCMOVcc, FCOMI with FPU) */ -#define X86_FEATURE_PAT ( 0*32+16) /* Page Attribute Table */ -#define X86_FEATURE_PSE36 ( 0*32+17) /* 36-bit PSEs */ -#define X86_FEATURE_PN ( 0*32+18) /* Processor serial number */ -#define X86_FEATURE_CLFLUSH ( 0*32+19) /* CLFLUSH instruction */ -#define X86_FEATURE_DS ( 0*32+21) /* "dts" Debug Store */ -#define X86_FEATURE_ACPI ( 0*32+22) /* ACPI via MSR */ -#define X86_FEATURE_MMX ( 0*32+23) /* Multimedia Extensions */ -#define X86_FEATURE_FXSR ( 0*32+24) /* FXSAVE/FXRSTOR, CR4.OSFXSR */ -#define X86_FEATURE_XMM ( 0*32+25) /* "sse" */ -#define X86_FEATURE_XMM2 ( 0*32+26) /* "sse2" */ -#define X86_FEATURE_SELFSNOOP ( 0*32+27) /* "ss" CPU self snoop */ -#define X86_FEATURE_HT ( 0*32+28) /* Hyper-Threading */ -#define X86_FEATURE_ACC ( 0*32+29) /* "tm" Automatic clock control */ -#define X86_FEATURE_IA64 ( 0*32+30) /* IA-64 processor */ -#define X86_FEATURE_PBE ( 0*32+31) /* Pending Break Enable */ - -/* AMD-defined CPU features, CPUID level 0x80000001, word 1 */ -/* Don't duplicate feature flags which are redundant with Intel! */ -#define X86_FEATURE_SYSCALL ( 1*32+11) /* SYSCALL/SYSRET */ -#define X86_FEATURE_MP ( 1*32+19) /* MP Capable. 
*/ -#define X86_FEATURE_NX ( 1*32+20) /* Execute Disable */ -#define X86_FEATURE_MMXEXT ( 1*32+22) /* AMD MMX extensions */ -#define X86_FEATURE_FXSR_OPT ( 1*32+25) /* FXSAVE/FXRSTOR optimizations */ -#define X86_FEATURE_GBPAGES ( 1*32+26) /* "pdpe1gb" GB pages */ -#define X86_FEATURE_RDTSCP ( 1*32+27) /* RDTSCP */ -#define X86_FEATURE_LM ( 1*32+29) /* Long Mode (x86-64) */ -#define X86_FEATURE_3DNOWEXT ( 1*32+30) /* AMD 3DNow! extensions */ -#define X86_FEATURE_3DNOW ( 1*32+31) /* 3DNow! */ - -/* Transmeta-defined CPU features, CPUID level 0x80860001, word 2 */ -#define X86_FEATURE_RECOVERY ( 2*32+ 0) /* CPU in recovery mode */ -#define X86_FEATURE_LONGRUN ( 2*32+ 1) /* Longrun power control */ -#define X86_FEATURE_LRTI ( 2*32+ 3) /* LongRun table interface */ - -/* Other features, Linux-defined mapping, word 3 */ -/* This range is used for feature bits which conflict or are synthesized */ -#define X86_FEATURE_CXMMX ( 3*32+ 0) /* Cyrix MMX extensions */ -#define X86_FEATURE_K6_MTRR ( 3*32+ 1) /* AMD K6 nonstandard MTRRs */ -#define X86_FEATURE_CYRIX_ARR ( 3*32+ 2) /* Cyrix ARRs (= MTRRs) */ -#define X86_FEATURE_CENTAUR_MCR ( 3*32+ 3) /* Centaur MCRs (= MTRRs) */ -/* cpu types for specific tunings: */ -#define X86_FEATURE_K8 ( 3*32+ 4) /* "" Opteron, Athlon64 */ -#define X86_FEATURE_K7 ( 3*32+ 5) /* "" Athlon */ -#define X86_FEATURE_P3 ( 3*32+ 6) /* "" P3 */ -#define X86_FEATURE_P4 ( 3*32+ 7) /* "" P4 */ -#define X86_FEATURE_CONSTANT_TSC ( 3*32+ 8) /* TSC ticks at a constant rate */ -#define X86_FEATURE_UP ( 3*32+ 9) /* smp kernel running on up */ -/* free, was #define X86_FEATURE_FXSAVE_LEAK ( 3*32+10) * "" FXSAVE leaks FOP/FIP/FOP */ -#define X86_FEATURE_ARCH_PERFMON ( 3*32+11) /* Intel Architectural PerfMon */ -#define X86_FEATURE_PEBS ( 3*32+12) /* Precise-Event Based Sampling */ -#define X86_FEATURE_BTS ( 3*32+13) /* Branch Trace Store */ -#define X86_FEATURE_SYSCALL32 ( 3*32+14) /* "" syscall in ia32 userspace */ -#define X86_FEATURE_SYSENTER32 ( 3*32+15) /* "" sysenter in ia32 userspace */ -#define X86_FEATURE_REP_GOOD ( 3*32+16) /* rep microcode works well */ -#define X86_FEATURE_MFENCE_RDTSC ( 3*32+17) /* "" Mfence synchronizes RDTSC */ -#define X86_FEATURE_LFENCE_RDTSC ( 3*32+18) /* "" Lfence synchronizes RDTSC */ -/* free, was #define X86_FEATURE_11AP ( 3*32+19) * "" Bad local APIC aka 11AP */ -#define X86_FEATURE_NOPL ( 3*32+20) /* The NOPL (0F 1F) instructions */ -#define X86_FEATURE_ALWAYS ( 3*32+21) /* "" Always-present feature */ -#define X86_FEATURE_XTOPOLOGY ( 3*32+22) /* cpu topology enum extensions */ -#define X86_FEATURE_TSC_RELIABLE ( 3*32+23) /* TSC is known to be reliable */ -#define X86_FEATURE_NONSTOP_TSC ( 3*32+24) /* TSC does not stop in C states */ -/* free, was #define X86_FEATURE_CLFLUSH_MONITOR ( 3*32+25) * "" clflush reqd with monitor */ -#define X86_FEATURE_EXTD_APICID ( 3*32+26) /* has extended APICID (8 bits) */ -#define X86_FEATURE_AMD_DCM ( 3*32+27) /* multi-node processor */ -#define X86_FEATURE_APERFMPERF ( 3*32+28) /* APERFMPERF */ -#define X86_FEATURE_EAGER_FPU ( 3*32+29) /* "eagerfpu" Non lazy FPU restore */ -#define X86_FEATURE_NONSTOP_TSC_S3 ( 3*32+30) /* TSC doesn't stop in S3 state */ - -/* Intel-defined CPU features, CPUID level 0x00000001 (ecx), word 4 */ -#define X86_FEATURE_XMM3 ( 4*32+ 0) /* "pni" SSE-3 */ -#define X86_FEATURE_PCLMULQDQ ( 4*32+ 1) /* PCLMULQDQ instruction */ -#define X86_FEATURE_DTES64 ( 4*32+ 2) /* 64-bit Debug Store */ -#define X86_FEATURE_MWAIT ( 4*32+ 3) /* "monitor" Monitor/Mwait support */ -#define X86_FEATURE_DSCPL 
( 4*32+ 4) /* "ds_cpl" CPL Qual. Debug Store */ -#define X86_FEATURE_VMX ( 4*32+ 5) /* Hardware virtualization */ -#define X86_FEATURE_SMX ( 4*32+ 6) /* Safer mode */ -#define X86_FEATURE_EST ( 4*32+ 7) /* Enhanced SpeedStep */ -#define X86_FEATURE_TM2 ( 4*32+ 8) /* Thermal Monitor 2 */ -#define X86_FEATURE_SSSE3 ( 4*32+ 9) /* Supplemental SSE-3 */ -#define X86_FEATURE_CID ( 4*32+10) /* Context ID */ -#define X86_FEATURE_SDBG ( 4*32+11) /* Silicon Debug */ -#define X86_FEATURE_FMA ( 4*32+12) /* Fused multiply-add */ -#define X86_FEATURE_CX16 ( 4*32+13) /* CMPXCHG16B */ -#define X86_FEATURE_XTPR ( 4*32+14) /* Send Task Priority Messages */ -#define X86_FEATURE_PDCM ( 4*32+15) /* Performance Capabilities */ -#define X86_FEATURE_PCID ( 4*32+17) /* Process Context Identifiers */ -#define X86_FEATURE_DCA ( 4*32+18) /* Direct Cache Access */ -#define X86_FEATURE_XMM4_1 ( 4*32+19) /* "sse4_1" SSE-4.1 */ -#define X86_FEATURE_XMM4_2 ( 4*32+20) /* "sse4_2" SSE-4.2 */ -#define X86_FEATURE_X2APIC ( 4*32+21) /* x2APIC */ -#define X86_FEATURE_MOVBE ( 4*32+22) /* MOVBE instruction */ -#define X86_FEATURE_POPCNT ( 4*32+23) /* POPCNT instruction */ -#define X86_FEATURE_TSC_DEADLINE_TIMER ( 4*32+24) /* Tsc deadline timer */ -#define X86_FEATURE_AES ( 4*32+25) /* AES instructions */ -#define X86_FEATURE_XSAVE ( 4*32+26) /* XSAVE/XRSTOR/XSETBV/XGETBV */ -#define X86_FEATURE_OSXSAVE ( 4*32+27) /* "" XSAVE enabled in the OS */ -#define X86_FEATURE_AVX ( 4*32+28) /* Advanced Vector Extensions */ -#define X86_FEATURE_F16C ( 4*32+29) /* 16-bit fp conversions */ -#define X86_FEATURE_RDRAND ( 4*32+30) /* The RDRAND instruction */ -#define X86_FEATURE_HYPERVISOR ( 4*32+31) /* Running on a hypervisor */ - -/* VIA/Cyrix/Centaur-defined CPU features, CPUID level 0xC0000001, word 5 */ -#define X86_FEATURE_XSTORE ( 5*32+ 2) /* "rng" RNG present (xstore) */ -#define X86_FEATURE_XSTORE_EN ( 5*32+ 3) /* "rng_en" RNG enabled */ -#define X86_FEATURE_XCRYPT ( 5*32+ 6) /* "ace" on-CPU crypto (xcrypt) */ -#define X86_FEATURE_XCRYPT_EN ( 5*32+ 7) /* "ace_en" on-CPU crypto enabled */ -#define X86_FEATURE_ACE2 ( 5*32+ 8) /* Advanced Cryptography Engine v2 */ -#define X86_FEATURE_ACE2_EN ( 5*32+ 9) /* ACE v2 enabled */ -#define X86_FEATURE_PHE ( 5*32+10) /* PadLock Hash Engine */ -#define X86_FEATURE_PHE_EN ( 5*32+11) /* PHE enabled */ -#define X86_FEATURE_PMM ( 5*32+12) /* PadLock Montgomery Multiplier */ -#define X86_FEATURE_PMM_EN ( 5*32+13) /* PMM enabled */ - -/* More extended AMD flags: CPUID level 0x80000001, ecx, word 6 */ -#define X86_FEATURE_LAHF_LM ( 6*32+ 0) /* LAHF/SAHF in long mode */ -#define X86_FEATURE_CMP_LEGACY ( 6*32+ 1) /* If yes HyperThreading not valid */ -#define X86_FEATURE_SVM ( 6*32+ 2) /* Secure virtual machine */ -#define X86_FEATURE_EXTAPIC ( 6*32+ 3) /* Extended APIC space */ -#define X86_FEATURE_CR8_LEGACY ( 6*32+ 4) /* CR8 in 32-bit mode */ -#define X86_FEATURE_ABM ( 6*32+ 5) /* Advanced bit manipulation */ -#define X86_FEATURE_SSE4A ( 6*32+ 6) /* SSE-4A */ -#define X86_FEATURE_MISALIGNSSE ( 6*32+ 7) /* Misaligned SSE mode */ -#define X86_FEATURE_3DNOWPREFETCH ( 6*32+ 8) /* 3DNow prefetch instructions */ -#define X86_FEATURE_OSVW ( 6*32+ 9) /* OS Visible Workaround */ -#define X86_FEATURE_IBS ( 6*32+10) /* Instruction Based Sampling */ -#define X86_FEATURE_XOP ( 6*32+11) /* extended AVX instructions */ -#define X86_FEATURE_SKINIT ( 6*32+12) /* SKINIT/STGI instructions */ -#define X86_FEATURE_WDT ( 6*32+13) /* Watchdog timer */ -#define X86_FEATURE_LWP ( 6*32+15) /* Light Weight Profiling */ -#define 
X86_FEATURE_FMA4 ( 6*32+16) /* 4 operands MAC instructions */ -#define X86_FEATURE_TCE ( 6*32+17) /* translation cache extension */ -#define X86_FEATURE_NODEID_MSR ( 6*32+19) /* NodeId MSR */ -#define X86_FEATURE_TBM ( 6*32+21) /* trailing bit manipulations */ -#define X86_FEATURE_TOPOEXT ( 6*32+22) /* topology extensions CPUID leafs */ -#define X86_FEATURE_PERFCTR_CORE ( 6*32+23) /* core performance counter extensions */ -#define X86_FEATURE_PERFCTR_NB ( 6*32+24) /* NB performance counter extensions */ -#define X86_FEATURE_BPEXT (6*32+26) /* data breakpoint extension */ -#define X86_FEATURE_PERFCTR_L2 ( 6*32+28) /* L2 performance counter extensions */ -#define X86_FEATURE_MWAITX ( 6*32+29) /* MWAIT extension (MONITORX/MWAITX) */ - -/* - * Auxiliary flags: Linux defined - For features scattered in various - * CPUID levels like 0x6, 0xA etc, word 7. - * - * Reuse free bits when adding new feature flags! - */ - -#define X86_FEATURE_CPB ( 7*32+ 2) /* AMD Core Performance Boost */ -#define X86_FEATURE_EPB ( 7*32+ 3) /* IA32_ENERGY_PERF_BIAS support */ - -#define X86_FEATURE_HW_PSTATE ( 7*32+ 8) /* AMD HW-PState */ -#define X86_FEATURE_PROC_FEEDBACK ( 7*32+ 9) /* AMD ProcFeedbackInterface */ - -#define X86_FEATURE_INTEL_PT ( 7*32+15) /* Intel Processor Trace */ - -/* Virtualization flags: Linux defined, word 8 */ -#define X86_FEATURE_TPR_SHADOW ( 8*32+ 0) /* Intel TPR Shadow */ -#define X86_FEATURE_VNMI ( 8*32+ 1) /* Intel Virtual NMI */ -#define X86_FEATURE_FLEXPRIORITY ( 8*32+ 2) /* Intel FlexPriority */ -#define X86_FEATURE_EPT ( 8*32+ 3) /* Intel Extended Page Table */ -#define X86_FEATURE_VPID ( 8*32+ 4) /* Intel Virtual Processor ID */ - -#define X86_FEATURE_VMMCALL ( 8*32+15) /* Prefer vmmcall to vmcall */ -#define X86_FEATURE_XENPV ( 8*32+16) /* "" Xen paravirtual guest */ - - -/* Intel-defined CPU features, CPUID level 0x00000007:0 (ebx), word 9 */ -#define X86_FEATURE_FSGSBASE ( 9*32+ 0) /* {RD/WR}{FS/GS}BASE instructions*/ -#define X86_FEATURE_TSC_ADJUST ( 9*32+ 1) /* TSC adjustment MSR 0x3b */ -#define X86_FEATURE_BMI1 ( 9*32+ 3) /* 1st group bit manipulation extensions */ -#define X86_FEATURE_HLE ( 9*32+ 4) /* Hardware Lock Elision */ -#define X86_FEATURE_AVX2 ( 9*32+ 5) /* AVX2 instructions */ -#define X86_FEATURE_SMEP ( 9*32+ 7) /* Supervisor Mode Execution Protection */ -#define X86_FEATURE_BMI2 ( 9*32+ 8) /* 2nd group bit manipulation extensions */ -#define X86_FEATURE_ERMS ( 9*32+ 9) /* Enhanced REP MOVSB/STOSB */ -#define X86_FEATURE_INVPCID ( 9*32+10) /* Invalidate Processor Context ID */ -#define X86_FEATURE_RTM ( 9*32+11) /* Restricted Transactional Memory */ -#define X86_FEATURE_CQM ( 9*32+12) /* Cache QoS Monitoring */ -#define X86_FEATURE_MPX ( 9*32+14) /* Memory Protection Extension */ -#define X86_FEATURE_AVX512F ( 9*32+16) /* AVX-512 Foundation */ -#define X86_FEATURE_RDSEED ( 9*32+18) /* The RDSEED instruction */ -#define X86_FEATURE_ADX ( 9*32+19) /* The ADCX and ADOX instructions */ -#define X86_FEATURE_SMAP ( 9*32+20) /* Supervisor Mode Access Prevention */ -#define X86_FEATURE_PCOMMIT ( 9*32+22) /* PCOMMIT instruction */ -#define X86_FEATURE_CLFLUSHOPT ( 9*32+23) /* CLFLUSHOPT instruction */ -#define X86_FEATURE_CLWB ( 9*32+24) /* CLWB instruction */ -#define X86_FEATURE_AVX512PF ( 9*32+26) /* AVX-512 Prefetch */ -#define X86_FEATURE_AVX512ER ( 9*32+27) /* AVX-512 Exponential and Reciprocal */ -#define X86_FEATURE_AVX512CD ( 9*32+28) /* AVX-512 Conflict Detection */ -#define X86_FEATURE_SHA_NI ( 9*32+29) /* SHA1/SHA256 Instruction Extensions */ - -/* Extended 
state features, CPUID level 0x0000000d:1 (eax), word 10 */ -#define X86_FEATURE_XSAVEOPT (10*32+ 0) /* XSAVEOPT */ -#define X86_FEATURE_XSAVEC (10*32+ 1) /* XSAVEC */ -#define X86_FEATURE_XGETBV1 (10*32+ 2) /* XGETBV with ECX = 1 */ -#define X86_FEATURE_XSAVES (10*32+ 3) /* XSAVES/XRSTORS */ - -/* Intel-defined CPU QoS Sub-leaf, CPUID level 0x0000000F:0 (edx), word 11 */ -#define X86_FEATURE_CQM_LLC (11*32+ 1) /* LLC QoS if 1 */ - -/* Intel-defined CPU QoS Sub-leaf, CPUID level 0x0000000F:1 (edx), word 12 */ -#define X86_FEATURE_CQM_OCCUP_LLC (12*32+ 0) /* LLC occupancy monitoring if 1 */ - -/* AMD-defined CPU features, CPUID level 0x80000008 (ebx), word 13 */ -#define X86_FEATURE_CLZERO (13*32+0) /* CLZERO instruction */ - -/* Thermal and Power Management Leaf, CPUID level 0x00000006 (eax), word 14 */ -#define X86_FEATURE_DTHERM (14*32+ 0) /* Digital Thermal Sensor */ -#define X86_FEATURE_IDA (14*32+ 1) /* Intel Dynamic Acceleration */ -#define X86_FEATURE_ARAT (14*32+ 2) /* Always Running APIC Timer */ -#define X86_FEATURE_PLN (14*32+ 4) /* Intel Power Limit Notification */ -#define X86_FEATURE_PTS (14*32+ 6) /* Intel Package Thermal Status */ -#define X86_FEATURE_HWP (14*32+ 7) /* Intel Hardware P-states */ -#define X86_FEATURE_HWP_NOTIFY (14*32+ 8) /* HWP Notification */ -#define X86_FEATURE_HWP_ACT_WINDOW (14*32+ 9) /* HWP Activity Window */ -#define X86_FEATURE_HWP_EPP (14*32+10) /* HWP Energy Perf. Preference */ -#define X86_FEATURE_HWP_PKG_REQ (14*32+11) /* HWP Package Level Request */ - -/* AMD SVM Feature Identification, CPUID level 0x8000000a (edx), word 15 */ -#define X86_FEATURE_NPT (15*32+ 0) /* Nested Page Table support */ -#define X86_FEATURE_LBRV (15*32+ 1) /* LBR Virtualization support */ -#define X86_FEATURE_SVML (15*32+ 2) /* "svm_lock" SVM locking MSR */ -#define X86_FEATURE_NRIPS (15*32+ 3) /* "nrip_save" SVM next_rip save */ -#define X86_FEATURE_TSCRATEMSR (15*32+ 4) /* "tsc_scale" TSC scaling support */ -#define X86_FEATURE_VMCBCLEAN (15*32+ 5) /* "vmcb_clean" VMCB clean bits support */ -#define X86_FEATURE_FLUSHBYASID (15*32+ 6) /* flush-by-ASID support */ -#define X86_FEATURE_DECODEASSISTS (15*32+ 7) /* Decode Assists support */ -#define X86_FEATURE_PAUSEFILTER (15*32+10) /* filtered pause intercept */ -#define X86_FEATURE_PFTHRESHOLD (15*32+12) /* pause filter threshold */ - -/* - * BUG word(s) - */ -#define X86_BUG(x) (NCAPINTS*32 + (x)) - -#define X86_BUG_F00F X86_BUG(0) /* Intel F00F */ -#define X86_BUG_FDIV X86_BUG(1) /* FPU FDIV */ -#define X86_BUG_COMA X86_BUG(2) /* Cyrix 6x86 coma */ -#define X86_BUG_AMD_TLB_MMATCH X86_BUG(3) /* "tlb_mmatch" AMD Erratum 383 */ -#define X86_BUG_AMD_APIC_C1E X86_BUG(4) /* "apic_c1e" AMD Erratum 400 */ -#define X86_BUG_11AP X86_BUG(5) /* Bad local APIC aka 11AP */ -#define X86_BUG_FXSAVE_LEAK X86_BUG(6) /* FXSAVE leaks FOP/FIP/FOP */ -#define X86_BUG_CLFLUSH_MONITOR X86_BUG(7) /* AAI65, CLFLUSH required before MONITOR */ -#define X86_BUG_SYSRET_SS_ATTRS X86_BUG(8) /* SYSRET doesn't fix up SS attrs */ +#include #if defined(__KERNEL__) && !defined(__ASSEMBLY__) @@ -369,8 +88,7 @@ extern const char * const x86_bug_flags[NBUGINTS*32]; * is not relevant. */ #define cpu_feature_enabled(bit) \ - (__builtin_constant_p(bit) && DISABLED_MASK_BIT_SET(bit) ? 0 : \ - cpu_has(&boot_cpu_data, bit)) + (__builtin_constant_p(bit) && DISABLED_MASK_BIT_SET(bit) ? 
0 : static_cpu_has(bit)) #define boot_cpu_has(bit) cpu_has(&boot_cpu_data, bit) @@ -406,106 +124,19 @@ extern const char * const x86_bug_flags[NBUGINTS*32]; #define cpu_has_osxsave boot_cpu_has(X86_FEATURE_OSXSAVE) #define cpu_has_hypervisor boot_cpu_has(X86_FEATURE_HYPERVISOR) /* - * Do not add any more of those clumsy macros - use static_cpu_has_safe() for + * Do not add any more of those clumsy macros - use static_cpu_has() for * fast paths and boot_cpu_has() otherwise! */ -#if __GNUC__ >= 4 && defined(CONFIG_X86_FAST_FEATURE_TESTS) -extern void warn_pre_alternatives(void); -extern bool __static_cpu_has_safe(u16 bit); - +#if defined(CC_HAVE_ASM_GOTO) && defined(CONFIG_X86_FAST_FEATURE_TESTS) /* * Static testing of CPU features. Used the same as boot_cpu_has(). - * These are only valid after alternatives have run, but will statically - * patch the target code for additional performance. + * These will statically patch the target code for additional + * performance. */ -static __always_inline __pure bool __static_cpu_has(u16 bit) +static __always_inline __pure bool _static_cpu_has(u16 bit) { -#ifdef CC_HAVE_ASM_GOTO - -#ifdef CONFIG_X86_DEBUG_STATIC_CPU_HAS - - /* - * Catch too early usage of this before alternatives - * have run. - */ - asm_volatile_goto("1: jmp %l[t_warn]\n" - "2:\n" - ".section .altinstructions,\"a\"\n" - " .long 1b - .\n" - " .long 0\n" /* no replacement */ - " .word %P0\n" /* 1: do replace */ - " .byte 2b - 1b\n" /* source len */ - " .byte 0\n" /* replacement len */ - " .byte 0\n" /* pad len */ - ".previous\n" - /* skipping size check since replacement size = 0 */ - : : "i" (X86_FEATURE_ALWAYS) : : t_warn); - -#endif - - asm_volatile_goto("1: jmp %l[t_no]\n" - "2:\n" - ".section .altinstructions,\"a\"\n" - " .long 1b - .\n" - " .long 0\n" /* no replacement */ - " .word %P0\n" /* feature bit */ - " .byte 2b - 1b\n" /* source len */ - " .byte 0\n" /* replacement len */ - " .byte 0\n" /* pad len */ - ".previous\n" - /* skipping size check since replacement size = 0 */ - : : "i" (bit) : : t_no); - return true; - t_no: - return false; - -#ifdef CONFIG_X86_DEBUG_STATIC_CPU_HAS - t_warn: - warn_pre_alternatives(); - return false; -#endif - -#else /* CC_HAVE_ASM_GOTO */ - - u8 flag; - /* Open-coded due to __stringify() in ALTERNATIVE() */ - asm volatile("1: movb $0,%0\n" - "2:\n" - ".section .altinstructions,\"a\"\n" - " .long 1b - .\n" - " .long 3f - .\n" - " .word %P1\n" /* feature bit */ - " .byte 2b - 1b\n" /* source len */ - " .byte 4f - 3f\n" /* replacement len */ - " .byte 0\n" /* pad len */ - ".previous\n" - ".section .discard,\"aw\",@progbits\n" - " .byte 0xff + (4f-3f) - (2b-1b)\n" /* size check */ - ".previous\n" - ".section .altinstr_replacement,\"ax\"\n" - "3: movb $1,%0\n" - "4:\n" - ".previous\n" - : "=qm" (flag) : "i" (bit)); - return flag; - -#endif /* CC_HAVE_ASM_GOTO */ -} - -#define static_cpu_has(bit) \ -( \ - __builtin_constant_p(boot_cpu_has(bit)) ? \ - boot_cpu_has(bit) : \ - __builtin_constant_p(bit) ? 
\ - __static_cpu_has(bit) : \ - boot_cpu_has(bit) \ -) - -static __always_inline __pure bool _static_cpu_has_safe(u16 bit) -{ -#ifdef CC_HAVE_ASM_GOTO - asm_volatile_goto("1: jmp %l[t_dynamic]\n" + asm_volatile_goto("1: jmp 6f\n" "2:\n" ".skip -(((5f-4f) - (2b-1b)) > 0) * " "((5f-4f) - (2b-1b)),0x90\n" @@ -530,66 +161,34 @@ static __always_inline __pure bool _static_cpu_has_safe(u16 bit) " .byte 0\n" /* repl len */ " .byte 0\n" /* pad len */ ".previous\n" - : : "i" (bit), "i" (X86_FEATURE_ALWAYS) - : : t_dynamic, t_no); + ".section .altinstr_aux,\"ax\"\n" + "6:\n" + " testb %[bitnum],%[cap_byte]\n" + " jnz %l[t_yes]\n" + " jmp %l[t_no]\n" + ".previous\n" + : : "i" (bit), "i" (X86_FEATURE_ALWAYS), + [bitnum] "i" (1 << (bit & 7)), + [cap_byte] "m" (((const char *)boot_cpu_data.x86_capability)[bit >> 3]) + : : t_yes, t_no); + t_yes: return true; t_no: return false; - t_dynamic: - return __static_cpu_has_safe(bit); -#else - u8 flag; - /* Open-coded due to __stringify() in ALTERNATIVE() */ - asm volatile("1: movb $2,%0\n" - "2:\n" - ".section .altinstructions,\"a\"\n" - " .long 1b - .\n" /* src offset */ - " .long 3f - .\n" /* repl offset */ - " .word %P2\n" /* always replace */ - " .byte 2b - 1b\n" /* source len */ - " .byte 4f - 3f\n" /* replacement len */ - " .byte 0\n" /* pad len */ - ".previous\n" - ".section .discard,\"aw\",@progbits\n" - " .byte 0xff + (4f-3f) - (2b-1b)\n" /* size check */ - ".previous\n" - ".section .altinstr_replacement,\"ax\"\n" - "3: movb $0,%0\n" - "4:\n" - ".previous\n" - ".section .altinstructions,\"a\"\n" - " .long 1b - .\n" /* src offset */ - " .long 5f - .\n" /* repl offset */ - " .word %P1\n" /* feature bit */ - " .byte 4b - 3b\n" /* src len */ - " .byte 6f - 5f\n" /* repl len */ - " .byte 0\n" /* pad len */ - ".previous\n" - ".section .discard,\"aw\",@progbits\n" - " .byte 0xff + (6f-5f) - (4b-3b)\n" /* size check */ - ".previous\n" - ".section .altinstr_replacement,\"ax\"\n" - "5: movb $1,%0\n" - "6:\n" - ".previous\n" - : "=qm" (flag) - : "i" (bit), "i" (X86_FEATURE_ALWAYS)); - return (flag == 2 ? __static_cpu_has_safe(bit) : flag); -#endif /* CC_HAVE_ASM_GOTO */ } -#define static_cpu_has_safe(bit) \ +#define static_cpu_has(bit) \ ( \ __builtin_constant_p(boot_cpu_has(bit)) ? \ boot_cpu_has(bit) : \ - _static_cpu_has_safe(bit) \ + _static_cpu_has(bit) \ ) #else /* - * gcc 3.x is too stupid to do the static test; fall back to dynamic. + * Fall back to dynamic for gcc versions which don't support asm goto. Should be + * a minority now anyway. 
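+ *
+ * Either way the calling convention is identical. A hedged usage sketch
+ * (the two helpers are placeholders, not symbols from this patch):
+ *
+ *	if (static_cpu_has(X86_FEATURE_XSAVE))
+ *		do_xsave_path();
+ *	else
+ *		do_legacy_path();
+ *
+ * With asm goto the initial JMP lands on a dynamic TESTB in .altinstr_aux,
+ * so the check already works before alternatives run; patching then turns
+ * the JMP into NOPs (feature present) or a direct branch to the else path.
+ * With this fallback the whole thing simply degrades to boot_cpu_has().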
*/ #define static_cpu_has(bit) boot_cpu_has(bit) -#define static_cpu_has_safe(bit) boot_cpu_has(bit) #endif #define cpu_has_bug(c, bit) cpu_has(c, (bit)) @@ -597,7 +196,6 @@ static __always_inline __pure bool _static_cpu_has_safe(u16 bit) #define clear_cpu_bug(c, bit) clear_cpu_cap(c, (bit)) #define static_cpu_has_bug(bit) static_cpu_has((bit)) -#define static_cpu_has_bug_safe(bit) static_cpu_has_safe((bit)) #define boot_cpu_has_bug(bit) cpu_has_bug(&boot_cpu_data, (bit)) #define MAX_CPU_FEATURES (NCAPINTS * 32) diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h new file mode 100644 index 000000000000..074b7604bd51 --- /dev/null +++ b/arch/x86/include/asm/cpufeatures.h @@ -0,0 +1,300 @@ +#ifndef _ASM_X86_CPUFEATURES_H +#define _ASM_X86_CPUFEATURES_H + +#ifndef _ASM_X86_REQUIRED_FEATURES_H +#include +#endif + +#ifndef _ASM_X86_DISABLED_FEATURES_H +#include +#endif + +/* + * Defines x86 CPU feature bits + */ +#define NCAPINTS 16 /* N 32-bit words worth of info */ +#define NBUGINTS 1 /* N 32-bit bug flags */ + +/* + * Note: If the comment begins with a quoted string, that string is used + * in /proc/cpuinfo instead of the macro name. If the string is "", + * this feature bit is not displayed in /proc/cpuinfo at all. + */ + +/* Intel-defined CPU features, CPUID level 0x00000001 (edx), word 0 */ +#define X86_FEATURE_FPU ( 0*32+ 0) /* Onboard FPU */ +#define X86_FEATURE_VME ( 0*32+ 1) /* Virtual Mode Extensions */ +#define X86_FEATURE_DE ( 0*32+ 2) /* Debugging Extensions */ +#define X86_FEATURE_PSE ( 0*32+ 3) /* Page Size Extensions */ +#define X86_FEATURE_TSC ( 0*32+ 4) /* Time Stamp Counter */ +#define X86_FEATURE_MSR ( 0*32+ 5) /* Model-Specific Registers */ +#define X86_FEATURE_PAE ( 0*32+ 6) /* Physical Address Extensions */ +#define X86_FEATURE_MCE ( 0*32+ 7) /* Machine Check Exception */ +#define X86_FEATURE_CX8 ( 0*32+ 8) /* CMPXCHG8 instruction */ +#define X86_FEATURE_APIC ( 0*32+ 9) /* Onboard APIC */ +#define X86_FEATURE_SEP ( 0*32+11) /* SYSENTER/SYSEXIT */ +#define X86_FEATURE_MTRR ( 0*32+12) /* Memory Type Range Registers */ +#define X86_FEATURE_PGE ( 0*32+13) /* Page Global Enable */ +#define X86_FEATURE_MCA ( 0*32+14) /* Machine Check Architecture */ +#define X86_FEATURE_CMOV ( 0*32+15) /* CMOV instructions */ + /* (plus FCMOVcc, FCOMI with FPU) */ +#define X86_FEATURE_PAT ( 0*32+16) /* Page Attribute Table */ +#define X86_FEATURE_PSE36 ( 0*32+17) /* 36-bit PSEs */ +#define X86_FEATURE_PN ( 0*32+18) /* Processor serial number */ +#define X86_FEATURE_CLFLUSH ( 0*32+19) /* CLFLUSH instruction */ +#define X86_FEATURE_DS ( 0*32+21) /* "dts" Debug Store */ +#define X86_FEATURE_ACPI ( 0*32+22) /* ACPI via MSR */ +#define X86_FEATURE_MMX ( 0*32+23) /* Multimedia Extensions */ +#define X86_FEATURE_FXSR ( 0*32+24) /* FXSAVE/FXRSTOR, CR4.OSFXSR */ +#define X86_FEATURE_XMM ( 0*32+25) /* "sse" */ +#define X86_FEATURE_XMM2 ( 0*32+26) /* "sse2" */ +#define X86_FEATURE_SELFSNOOP ( 0*32+27) /* "ss" CPU self snoop */ +#define X86_FEATURE_HT ( 0*32+28) /* Hyper-Threading */ +#define X86_FEATURE_ACC ( 0*32+29) /* "tm" Automatic clock control */ +#define X86_FEATURE_IA64 ( 0*32+30) /* IA-64 processor */ +#define X86_FEATURE_PBE ( 0*32+31) /* Pending Break Enable */ + +/* AMD-defined CPU features, CPUID level 0x80000001, word 1 */ +/* Don't duplicate feature flags which are redundant with Intel! */ +#define X86_FEATURE_SYSCALL ( 1*32+11) /* SYSCALL/SYSRET */ +#define X86_FEATURE_MP ( 1*32+19) /* MP Capable. 
*/ +#define X86_FEATURE_NX ( 1*32+20) /* Execute Disable */ +#define X86_FEATURE_MMXEXT ( 1*32+22) /* AMD MMX extensions */ +#define X86_FEATURE_FXSR_OPT ( 1*32+25) /* FXSAVE/FXRSTOR optimizations */ +#define X86_FEATURE_GBPAGES ( 1*32+26) /* "pdpe1gb" GB pages */ +#define X86_FEATURE_RDTSCP ( 1*32+27) /* RDTSCP */ +#define X86_FEATURE_LM ( 1*32+29) /* Long Mode (x86-64) */ +#define X86_FEATURE_3DNOWEXT ( 1*32+30) /* AMD 3DNow! extensions */ +#define X86_FEATURE_3DNOW ( 1*32+31) /* 3DNow! */ + +/* Transmeta-defined CPU features, CPUID level 0x80860001, word 2 */ +#define X86_FEATURE_RECOVERY ( 2*32+ 0) /* CPU in recovery mode */ +#define X86_FEATURE_LONGRUN ( 2*32+ 1) /* Longrun power control */ +#define X86_FEATURE_LRTI ( 2*32+ 3) /* LongRun table interface */ + +/* Other features, Linux-defined mapping, word 3 */ +/* This range is used for feature bits which conflict or are synthesized */ +#define X86_FEATURE_CXMMX ( 3*32+ 0) /* Cyrix MMX extensions */ +#define X86_FEATURE_K6_MTRR ( 3*32+ 1) /* AMD K6 nonstandard MTRRs */ +#define X86_FEATURE_CYRIX_ARR ( 3*32+ 2) /* Cyrix ARRs (= MTRRs) */ +#define X86_FEATURE_CENTAUR_MCR ( 3*32+ 3) /* Centaur MCRs (= MTRRs) */ +/* cpu types for specific tunings: */ +#define X86_FEATURE_K8 ( 3*32+ 4) /* "" Opteron, Athlon64 */ +#define X86_FEATURE_K7 ( 3*32+ 5) /* "" Athlon */ +#define X86_FEATURE_P3 ( 3*32+ 6) /* "" P3 */ +#define X86_FEATURE_P4 ( 3*32+ 7) /* "" P4 */ +#define X86_FEATURE_CONSTANT_TSC ( 3*32+ 8) /* TSC ticks at a constant rate */ +#define X86_FEATURE_UP ( 3*32+ 9) /* smp kernel running on up */ +#define X86_FEATURE_ART ( 3*32+10) /* Platform has always running timer (ART) */ +#define X86_FEATURE_ARCH_PERFMON ( 3*32+11) /* Intel Architectural PerfMon */ +#define X86_FEATURE_PEBS ( 3*32+12) /* Precise-Event Based Sampling */ +#define X86_FEATURE_BTS ( 3*32+13) /* Branch Trace Store */ +#define X86_FEATURE_SYSCALL32 ( 3*32+14) /* "" syscall in ia32 userspace */ +#define X86_FEATURE_SYSENTER32 ( 3*32+15) /* "" sysenter in ia32 userspace */ +#define X86_FEATURE_REP_GOOD ( 3*32+16) /* rep microcode works well */ +#define X86_FEATURE_MFENCE_RDTSC ( 3*32+17) /* "" Mfence synchronizes RDTSC */ +#define X86_FEATURE_LFENCE_RDTSC ( 3*32+18) /* "" Lfence synchronizes RDTSC */ +/* free, was #define X86_FEATURE_11AP ( 3*32+19) * "" Bad local APIC aka 11AP */ +#define X86_FEATURE_NOPL ( 3*32+20) /* The NOPL (0F 1F) instructions */ +#define X86_FEATURE_ALWAYS ( 3*32+21) /* "" Always-present feature */ +#define X86_FEATURE_XTOPOLOGY ( 3*32+22) /* cpu topology enum extensions */ +#define X86_FEATURE_TSC_RELIABLE ( 3*32+23) /* TSC is known to be reliable */ +#define X86_FEATURE_NONSTOP_TSC ( 3*32+24) /* TSC does not stop in C states */ +/* free, was #define X86_FEATURE_CLFLUSH_MONITOR ( 3*32+25) * "" clflush reqd with monitor */ +#define X86_FEATURE_EXTD_APICID ( 3*32+26) /* has extended APICID (8 bits) */ +#define X86_FEATURE_AMD_DCM ( 3*32+27) /* multi-node processor */ +#define X86_FEATURE_APERFMPERF ( 3*32+28) /* APERFMPERF */ +#define X86_FEATURE_EAGER_FPU ( 3*32+29) /* "eagerfpu" Non lazy FPU restore */ +#define X86_FEATURE_NONSTOP_TSC_S3 ( 3*32+30) /* TSC doesn't stop in S3 state */ +#define X86_FEATURE_MCE_RECOVERY ( 3*32+31) /* cpu has recoverable machine checks */ + +/* Intel-defined CPU features, CPUID level 0x00000001 (ecx), word 4 */ +#define X86_FEATURE_XMM3 ( 4*32+ 0) /* "pni" SSE-3 */ +#define X86_FEATURE_PCLMULQDQ ( 4*32+ 1) /* PCLMULQDQ instruction */ +#define X86_FEATURE_DTES64 ( 4*32+ 2) /* 64-bit Debug Store */ +#define 
X86_FEATURE_MWAIT ( 4*32+ 3) /* "monitor" Monitor/Mwait support */ +#define X86_FEATURE_DSCPL ( 4*32+ 4) /* "ds_cpl" CPL Qual. Debug Store */ +#define X86_FEATURE_VMX ( 4*32+ 5) /* Hardware virtualization */ +#define X86_FEATURE_SMX ( 4*32+ 6) /* Safer mode */ +#define X86_FEATURE_EST ( 4*32+ 7) /* Enhanced SpeedStep */ +#define X86_FEATURE_TM2 ( 4*32+ 8) /* Thermal Monitor 2 */ +#define X86_FEATURE_SSSE3 ( 4*32+ 9) /* Supplemental SSE-3 */ +#define X86_FEATURE_CID ( 4*32+10) /* Context ID */ +#define X86_FEATURE_SDBG ( 4*32+11) /* Silicon Debug */ +#define X86_FEATURE_FMA ( 4*32+12) /* Fused multiply-add */ +#define X86_FEATURE_CX16 ( 4*32+13) /* CMPXCHG16B */ +#define X86_FEATURE_XTPR ( 4*32+14) /* Send Task Priority Messages */ +#define X86_FEATURE_PDCM ( 4*32+15) /* Performance Capabilities */ +#define X86_FEATURE_PCID ( 4*32+17) /* Process Context Identifiers */ +#define X86_FEATURE_DCA ( 4*32+18) /* Direct Cache Access */ +#define X86_FEATURE_XMM4_1 ( 4*32+19) /* "sse4_1" SSE-4.1 */ +#define X86_FEATURE_XMM4_2 ( 4*32+20) /* "sse4_2" SSE-4.2 */ +#define X86_FEATURE_X2APIC ( 4*32+21) /* x2APIC */ +#define X86_FEATURE_MOVBE ( 4*32+22) /* MOVBE instruction */ +#define X86_FEATURE_POPCNT ( 4*32+23) /* POPCNT instruction */ +#define X86_FEATURE_TSC_DEADLINE_TIMER ( 4*32+24) /* Tsc deadline timer */ +#define X86_FEATURE_AES ( 4*32+25) /* AES instructions */ +#define X86_FEATURE_XSAVE ( 4*32+26) /* XSAVE/XRSTOR/XSETBV/XGETBV */ +#define X86_FEATURE_OSXSAVE ( 4*32+27) /* "" XSAVE enabled in the OS */ +#define X86_FEATURE_AVX ( 4*32+28) /* Advanced Vector Extensions */ +#define X86_FEATURE_F16C ( 4*32+29) /* 16-bit fp conversions */ +#define X86_FEATURE_RDRAND ( 4*32+30) /* The RDRAND instruction */ +#define X86_FEATURE_HYPERVISOR ( 4*32+31) /* Running on a hypervisor */ + +/* VIA/Cyrix/Centaur-defined CPU features, CPUID level 0xC0000001, word 5 */ +#define X86_FEATURE_XSTORE ( 5*32+ 2) /* "rng" RNG present (xstore) */ +#define X86_FEATURE_XSTORE_EN ( 5*32+ 3) /* "rng_en" RNG enabled */ +#define X86_FEATURE_XCRYPT ( 5*32+ 6) /* "ace" on-CPU crypto (xcrypt) */ +#define X86_FEATURE_XCRYPT_EN ( 5*32+ 7) /* "ace_en" on-CPU crypto enabled */ +#define X86_FEATURE_ACE2 ( 5*32+ 8) /* Advanced Cryptography Engine v2 */ +#define X86_FEATURE_ACE2_EN ( 5*32+ 9) /* ACE v2 enabled */ +#define X86_FEATURE_PHE ( 5*32+10) /* PadLock Hash Engine */ +#define X86_FEATURE_PHE_EN ( 5*32+11) /* PHE enabled */ +#define X86_FEATURE_PMM ( 5*32+12) /* PadLock Montgomery Multiplier */ +#define X86_FEATURE_PMM_EN ( 5*32+13) /* PMM enabled */ + +/* More extended AMD flags: CPUID level 0x80000001, ecx, word 6 */ +#define X86_FEATURE_LAHF_LM ( 6*32+ 0) /* LAHF/SAHF in long mode */ +#define X86_FEATURE_CMP_LEGACY ( 6*32+ 1) /* If yes HyperThreading not valid */ +#define X86_FEATURE_SVM ( 6*32+ 2) /* Secure virtual machine */ +#define X86_FEATURE_EXTAPIC ( 6*32+ 3) /* Extended APIC space */ +#define X86_FEATURE_CR8_LEGACY ( 6*32+ 4) /* CR8 in 32-bit mode */ +#define X86_FEATURE_ABM ( 6*32+ 5) /* Advanced bit manipulation */ +#define X86_FEATURE_SSE4A ( 6*32+ 6) /* SSE-4A */ +#define X86_FEATURE_MISALIGNSSE ( 6*32+ 7) /* Misaligned SSE mode */ +#define X86_FEATURE_3DNOWPREFETCH ( 6*32+ 8) /* 3DNow prefetch instructions */ +#define X86_FEATURE_OSVW ( 6*32+ 9) /* OS Visible Workaround */ +#define X86_FEATURE_IBS ( 6*32+10) /* Instruction Based Sampling */ +#define X86_FEATURE_XOP ( 6*32+11) /* extended AVX instructions */ +#define X86_FEATURE_SKINIT ( 6*32+12) /* SKINIT/STGI instructions */ +#define X86_FEATURE_WDT ( 6*32+13) /* 
Watchdog timer */ +#define X86_FEATURE_LWP ( 6*32+15) /* Light Weight Profiling */ +#define X86_FEATURE_FMA4 ( 6*32+16) /* 4 operands MAC instructions */ +#define X86_FEATURE_TCE ( 6*32+17) /* translation cache extension */ +#define X86_FEATURE_NODEID_MSR ( 6*32+19) /* NodeId MSR */ +#define X86_FEATURE_TBM ( 6*32+21) /* trailing bit manipulations */ +#define X86_FEATURE_TOPOEXT ( 6*32+22) /* topology extensions CPUID leafs */ +#define X86_FEATURE_PERFCTR_CORE ( 6*32+23) /* core performance counter extensions */ +#define X86_FEATURE_PERFCTR_NB ( 6*32+24) /* NB performance counter extensions */ +#define X86_FEATURE_BPEXT (6*32+26) /* data breakpoint extension */ +#define X86_FEATURE_PERFCTR_L2 ( 6*32+28) /* L2 performance counter extensions */ +#define X86_FEATURE_MWAITX ( 6*32+29) /* MWAIT extension (MONITORX/MWAITX) */ + +/* + * Auxiliary flags: Linux defined - For features scattered in various + * CPUID levels like 0x6, 0xA etc, word 7. + * + * Reuse free bits when adding new feature flags! + */ + +#define X86_FEATURE_CPB ( 7*32+ 2) /* AMD Core Performance Boost */ +#define X86_FEATURE_EPB ( 7*32+ 3) /* IA32_ENERGY_PERF_BIAS support */ + +#define X86_FEATURE_HW_PSTATE ( 7*32+ 8) /* AMD HW-PState */ +#define X86_FEATURE_PROC_FEEDBACK ( 7*32+ 9) /* AMD ProcFeedbackInterface */ + +#define X86_FEATURE_INTEL_PT ( 7*32+15) /* Intel Processor Trace */ + +/* Virtualization flags: Linux defined, word 8 */ +#define X86_FEATURE_TPR_SHADOW ( 8*32+ 0) /* Intel TPR Shadow */ +#define X86_FEATURE_VNMI ( 8*32+ 1) /* Intel Virtual NMI */ +#define X86_FEATURE_FLEXPRIORITY ( 8*32+ 2) /* Intel FlexPriority */ +#define X86_FEATURE_EPT ( 8*32+ 3) /* Intel Extended Page Table */ +#define X86_FEATURE_VPID ( 8*32+ 4) /* Intel Virtual Processor ID */ + +#define X86_FEATURE_VMMCALL ( 8*32+15) /* Prefer vmmcall to vmcall */ +#define X86_FEATURE_XENPV ( 8*32+16) /* "" Xen paravirtual guest */ + + +/* Intel-defined CPU features, CPUID level 0x00000007:0 (ebx), word 9 */ +#define X86_FEATURE_FSGSBASE ( 9*32+ 0) /* {RD/WR}{FS/GS}BASE instructions*/ +#define X86_FEATURE_TSC_ADJUST ( 9*32+ 1) /* TSC adjustment MSR 0x3b */ +#define X86_FEATURE_BMI1 ( 9*32+ 3) /* 1st group bit manipulation extensions */ +#define X86_FEATURE_HLE ( 9*32+ 4) /* Hardware Lock Elision */ +#define X86_FEATURE_AVX2 ( 9*32+ 5) /* AVX2 instructions */ +#define X86_FEATURE_SMEP ( 9*32+ 7) /* Supervisor Mode Execution Protection */ +#define X86_FEATURE_BMI2 ( 9*32+ 8) /* 2nd group bit manipulation extensions */ +#define X86_FEATURE_ERMS ( 9*32+ 9) /* Enhanced REP MOVSB/STOSB */ +#define X86_FEATURE_INVPCID ( 9*32+10) /* Invalidate Processor Context ID */ +#define X86_FEATURE_RTM ( 9*32+11) /* Restricted Transactional Memory */ +#define X86_FEATURE_CQM ( 9*32+12) /* Cache QoS Monitoring */ +#define X86_FEATURE_MPX ( 9*32+14) /* Memory Protection Extension */ +#define X86_FEATURE_AVX512F ( 9*32+16) /* AVX-512 Foundation */ +#define X86_FEATURE_AVX512DQ ( 9*32+17) /* AVX-512 DQ (Double/Quad granular) Instructions */ +#define X86_FEATURE_RDSEED ( 9*32+18) /* The RDSEED instruction */ +#define X86_FEATURE_ADX ( 9*32+19) /* The ADCX and ADOX instructions */ +#define X86_FEATURE_SMAP ( 9*32+20) /* Supervisor Mode Access Prevention */ +#define X86_FEATURE_PCOMMIT ( 9*32+22) /* PCOMMIT instruction */ +#define X86_FEATURE_CLFLUSHOPT ( 9*32+23) /* CLFLUSHOPT instruction */ +#define X86_FEATURE_CLWB ( 9*32+24) /* CLWB instruction */ +#define X86_FEATURE_AVX512PF ( 9*32+26) /* AVX-512 Prefetch */ +#define X86_FEATURE_AVX512ER ( 9*32+27) /* AVX-512 Exponential and 
Reciprocal */ +#define X86_FEATURE_AVX512CD ( 9*32+28) /* AVX-512 Conflict Detection */ +#define X86_FEATURE_SHA_NI ( 9*32+29) /* SHA1/SHA256 Instruction Extensions */ +#define X86_FEATURE_AVX512BW ( 9*32+30) /* AVX-512 BW (Byte/Word granular) Instructions */ +#define X86_FEATURE_AVX512VL ( 9*32+31) /* AVX-512 VL (128/256 Vector Length) Extensions */ + +/* Extended state features, CPUID level 0x0000000d:1 (eax), word 10 */ +#define X86_FEATURE_XSAVEOPT (10*32+ 0) /* XSAVEOPT */ +#define X86_FEATURE_XSAVEC (10*32+ 1) /* XSAVEC */ +#define X86_FEATURE_XGETBV1 (10*32+ 2) /* XGETBV with ECX = 1 */ +#define X86_FEATURE_XSAVES (10*32+ 3) /* XSAVES/XRSTORS */ + +/* Intel-defined CPU QoS Sub-leaf, CPUID level 0x0000000F:0 (edx), word 11 */ +#define X86_FEATURE_CQM_LLC (11*32+ 1) /* LLC QoS if 1 */ + +/* Intel-defined CPU QoS Sub-leaf, CPUID level 0x0000000F:1 (edx), word 12 */ +#define X86_FEATURE_CQM_OCCUP_LLC (12*32+ 0) /* LLC occupancy monitoring if 1 */ + +/* AMD-defined CPU features, CPUID level 0x80000008 (ebx), word 13 */ +#define X86_FEATURE_CLZERO (13*32+0) /* CLZERO instruction */ + +/* Thermal and Power Management Leaf, CPUID level 0x00000006 (eax), word 14 */ +#define X86_FEATURE_DTHERM (14*32+ 0) /* Digital Thermal Sensor */ +#define X86_FEATURE_IDA (14*32+ 1) /* Intel Dynamic Acceleration */ +#define X86_FEATURE_ARAT (14*32+ 2) /* Always Running APIC Timer */ +#define X86_FEATURE_PLN (14*32+ 4) /* Intel Power Limit Notification */ +#define X86_FEATURE_PTS (14*32+ 6) /* Intel Package Thermal Status */ +#define X86_FEATURE_HWP (14*32+ 7) /* Intel Hardware P-states */ +#define X86_FEATURE_HWP_NOTIFY (14*32+ 8) /* HWP Notification */ +#define X86_FEATURE_HWP_ACT_WINDOW (14*32+ 9) /* HWP Activity Window */ +#define X86_FEATURE_HWP_EPP (14*32+10) /* HWP Energy Perf. Preference */ +#define X86_FEATURE_HWP_PKG_REQ (14*32+11) /* HWP Package Level Request */ + +/* AMD SVM Feature Identification, CPUID level 0x8000000a (edx), word 15 */ +#define X86_FEATURE_NPT (15*32+ 0) /* Nested Page Table support */ +#define X86_FEATURE_LBRV (15*32+ 1) /* LBR Virtualization support */ +#define X86_FEATURE_SVML (15*32+ 2) /* "svm_lock" SVM locking MSR */ +#define X86_FEATURE_NRIPS (15*32+ 3) /* "nrip_save" SVM next_rip save */ +#define X86_FEATURE_TSCRATEMSR (15*32+ 4) /* "tsc_scale" TSC scaling support */ +#define X86_FEATURE_VMCBCLEAN (15*32+ 5) /* "vmcb_clean" VMCB clean bits support */ +#define X86_FEATURE_FLUSHBYASID (15*32+ 6) /* flush-by-ASID support */ +#define X86_FEATURE_DECODEASSISTS (15*32+ 7) /* Decode Assists support */ +#define X86_FEATURE_PAUSEFILTER (15*32+10) /* filtered pause intercept */ +#define X86_FEATURE_PFTHRESHOLD (15*32+12) /* pause filter threshold */ +#define X86_FEATURE_AVIC (15*32+13) /* Virtual Interrupt Controller */ + +/* + * BUG word(s) + */ +#define X86_BUG(x) (NCAPINTS*32 + (x)) + +#define X86_BUG_F00F X86_BUG(0) /* Intel F00F */ +#define X86_BUG_FDIV X86_BUG(1) /* FPU FDIV */ +#define X86_BUG_COMA X86_BUG(2) /* Cyrix 6x86 coma */ +#define X86_BUG_AMD_TLB_MMATCH X86_BUG(3) /* "tlb_mmatch" AMD Erratum 383 */ +#define X86_BUG_AMD_APIC_C1E X86_BUG(4) /* "apic_c1e" AMD Erratum 400 */ +#define X86_BUG_11AP X86_BUG(5) /* Bad local APIC aka 11AP */ +#define X86_BUG_FXSAVE_LEAK X86_BUG(6) /* FXSAVE leaks FOP/FIP/FOP */ +#define X86_BUG_CLFLUSH_MONITOR X86_BUG(7) /* AAI65, CLFLUSH required before MONITOR */ +#define X86_BUG_SYSRET_SS_ATTRS X86_BUG(8) /* SYSRET doesn't fix up SS attrs */ + +#ifdef CONFIG_X86_32 +/* + * 64-bit kernels don't use X86_BUG_ESPFIX. 
Make the define conditional + * to avoid confusion. + */ +#define X86_BUG_ESPFIX X86_BUG(9) /* "" IRET to 16-bit SS corrupts ESP/RSP high bits */ +#endif + +#endif /* _ASM_X86_CPUFEATURES_H */ diff --git a/arch/x86/include/asm/desc_defs.h b/arch/x86/include/asm/desc_defs.h index 278441f39856..eb5deb42484d 100644 --- a/arch/x86/include/asm/desc_defs.h +++ b/arch/x86/include/asm/desc_defs.h @@ -98,4 +98,27 @@ struct desc_ptr { #endif /* !__ASSEMBLY__ */ +/* Access rights as returned by LAR */ +#define AR_TYPE_RODATA (0 * (1 << 9)) +#define AR_TYPE_RWDATA (1 * (1 << 9)) +#define AR_TYPE_RODATA_EXPDOWN (2 * (1 << 9)) +#define AR_TYPE_RWDATA_EXPDOWN (3 * (1 << 9)) +#define AR_TYPE_XOCODE (4 * (1 << 9)) +#define AR_TYPE_XRCODE (5 * (1 << 9)) +#define AR_TYPE_XOCODE_CONF (6 * (1 << 9)) +#define AR_TYPE_XRCODE_CONF (7 * (1 << 9)) +#define AR_TYPE_MASK (7 * (1 << 9)) + +#define AR_DPL0 (0 * (1 << 13)) +#define AR_DPL3 (3 * (1 << 13)) +#define AR_DPL_MASK (3 * (1 << 13)) + +#define AR_A (1 << 8) /* "Accessed" */ +#define AR_S (1 << 12) /* If clear, "System" segment */ +#define AR_P (1 << 15) /* "Present" */ +#define AR_AVL (1 << 20) /* "AVaiLable" (no HW effect) */ +#define AR_L (1 << 21) /* "Long mode" for code segments */ +#define AR_DB (1 << 22) /* D/B, effect depends on type */ +#define AR_G (1 << 23) /* "Granularity" (limit in pages) */ + #endif /* _ASM_X86_DESC_DEFS_H */ diff --git a/arch/x86/include/asm/dmi.h b/arch/x86/include/asm/dmi.h index 535192f6bfad..3c69fed215c5 100644 --- a/arch/x86/include/asm/dmi.h +++ b/arch/x86/include/asm/dmi.h @@ -15,7 +15,7 @@ static __always_inline __init void *dmi_alloc(unsigned len) /* Use early IO mappings for DMI because it's initialized early */ #define dmi_early_remap early_ioremap #define dmi_early_unmap early_iounmap -#define dmi_remap ioremap +#define dmi_remap ioremap_cache #define dmi_unmap iounmap #endif /* _ASM_X86_DMI_H */ diff --git a/arch/x86/include/asm/fixmap.h b/arch/x86/include/asm/fixmap.h index 6d7d0e52ed5a..8554f960e21b 100644 --- a/arch/x86/include/asm/fixmap.h +++ b/arch/x86/include/asm/fixmap.h @@ -138,7 +138,7 @@ extern void reserve_top_address(unsigned long reserve); extern int fixmaps_set; extern pte_t *kmap_pte; -extern pgprot_t kmap_prot; +#define kmap_prot PAGE_KERNEL extern pte_t *pkmap_page_table; void __native_set_fixmap(enum fixed_addresses idx, pte_t pte); diff --git a/arch/x86/include/asm/fpu/internal.h b/arch/x86/include/asm/fpu/internal.h index 0fd440df63f1..a2124343edf5 100644 --- a/arch/x86/include/asm/fpu/internal.h +++ b/arch/x86/include/asm/fpu/internal.h @@ -17,6 +17,7 @@ #include #include #include +#include /* * High level FPU state handling functions: @@ -58,22 +59,22 @@ extern u64 fpu__get_supported_xfeatures_mask(void); */ static __always_inline __pure bool use_eager_fpu(void) { - return static_cpu_has_safe(X86_FEATURE_EAGER_FPU); + return static_cpu_has(X86_FEATURE_EAGER_FPU); } static __always_inline __pure bool use_xsaveopt(void) { - return static_cpu_has_safe(X86_FEATURE_XSAVEOPT); + return static_cpu_has(X86_FEATURE_XSAVEOPT); } static __always_inline __pure bool use_xsave(void) { - return static_cpu_has_safe(X86_FEATURE_XSAVE); + return static_cpu_has(X86_FEATURE_XSAVE); } static __always_inline __pure bool use_fxsr(void) { - return static_cpu_has_safe(X86_FEATURE_FXSR); + return static_cpu_has(X86_FEATURE_FXSR); } /* @@ -300,7 +301,7 @@ static inline void copy_xregs_to_kernel_booting(struct xregs_state *xstate) WARN_ON(system_state != SYSTEM_BOOTING); - if (static_cpu_has_safe(X86_FEATURE_XSAVES)) + if 
(static_cpu_has(X86_FEATURE_XSAVES)) XSTATE_OP(XSAVES, xstate, lmask, hmask, err); else XSTATE_OP(XSAVE, xstate, lmask, hmask, err); @@ -322,7 +323,7 @@ static inline void copy_kernel_to_xregs_booting(struct xregs_state *xstate) WARN_ON(system_state != SYSTEM_BOOTING); - if (static_cpu_has_safe(X86_FEATURE_XSAVES)) + if (static_cpu_has(X86_FEATURE_XSAVES)) XSTATE_OP(XRSTORS, xstate, lmask, hmask, err); else XSTATE_OP(XRSTOR, xstate, lmask, hmask, err); @@ -460,7 +461,7 @@ static inline void copy_kernel_to_fpregs(union fpregs_state *fpstate) * pending. Clear the x87 state here by setting it to fixed values. * "m" is a random variable that should be in L1. */ - if (unlikely(static_cpu_has_bug_safe(X86_BUG_FXSAVE_LEAK))) { + if (unlikely(static_cpu_has_bug(X86_BUG_FXSAVE_LEAK))) { asm volatile( "fnclex\n\t" "emms\n\t" @@ -589,7 +590,8 @@ switch_fpu_prepare(struct fpu *old_fpu, struct fpu *new_fpu, int cpu) * If the task has used the math, pre-load the FPU on xsave processors * or if the past 5 consecutive context-switches used math. */ - fpu.preload = new_fpu->fpstate_active && + fpu.preload = static_cpu_has(X86_FEATURE_FPU) && + new_fpu->fpstate_active && (use_eager_fpu() || new_fpu->counter > 5); if (old_fpu->fpregs_active) { diff --git a/arch/x86/include/asm/frame.h b/arch/x86/include/asm/frame.h index 793179cf8e21..6e4d170726b7 100644 --- a/arch/x86/include/asm/frame.h +++ b/arch/x86/include/asm/frame.h @@ -1,23 +1,44 @@ -#ifdef __ASSEMBLY__ +#ifndef _ASM_X86_FRAME_H +#define _ASM_X86_FRAME_H #include -/* The annotation hides the frame from the unwinder and makes it look - like a ordinary ebp save/restore. This avoids some special cases for - frame pointer later */ -#ifdef CONFIG_FRAME_POINTER - .macro FRAME - __ASM_SIZE(push,) %__ASM_REG(bp) - __ASM_SIZE(mov) %__ASM_REG(sp), %__ASM_REG(bp) - .endm - .macro ENDFRAME - __ASM_SIZE(pop,) %__ASM_REG(bp) - .endm -#else - .macro FRAME - .endm - .macro ENDFRAME - .endm -#endif +/* + * These are stack frame creation macros. They should be used by every + * callable non-leaf asm function to make kernel stack traces more reliable. + */ -#endif /* __ASSEMBLY__ */ +#ifdef CONFIG_FRAME_POINTER + +#ifdef __ASSEMBLY__ + +.macro FRAME_BEGIN + push %_ASM_BP + _ASM_MOV %_ASM_SP, %_ASM_BP +.endm + +.macro FRAME_END + pop %_ASM_BP +.endm + +#else /* !__ASSEMBLY__ */ + +#define FRAME_BEGIN \ + "push %" _ASM_BP "\n" \ + _ASM_MOV "%" _ASM_SP ", %" _ASM_BP "\n" + +#define FRAME_END "pop %" _ASM_BP "\n" + +#endif /* __ASSEMBLY__ */ + +#define FRAME_OFFSET __ASM_SEL(4, 8) + +#else /* !CONFIG_FRAME_POINTER */ + +#define FRAME_BEGIN +#define FRAME_END +#define FRAME_OFFSET 0 + +#endif /* CONFIG_FRAME_POINTER */ + +#endif /* _ASM_X86_FRAME_H */ diff --git a/arch/x86/include/asm/imr.h b/arch/x86/include/asm/imr.h index cd2ce4068441..ebea2c9d2cdc 100644 --- a/arch/x86/include/asm/imr.h +++ b/arch/x86/include/asm/imr.h @@ -53,7 +53,7 @@ #define IMR_MASK (IMR_ALIGN - 1) int imr_add_range(phys_addr_t base, size_t size, - unsigned int rmask, unsigned int wmask, bool lock); + unsigned int rmask, unsigned int wmask); int imr_remove_range(phys_addr_t base, size_t size); diff --git a/arch/x86/include/asm/ipi.h b/arch/x86/include/asm/ipi.h index cfc9a0d2d07c..a4fe16e42b7b 100644 --- a/arch/x86/include/asm/ipi.h +++ b/arch/x86/include/asm/ipi.h @@ -57,67 +57,13 @@ static inline void __xapic_wait_icr_idle(void) cpu_relax(); } -static inline void -__default_send_IPI_shortcut(unsigned int shortcut, int vector, unsigned int dest) -{ - /* - * Subtle. 
In the case of the 'never do double writes' workaround - * we have to lock out interrupts to be safe. As we don't care - * of the value read we use an atomic rmw access to avoid costly - * cli/sti. Otherwise we use an even cheaper single atomic write - * to the APIC. - */ - unsigned int cfg; - - /* - * Wait for idle. - */ - __xapic_wait_icr_idle(); - - /* - * No need to touch the target chip field - */ - cfg = __prepare_ICR(shortcut, vector, dest); - - /* - * Send the IPI. The write to APIC_ICR fires this off. - */ - native_apic_mem_write(APIC_ICR, cfg); -} +void __default_send_IPI_shortcut(unsigned int shortcut, int vector, unsigned int dest); /* * This is used to send an IPI with no shorthand notation (the destination is * specified in bits 56 to 63 of the ICR). */ -static inline void - __default_send_IPI_dest_field(unsigned int mask, int vector, unsigned int dest) -{ - unsigned long cfg; - - /* - * Wait for idle. - */ - if (unlikely(vector == NMI_VECTOR)) - safe_apic_wait_icr_idle(); - else - __xapic_wait_icr_idle(); - - /* - * prepare target chip field - */ - cfg = __prepare_ICR2(mask); - native_apic_mem_write(APIC_ICR2, cfg); - - /* - * program the ICR - */ - cfg = __prepare_ICR(0, vector, dest); - - /* - * Send the IPI. The write to APIC_ICR fires this off. - */ - native_apic_mem_write(APIC_ICR, cfg); -} +void __default_send_IPI_dest_field(unsigned int mask, int vector, unsigned int dest); extern void default_send_IPI_single(int cpu, int vector); extern void default_send_IPI_single_phys(int cpu, int vector); diff --git a/arch/x86/include/asm/irq_work.h b/arch/x86/include/asm/irq_work.h index 78162f8e248b..d0afb05c84fc 100644 --- a/arch/x86/include/asm/irq_work.h +++ b/arch/x86/include/asm/irq_work.h @@ -1,7 +1,7 @@ #ifndef _ASM_IRQ_WORK_H #define _ASM_IRQ_WORK_H -#include +#include static inline bool arch_irq_work_has_interrupt(void) { diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h index cfff34172be0..92b6f651fa4f 100644 --- a/arch/x86/include/asm/mce.h +++ b/arch/x86/include/asm/mce.h @@ -135,6 +135,7 @@ struct mca_config { bool ignore_ce; bool disabled; bool ser; + bool recovery; bool bios_cmci_threshold; u8 banks; s8 bootlog; diff --git a/arch/x86/include/asm/microcode.h b/arch/x86/include/asm/microcode.h index 1e1b07a5a738..9d3a96c4da78 100644 --- a/arch/x86/include/asm/microcode.h +++ b/arch/x86/include/asm/microcode.h @@ -3,6 +3,7 @@ #include #include +#include #define native_rdmsr(msr, val1, val2) \ do { \ @@ -143,4 +144,29 @@ static inline void reload_early_microcode(void) { } static inline bool get_builtin_firmware(struct cpio_data *cd, const char *name) { return false; } #endif + +static inline unsigned long get_initrd_start(void) +{ +#ifdef CONFIG_BLK_DEV_INITRD + return initrd_start; +#else + return 0; +#endif +} + +static inline unsigned long get_initrd_start_addr(void) +{ +#ifdef CONFIG_BLK_DEV_INITRD +#ifdef CONFIG_X86_32 + unsigned long *initrd_start_p = (unsigned long *)__pa_nodebug(&initrd_start); + + return (unsigned long)__pa_nodebug(*initrd_start_p); +#else + return get_initrd_start(); +#endif +#else /* CONFIG_BLK_DEV_INITRD */ + return 0; +#endif +} + #endif /* _ASM_X86_MICROCODE_H */ diff --git a/arch/x86/include/asm/microcode_intel.h b/arch/x86/include/asm/microcode_intel.h index 8559b0102ea1..603417f8dd6c 100644 --- a/arch/x86/include/asm/microcode_intel.h +++ b/arch/x86/include/asm/microcode_intel.h @@ -40,7 +40,6 @@ struct extended_sigtable { #define DEFAULT_UCODE_TOTALSIZE (DEFAULT_UCODE_DATASIZE + MC_HEADER_SIZE) #define 
EXT_HEADER_SIZE (sizeof(struct extended_sigtable)) #define EXT_SIGNATURE_SIZE (sizeof(struct extended_signature)) -#define DWSIZE (sizeof(u32)) #define get_totalsize(mc) \ (((struct microcode_intel *)mc)->hdr.datasize ? \ diff --git a/arch/x86/include/asm/mmu.h b/arch/x86/include/asm/mmu.h index 55234d5e7160..1ea0baef1175 100644 --- a/arch/x86/include/asm/mmu.h +++ b/arch/x86/include/asm/mmu.h @@ -19,7 +19,8 @@ typedef struct { #endif struct mutex lock; - void __user *vdso; + void __user *vdso; /* vdso base address */ + const struct vdso_image *vdso_image; /* vdso image in use */ atomic_t perf_rdpmc_allowed; /* nonzero if rdpmc is allowed */ } mm_context_t; diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h index b05402ef3b84..984ab75bf621 100644 --- a/arch/x86/include/asm/msr-index.h +++ b/arch/x86/include/asm/msr-index.h @@ -1,7 +1,12 @@ #ifndef _ASM_X86_MSR_INDEX_H #define _ASM_X86_MSR_INDEX_H -/* CPU model specific register (MSR) numbers */ +/* + * CPU model specific register (MSR) numbers. + * + * Do not add new entries to this file unless the definitions are shared + * between multiple compilation units. + */ /* x86-64 specific MSRs */ #define MSR_EFER 0xc0000080 /* extended feature register */ diff --git a/arch/x86/include/asm/mwait.h b/arch/x86/include/asm/mwait.h index c70689b5e5aa..0deeb2d26df7 100644 --- a/arch/x86/include/asm/mwait.h +++ b/arch/x86/include/asm/mwait.h @@ -3,6 +3,8 @@ #include +#include + #define MWAIT_SUBSTATE_MASK 0xf #define MWAIT_CSTATE_MASK 0xf #define MWAIT_SUBSTATE_SIZE 4 diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index 813384ef811a..983738ac014c 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -13,7 +13,7 @@ struct vm86; #include #include #include -#include +#include #include #include #include @@ -24,7 +24,6 @@ struct vm86; #include #include -#include #include #include #include @@ -300,10 +299,13 @@ struct tss_struct { */ unsigned long io_bitmap[IO_BITMAP_LONGS + 1]; +#ifdef CONFIG_X86_32 /* - * Space for the temporary SYSENTER stack: + * Space for the temporary SYSENTER stack. 
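+	 *
+	 * (Hedged note: the canary word declared just below sits at the low
+	 * end of this 64-word stack, so a downward overflow hits it first;
+	 * the name suggests it exists purely so such an overflow can be
+	 * detected.)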
*/ + unsigned long SYSENTER_stack_canary; unsigned long SYSENTER_stack[64]; +#endif } ____cacheline_aligned; diff --git a/arch/x86/include/asm/proto.h b/arch/x86/include/asm/proto.h index a4a77286cb1d..9b9b30b19441 100644 --- a/arch/x86/include/asm/proto.h +++ b/arch/x86/include/asm/proto.h @@ -7,12 +7,23 @@ void syscall_init(void); +#ifdef CONFIG_X86_64 void entry_SYSCALL_64(void); -void entry_SYSCALL_compat(void); +#endif + +#ifdef CONFIG_X86_32 void entry_INT80_32(void); -void entry_INT80_compat(void); void entry_SYSENTER_32(void); +void __begin_SYSENTER_singlestep_region(void); +void __end_SYSENTER_singlestep_region(void); +#endif + +#ifdef CONFIG_IA32_EMULATION void entry_SYSENTER_compat(void); +void __end_entry_SYSENTER_compat(void); +void entry_SYSCALL_compat(void); +void entry_INT80_compat(void); +#endif void x86_configure_nx(void); void x86_report_nx(void); diff --git a/arch/x86/include/asm/sighandling.h b/arch/x86/include/asm/sighandling.h index 89db46752a8f..452c88b8ad06 100644 --- a/arch/x86/include/asm/sighandling.h +++ b/arch/x86/include/asm/sighandling.h @@ -13,7 +13,6 @@ X86_EFLAGS_CF | X86_EFLAGS_RF) void signal_fault(struct pt_regs *regs, void __user *frame, char *where); -int restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *sc); int setup_sigcontext(struct sigcontext __user *sc, void __user *fpstate, struct pt_regs *regs, unsigned long mask); diff --git a/arch/x86/include/asm/smap.h b/arch/x86/include/asm/smap.h index ba665ebd17bb..db333300bd4b 100644 --- a/arch/x86/include/asm/smap.h +++ b/arch/x86/include/asm/smap.h @@ -15,7 +15,7 @@ #include #include -#include +#include /* "Raw" instruction opcodes */ #define __ASM_CLAC .byte 0x0f,0x01,0xca diff --git a/arch/x86/include/asm/smp.h b/arch/x86/include/asm/smp.h index dfcf0727623b..20a3de5cb3b0 100644 --- a/arch/x86/include/asm/smp.h +++ b/arch/x86/include/asm/smp.h @@ -16,7 +16,6 @@ #endif #include #include -#include extern int smp_num_siblings; extern unsigned int num_processors; diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h index c7b551028740..82866697fcf1 100644 --- a/arch/x86/include/asm/thread_info.h +++ b/arch/x86/include/asm/thread_info.h @@ -49,7 +49,7 @@ */ #ifndef __ASSEMBLY__ struct task_struct; -#include +#include #include struct thread_info { @@ -134,10 +134,13 @@ struct thread_info { #define _TIF_ADDR32 (1 << TIF_ADDR32) #define _TIF_X32 (1 << TIF_X32) -/* work to do in syscall_trace_enter() */ +/* + * work to do in syscall_trace_enter(). Also includes TIF_NOHZ for + * enter_from_user_mode() + */ #define _TIF_WORK_SYSCALL_ENTRY \ (_TIF_SYSCALL_TRACE | _TIF_SYSCALL_EMU | _TIF_SYSCALL_AUDIT | \ - _TIF_SECCOMP | _TIF_SINGLESTEP | _TIF_SYSCALL_TRACEPOINT | \ + _TIF_SECCOMP | _TIF_SYSCALL_TRACEPOINT | \ _TIF_NOHZ) /* work to do on any return to user space */ diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h index 6df2029405a3..c24b4224d439 100644 --- a/arch/x86/include/asm/tlbflush.h +++ b/arch/x86/include/asm/tlbflush.h @@ -5,8 +5,57 @@ #include #include +#include #include +static inline void __invpcid(unsigned long pcid, unsigned long addr, + unsigned long type) +{ + struct { u64 d[2]; } desc = { { pcid, addr } }; + + /* + * The memory clobber is because the whole point is to invalidate + * stale TLB entries and, especially if we're flushing global + * mappings, we don't want the compiler to reorder any subsequent + * memory accesses before the TLB flush. 
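+	 *
+	 * A hedged usage sketch: callers are expected to go through the typed
+	 * wrappers below rather than __invpcid() itself. The global TLB flush
+	 * later in this patch does roughly
+	 *
+	 *	if (static_cpu_has(X86_FEATURE_INVPCID))
+	 *		invpcid_flush_all();
+	 *	else
+	 *		toggle CR4.PGE as before
+	 *
+	 * which avoids the CR4 read-modify-write dance when the CPU supports it.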
+ * + * The hex opcode is invpcid (%ecx), %eax in 32-bit mode and + * invpcid (%rcx), %rax in long mode. + */ + asm volatile (".byte 0x66, 0x0f, 0x38, 0x82, 0x01" + : : "m" (desc), "a" (type), "c" (&desc) : "memory"); +} + +#define INVPCID_TYPE_INDIV_ADDR 0 +#define INVPCID_TYPE_SINGLE_CTXT 1 +#define INVPCID_TYPE_ALL_INCL_GLOBAL 2 +#define INVPCID_TYPE_ALL_NON_GLOBAL 3 + +/* Flush all mappings for a given pcid and addr, not including globals. */ +static inline void invpcid_flush_one(unsigned long pcid, + unsigned long addr) +{ + __invpcid(pcid, addr, INVPCID_TYPE_INDIV_ADDR); +} + +/* Flush all mappings for a given PCID, not including globals. */ +static inline void invpcid_flush_single_context(unsigned long pcid) +{ + __invpcid(pcid, 0, INVPCID_TYPE_SINGLE_CTXT); +} + +/* Flush all mappings, including globals, for all PCIDs. */ +static inline void invpcid_flush_all(void) +{ + __invpcid(0, 0, INVPCID_TYPE_ALL_INCL_GLOBAL); +} + +/* Flush all mappings for all PCIDs except globals. */ +static inline void invpcid_flush_all_nonglobals(void) +{ + __invpcid(0, 0, INVPCID_TYPE_ALL_NON_GLOBAL); +} + #ifdef CONFIG_PARAVIRT #include #else @@ -104,6 +153,15 @@ static inline void __native_flush_tlb_global(void) { unsigned long flags; + if (static_cpu_has(X86_FEATURE_INVPCID)) { + /* + * Using INVPCID is considerably faster than a pair of writes + * to CR4 sandwiched inside an IRQ flag save/restore. + */ + invpcid_flush_all(); + return; + } + /* * Read-modify-write to CR4 - protect it from preemption and * from interrupts. (Use the raw variant because this code can diff --git a/arch/x86/include/asm/tsc.h b/arch/x86/include/asm/tsc.h index 6d7c5479bcea..174c4212780a 100644 --- a/arch/x86/include/asm/tsc.h +++ b/arch/x86/include/asm/tsc.h @@ -29,6 +29,8 @@ static inline cycles_t get_cycles(void) return rdtsc(); } +extern struct system_counterval_t convert_art_to_tsc(cycle_t art); + extern void tsc_init(void); extern void mark_tsc_unstable(char *reason); extern int unsynchronized_tsc(void); diff --git a/arch/x86/include/asm/uaccess_64.h b/arch/x86/include/asm/uaccess_64.h index b89c34c4019b..307698688fa1 100644 --- a/arch/x86/include/asm/uaccess_64.h +++ b/arch/x86/include/asm/uaccess_64.h @@ -8,7 +8,7 @@ #include #include #include -#include +#include #include /* diff --git a/arch/x86/include/asm/vdso.h b/arch/x86/include/asm/vdso.h index deabaf9759b6..43dc55be524e 100644 --- a/arch/x86/include/asm/vdso.h +++ b/arch/x86/include/asm/vdso.h @@ -13,9 +13,6 @@ struct vdso_image { void *data; unsigned long size; /* Always a multiple of PAGE_SIZE */ - /* text_mapping.pages is big enough for data/size page pointers */ - struct vm_special_mapping text_mapping; - unsigned long alt, alt_len; long sym_vvar_start; /* Negative offset to the vvar area */ diff --git a/arch/x86/include/asm/vgtod.h b/arch/x86/include/asm/vgtod.h index f556c4843aa1..e728699db774 100644 --- a/arch/x86/include/asm/vgtod.h +++ b/arch/x86/include/asm/vgtod.h @@ -37,6 +37,12 @@ struct vsyscall_gtod_data { }; extern struct vsyscall_gtod_data vsyscall_gtod_data; +extern int vclocks_used; +static inline bool vclock_was_used(int vclock) +{ + return READ_ONCE(vclocks_used) & (1 << vclock); +} + static inline unsigned gtod_read_begin(const struct vsyscall_gtod_data *s) { unsigned ret; diff --git a/arch/x86/include/uapi/asm/sigcontext.h b/arch/x86/include/uapi/asm/sigcontext.h index d485232f1e9f..62d4111c1c54 100644 --- a/arch/x86/include/uapi/asm/sigcontext.h +++ b/arch/x86/include/uapi/asm/sigcontext.h @@ -256,7 +256,7 @@ struct sigcontext_64 { 
__u16 cs; __u16 gs; __u16 fs; - __u16 __pad0; + __u16 ss; __u64 err; __u64 trapno; __u64 oldmask; @@ -341,9 +341,37 @@ struct sigcontext { __u64 rip; __u64 eflags; /* RFLAGS */ __u16 cs; + + /* + * Prior to 2.5.64 ("[PATCH] x86-64 updates for 2.5.64-bk3"), + * Linux saved and restored fs and gs in these slots. This + * was counterproductive, as fsbase and gsbase were never + * saved, so arch_prctl was presumably unreliable. + * + * These slots should never be reused without extreme caution: + * + * - Some DOSEMU versions stash fs and gs in these slots manually, + * thus overwriting anything the kernel expects to be preserved + * in these slots. + * + * - If these slots are ever needed for any other purpose, + * there is some risk that very old 64-bit binaries could get + * confused. I doubt that many such binaries still work, + * though, since the same patch in 2.5.64 also removed the + * 64-bit set_thread_area syscall, so it appears that there + * is no TLS API beyond modify_ldt that works in both pre- + * and post-2.5.64 kernels. + * + * If the kernel ever adds explicit fs, gs, fsbase, and gsbase + * save/restore, it will most likely need to be opt-in and use + * different context slots. + */ __u16 gs; __u16 fs; - __u16 __pad0; + union { + __u16 ss; /* If UC_SIGCONTEXT_SS */ + __u16 __pad0; /* Alias name for old (!UC_SIGCONTEXT_SS) user-space */ + }; __u64 err; __u64 trapno; __u64 oldmask; diff --git a/arch/x86/include/uapi/asm/ucontext.h b/arch/x86/include/uapi/asm/ucontext.h index b7c29c8017f2..e3d1ec90616e 100644 --- a/arch/x86/include/uapi/asm/ucontext.h +++ b/arch/x86/include/uapi/asm/ucontext.h @@ -1,11 +1,54 @@ #ifndef _ASM_X86_UCONTEXT_H #define _ASM_X86_UCONTEXT_H -#define UC_FP_XSTATE 0x1 /* indicates the presence of extended state - * information in the memory layout pointed - * by the fpstate pointer in the ucontext's - * sigcontext struct (uc_mcontext). - */ +/* + * Indicates the presence of extended state information in the memory + * layout pointed by the fpstate pointer in the ucontext's sigcontext + * struct (uc_mcontext). + */ +#define UC_FP_XSTATE 0x1 + +#ifdef __x86_64__ +/* + * UC_SIGCONTEXT_SS will be set when delivering 64-bit or x32 signals on + * kernels that save SS in the sigcontext. All kernels that set + * UC_SIGCONTEXT_SS will correctly restore at least the low 32 bits of esp + * regardless of SS (i.e. they implement espfix). + * + * Kernels that set UC_SIGCONTEXT_SS will also set UC_STRICT_RESTORE_SS + * when delivering a signal that came from 64-bit code. + * + * Sigreturn restores SS as follows: + * + * if (saved SS is valid || UC_STRICT_RESTORE_SS is set || + * saved CS is not 64-bit) + * new SS = saved SS (will fail IRET and signal if invalid) + * else + * new SS = a flat 32-bit data segment + * + * This behavior serves three purposes: + * + * - Legacy programs that construct a 64-bit sigcontext from scratch + * with zero or garbage in the SS slot (e.g. old CRIU) and call + * sigreturn will still work. + * + * - Old DOSEMU versions sometimes catch a signal from a segmented + * context, delete the old SS segment (with modify_ldt), and change + * the saved CS to a 64-bit segment. These DOSEMU versions expect + * sigreturn to send them back to 64-bit mode without killing them, + * despite the fact that the SS selector when the signal was raised is + * no longer valid. UC_STRICT_RESTORE_SS will be clear, so the kernel + * will fix up SS for these DOSEMU versions. 
+ * + * - Old and new programs that catch a signal and return without + * modifying the saved context will end up in exactly the state they + * started in, even if they were running in a segmented context when + * the signal was raised.. Old kernels would lose track of the + * previous SS value. + */ +#define UC_SIGCONTEXT_SS 0x2 +#define UC_STRICT_RESTORE_SS 0x4 +#endif #include diff --git a/arch/x86/kernel/apic/apic_flat_64.c b/arch/x86/kernel/apic/apic_flat_64.c index 9968f30cca3e..76f89e2b245a 100644 --- a/arch/x86/kernel/apic/apic_flat_64.c +++ b/arch/x86/kernel/apic/apic_flat_64.c @@ -53,7 +53,7 @@ void flat_init_apic_ldr(void) apic_write(APIC_LDR, val); } -static inline void _flat_send_IPI_mask(unsigned long mask, int vector) +static void _flat_send_IPI_mask(unsigned long mask, int vector) { unsigned long flags; diff --git a/arch/x86/kernel/apic/apic_numachip.c b/arch/x86/kernel/apic/apic_numachip.c index c80c02c6ec49..ab5c2c685a3c 100644 --- a/arch/x86/kernel/apic/apic_numachip.c +++ b/arch/x86/kernel/apic/apic_numachip.c @@ -30,7 +30,7 @@ static unsigned int numachip1_get_apic_id(unsigned long x) unsigned long value; unsigned int id = (x >> 24) & 0xff; - if (static_cpu_has_safe(X86_FEATURE_NODEID_MSR)) { + if (static_cpu_has(X86_FEATURE_NODEID_MSR)) { rdmsrl(MSR_FAM10H_NODE_ID, value); id |= (value << 2) & 0xff00; } @@ -178,7 +178,7 @@ static void fixup_cpu_id(struct cpuinfo_x86 *c, int node) this_cpu_write(cpu_llc_id, node); /* Account for nodes per socket in multi-core-module processors */ - if (static_cpu_has_safe(X86_FEATURE_NODEID_MSR)) { + if (static_cpu_has(X86_FEATURE_NODEID_MSR)) { rdmsrl(MSR_FAM10H_NODE_ID, val); nodes = ((val >> 3) & 7) + 1; } diff --git a/arch/x86/kernel/apic/ipi.c b/arch/x86/kernel/apic/ipi.c index eb45fc9b6124..28bde88b0085 100644 --- a/arch/x86/kernel/apic/ipi.c +++ b/arch/x86/kernel/apic/ipi.c @@ -18,6 +18,66 @@ #include #include +void __default_send_IPI_shortcut(unsigned int shortcut, int vector, unsigned int dest) +{ + /* + * Subtle. In the case of the 'never do double writes' workaround + * we have to lock out interrupts to be safe. As we don't care + * of the value read we use an atomic rmw access to avoid costly + * cli/sti. Otherwise we use an even cheaper single atomic write + * to the APIC. + */ + unsigned int cfg; + + /* + * Wait for idle. + */ + __xapic_wait_icr_idle(); + + /* + * No need to touch the target chip field + */ + cfg = __prepare_ICR(shortcut, vector, dest); + + /* + * Send the IPI. The write to APIC_ICR fires this off. + */ + native_apic_mem_write(APIC_ICR, cfg); +} + +/* + * This is used to send an IPI with no shorthand notation (the destination is + * specified in bits 56 to 63 of the ICR). + */ +void __default_send_IPI_dest_field(unsigned int mask, int vector, unsigned int dest) +{ + unsigned long cfg; + + /* + * Wait for idle. + */ + if (unlikely(vector == NMI_VECTOR)) + safe_apic_wait_icr_idle(); + else + __xapic_wait_icr_idle(); + + /* + * prepare target chip field + */ + cfg = __prepare_ICR2(mask); + native_apic_mem_write(APIC_ICR2, cfg); + + /* + * program the ICR + */ + cfg = __prepare_ICR(0, vector, dest); + + /* + * Send the IPI. The write to APIC_ICR fires this off. 
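+	 * (The destination shorthand chosen by the caller is encoded in the
+	 * low ICR dword itself, bits 18-19, which is why, unlike
+	 * __default_send_IPI_dest_field() below, no APIC_ICR2 write is needed.)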
+ */ + native_apic_mem_write(APIC_ICR, cfg); +} + void default_send_IPI_single_phys(int cpu, int vector) { unsigned long flags; diff --git a/arch/x86/kernel/asm-offsets.c b/arch/x86/kernel/asm-offsets.c index 84a7524b202c..5c042466f274 100644 --- a/arch/x86/kernel/asm-offsets.c +++ b/arch/x86/kernel/asm-offsets.c @@ -59,7 +59,6 @@ void common(void) { #ifdef CONFIG_PARAVIRT BLANK(); - OFFSET(PARAVIRT_enabled, pv_info, paravirt_enabled); OFFSET(PARAVIRT_PATCH_pv_cpu_ops, paravirt_patch_template, pv_cpu_ops); OFFSET(PARAVIRT_PATCH_pv_irq_ops, paravirt_patch_template, pv_irq_ops); OFFSET(PV_IRQ_irq_disable, pv_irq_ops, irq_disable); diff --git a/arch/x86/kernel/asm-offsets_32.c b/arch/x86/kernel/asm-offsets_32.c index 6ce39025f467..ecdc1d217dc0 100644 --- a/arch/x86/kernel/asm-offsets_32.c +++ b/arch/x86/kernel/asm-offsets_32.c @@ -7,7 +7,7 @@ #include #include "../../../drivers/lguest/lg.h" -#define __SYSCALL_I386(nr, sym, compat) [nr] = 1, +#define __SYSCALL_I386(nr, sym, qual) [nr] = 1, static char syscalls[] = { #include }; @@ -52,6 +52,11 @@ void foo(void) DEFINE(TSS_sysenter_sp0, offsetof(struct tss_struct, x86_tss.sp0) - offsetofend(struct tss_struct, SYSENTER_stack)); + /* Offset from cpu_tss to SYSENTER_stack */ + OFFSET(CPU_TSS_SYSENTER_stack, tss_struct, SYSENTER_stack); + /* Size of SYSENTER_stack */ + DEFINE(SIZEOF_SYSENTER_stack, sizeof(((struct tss_struct *)0)->SYSENTER_stack)); + #if defined(CONFIG_LGUEST) || defined(CONFIG_LGUEST_GUEST) || defined(CONFIG_LGUEST_MODULE) BLANK(); OFFSET(LGUEST_DATA_irq_enabled, lguest_data, irq_enabled); diff --git a/arch/x86/kernel/asm-offsets_64.c b/arch/x86/kernel/asm-offsets_64.c index f2edafb5f24e..d875f97d4e0b 100644 --- a/arch/x86/kernel/asm-offsets_64.c +++ b/arch/x86/kernel/asm-offsets_64.c @@ -4,17 +4,11 @@ #include -#define __SYSCALL_64(nr, sym, compat) [nr] = 1, -#define __SYSCALL_COMMON(nr, sym, compat) [nr] = 1, -#ifdef CONFIG_X86_X32_ABI -# define __SYSCALL_X32(nr, sym, compat) [nr] = 1, -#else -# define __SYSCALL_X32(nr, sym, compat) /* nothing */ -#endif +#define __SYSCALL_64(nr, sym, qual) [nr] = 1, static char syscalls_64[] = { #include }; -#define __SYSCALL_I386(nr, sym, compat) [nr] = 1, +#define __SYSCALL_I386(nr, sym, qual) [nr] = 1, static char syscalls_ia32[] = { #include }; diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile index 7a60424d63fa..0d373d7affc8 100644 --- a/arch/x86/kernel/cpu/Makefile +++ b/arch/x86/kernel/cpu/Makefile @@ -42,7 +42,7 @@ ifdef CONFIG_X86_FEATURE_NAMES quiet_cmd_mkcapflags = MKCAP $@ cmd_mkcapflags = $(CONFIG_SHELL) $(srctree)/$(src)/mkcapflags.sh $< $@ -cpufeature = $(src)/../../include/asm/cpufeature.h +cpufeature = $(src)/../../include/asm/cpufeatures.h targets += capflags.c $(obj)/capflags.c: $(cpufeature) $(src)/mkcapflags.sh FORCE diff --git a/arch/x86/kernel/cpu/centaur.c b/arch/x86/kernel/cpu/centaur.c index ce197bb7c129..1661d8ec9280 100644 --- a/arch/x86/kernel/cpu/centaur.c +++ b/arch/x86/kernel/cpu/centaur.c @@ -1,7 +1,7 @@ #include #include -#include +#include #include #include #include diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 81cf716f6f97..249461f95851 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -162,6 +162,22 @@ static int __init x86_mpx_setup(char *s) } __setup("nompx", x86_mpx_setup); +static int __init x86_noinvpcid_setup(char *s) +{ + /* noinvpcid doesn't accept parameters */ + if (s) + return -EINVAL; + + /* do not emit a message if the feature is not present */ + if 
(!boot_cpu_has(X86_FEATURE_INVPCID)) + return 0; + + setup_clear_cpu_cap(X86_FEATURE_INVPCID); + pr_info("noinvpcid: INVPCID feature disabled\n"); + return 0; +} +early_param("noinvpcid", x86_noinvpcid_setup); + #ifdef CONFIG_X86_32 static int cachesize_override = -1; static int disable_x86_serial_nr = 1; @@ -800,6 +816,31 @@ static void detect_nopl(struct cpuinfo_x86 *c) clear_cpu_cap(c, X86_FEATURE_NOPL); #else set_cpu_cap(c, X86_FEATURE_NOPL); +#endif + + /* + * ESPFIX is a strange bug. All real CPUs have it. Paravirt + * systems that run Linux at CPL > 0 may or may not have the + * issue, but, even if they have the issue, there's absolutely + * nothing we can do about it because we can't use the real IRET + * instruction. + * + * NB: For the time being, only 32-bit kernels support + * X86_BUG_ESPFIX as such. 64-bit kernels directly choose + * whether to apply espfix using paravirt hooks. If any + * non-paravirt system ever shows up that does *not* have the + * ESPFIX issue, we can change this. + */ +#ifdef CONFIG_X86_32 +#ifdef CONFIG_PARAVIRT + do { + extern void native_iret(void); + if (pv_cpu_ops.iret == native_iret) + set_cpu_bug(c, X86_BUG_ESPFIX); + } while (0); +#else + set_cpu_bug(c, X86_BUG_ESPFIX); +#endif #endif } @@ -1475,20 +1516,6 @@ void cpu_init(void) } #endif -#ifdef CONFIG_X86_DEBUG_STATIC_CPU_HAS -void warn_pre_alternatives(void) -{ - WARN(1, "You're using static_cpu_has before alternatives have run!\n"); -} -EXPORT_SYMBOL_GPL(warn_pre_alternatives); -#endif - -inline bool __static_cpu_has_safe(u16 bit) -{ - return boot_cpu_has(bit); -} -EXPORT_SYMBOL_GPL(__static_cpu_has_safe); - static void bsp_resume(void) { if (this_cpu->c_bsp_resume) diff --git a/arch/x86/kernel/cpu/cyrix.c b/arch/x86/kernel/cpu/cyrix.c index 187bb583d0df..6adef9cac23e 100644 --- a/arch/x86/kernel/cpu/cyrix.c +++ b/arch/x86/kernel/cpu/cyrix.c @@ -8,6 +8,7 @@ #include #include #include +#include #include "cpu.h" diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c index 38766c2b5b00..1f7fdb91a818 100644 --- a/arch/x86/kernel/cpu/intel.c +++ b/arch/x86/kernel/cpu/intel.c @@ -8,7 +8,7 @@ #include #include -#include +#include #include #include #include diff --git a/arch/x86/kernel/cpu/intel_cacheinfo.c b/arch/x86/kernel/cpu/intel_cacheinfo.c index 6ed779efff26..de6626c18e42 100644 --- a/arch/x86/kernel/cpu/intel_cacheinfo.c +++ b/arch/x86/kernel/cpu/intel_cacheinfo.c @@ -14,7 +14,7 @@ #include #include -#include +#include #include #include diff --git a/arch/x86/kernel/cpu/match.c b/arch/x86/kernel/cpu/match.c index afa9f0d487ea..fbb5e90557a5 100644 --- a/arch/x86/kernel/cpu/match.c +++ b/arch/x86/kernel/cpu/match.c @@ -1,5 +1,5 @@ #include -#include +#include #include #include #include diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index 524f2a8492d7..f0c921b03e42 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -1578,6 +1578,17 @@ static int __mcheck_cpu_apply_quirks(struct cpuinfo_x86 *c) if (c->x86 == 6 && c->x86_model == 45) quirk_no_way_out = quirk_sandybridge_ifu; + /* + * MCG_CAP.MCG_SER_P is necessary but not sufficient to know + * whether this processor will actually generate recoverable + * machine checks. Check to see if this is an E7 model Xeon. + * We can't do a model number check because E5 and E7 use the + * same model number. E5 doesn't support recovery, E7 does. 
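+	 * (Booting with "mce=recovery", parsed in mcheck_enable() further down
+	 * in this patch, sets mca_cfg.recovery and forces this capability even
+	 * when the model string check does not match.)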
+ */ + if (mca_cfg.recovery || (mca_cfg.ser && + !strncmp(c->x86_model_id, + "Intel(R) Xeon(R) CPU E7-", 24))) + set_cpu_cap(c, X86_FEATURE_MCE_RECOVERY); } if (cfg->monarch_timeout < 0) cfg->monarch_timeout = 0; @@ -2030,6 +2041,8 @@ static int __init mcheck_enable(char *str) cfg->bootlog = (str[0] == 'b'); else if (!strcmp(str, "bios_cmci_threshold")) cfg->bios_cmci_threshold = true; + else if (!strcmp(str, "recovery")) + cfg->recovery = true; else if (isdigit(str[0])) { if (get_option(&str, &cfg->tolerant) == 2) get_option(&str, &(cfg->monarch_timeout)); diff --git a/arch/x86/kernel/cpu/microcode/amd.c b/arch/x86/kernel/cpu/microcode/amd.c index 75d3aab5f7b2..8581963894c7 100644 --- a/arch/x86/kernel/cpu/microcode/amd.c +++ b/arch/x86/kernel/cpu/microcode/amd.c @@ -431,10 +431,6 @@ int __init save_microcode_in_initrd_amd(void) else container = cont_va; - if (ucode_new_rev) - pr_info("microcode: updated early to new patch_level=0x%08x\n", - ucode_new_rev); - eax = cpuid_eax(0x00000001); eax = ((eax >> 8) & 0xf) + ((eax >> 20) & 0xff); @@ -469,8 +465,7 @@ void reload_ucode_amd(void) if (mc && rev < mc->hdr.patch_id) { if (!__apply_microcode_amd(mc)) { ucode_new_rev = mc->hdr.patch_id; - pr_info("microcode: reload patch_level=0x%08x\n", - ucode_new_rev); + pr_info("reload patch_level=0x%08x\n", ucode_new_rev); } } } @@ -793,15 +788,13 @@ static int verify_and_add_patch(u8 family, u8 *fw, unsigned int leftover) return -EINVAL; } - patch->data = kzalloc(patch_size, GFP_KERNEL); + patch->data = kmemdup(fw + SECTION_HDR_SIZE, patch_size, GFP_KERNEL); if (!patch->data) { pr_err("Patch data allocation failure.\n"); kfree(patch); return -EINVAL; } - /* All looks ok, copy patch... */ - memcpy(patch->data, fw + SECTION_HDR_SIZE, patch_size); INIT_LIST_HEAD(&patch->plist); patch->patch_id = mc_hdr->patch_id; patch->equiv_cpu = proc_id; @@ -957,6 +950,10 @@ struct microcode_ops * __init init_amd_microcode(void) return NULL; } + if (ucode_new_rev) + pr_info_once("microcode updated early to new patch_level=0x%08x\n", + ucode_new_rev); + return µcode_amd_ops; } diff --git a/arch/x86/kernel/cpu/microcode/core.c b/arch/x86/kernel/cpu/microcode/core.c index faec7120c508..ac360bfbbdb6 100644 --- a/arch/x86/kernel/cpu/microcode/core.c +++ b/arch/x86/kernel/cpu/microcode/core.c @@ -43,16 +43,8 @@ #define MICROCODE_VERSION "2.01" static struct microcode_ops *microcode_ops; - static bool dis_ucode_ldr; -static int __init disable_loader(char *str) -{ - dis_ucode_ldr = true; - return 1; -} -__setup("dis_ucode_ldr", disable_loader); - /* * Synchronization. 
* @@ -81,15 +73,16 @@ struct cpu_info_ctx { static bool __init check_loader_disabled_bsp(void) { + static const char *__dis_opt_str = "dis_ucode_ldr"; + #ifdef CONFIG_X86_32 const char *cmdline = (const char *)__pa_nodebug(boot_command_line); - const char *opt = "dis_ucode_ldr"; - const char *option = (const char *)__pa_nodebug(opt); + const char *option = (const char *)__pa_nodebug(__dis_opt_str); bool *res = (bool *)__pa_nodebug(&dis_ucode_ldr); #else /* CONFIG_X86_64 */ const char *cmdline = boot_command_line; - const char *option = "dis_ucode_ldr"; + const char *option = __dis_opt_str; bool *res = &dis_ucode_ldr; #endif @@ -479,7 +472,7 @@ static enum ucode_state microcode_init_cpu(int cpu, bool refresh_fw) enum ucode_state ustate; struct ucode_cpu_info *uci = ucode_cpu_info + cpu; - if (uci && uci->valid) + if (uci->valid) return UCODE_OK; if (collect_cpu_info(cpu)) @@ -630,7 +623,7 @@ int __init microcode_init(void) struct cpuinfo_x86 *c = &boot_cpu_data; int error; - if (paravirt_enabled() || dis_ucode_ldr) + if (dis_ucode_ldr) return -EINVAL; if (c->x86_vendor == X86_VENDOR_INTEL) diff --git a/arch/x86/kernel/cpu/microcode/intel.c b/arch/x86/kernel/cpu/microcode/intel.c index ee81c544ee0d..cbb3cf09b065 100644 --- a/arch/x86/kernel/cpu/microcode/intel.c +++ b/arch/x86/kernel/cpu/microcode/intel.c @@ -39,9 +39,15 @@ #include #include -static unsigned long mc_saved_in_initrd[MAX_UCODE_COUNT]; +/* + * Temporary microcode blobs pointers storage. We note here the pointers to + * microcode blobs we've got from whatever storage (detached initrd, builtin). + * Later on, we put those into final storage mc_saved_data.mc_saved. + */ +static unsigned long mc_tmp_ptrs[MAX_UCODE_COUNT]; + static struct mc_saved_data { - unsigned int mc_saved_count; + unsigned int num_saved; struct microcode_intel **mc_saved; } mc_saved_data; @@ -78,53 +84,50 @@ load_microcode_early(struct microcode_intel **saved, } static inline void -copy_initrd_ptrs(struct microcode_intel **mc_saved, unsigned long *initrd, - unsigned long off, int num_saved) +copy_ptrs(struct microcode_intel **mc_saved, unsigned long *mc_ptrs, + unsigned long off, int num_saved) { int i; for (i = 0; i < num_saved; i++) - mc_saved[i] = (struct microcode_intel *)(initrd[i] + off); + mc_saved[i] = (struct microcode_intel *)(mc_ptrs[i] + off); } #ifdef CONFIG_X86_32 static void -microcode_phys(struct microcode_intel **mc_saved_tmp, - struct mc_saved_data *mc_saved_data) +microcode_phys(struct microcode_intel **mc_saved_tmp, struct mc_saved_data *mcs) { int i; struct microcode_intel ***mc_saved; - mc_saved = (struct microcode_intel ***) - __pa_nodebug(&mc_saved_data->mc_saved); - for (i = 0; i < mc_saved_data->mc_saved_count; i++) { + mc_saved = (struct microcode_intel ***)__pa_nodebug(&mcs->mc_saved); + + for (i = 0; i < mcs->num_saved; i++) { struct microcode_intel *p; - p = *(struct microcode_intel **) - __pa_nodebug(mc_saved_data->mc_saved + i); + p = *(struct microcode_intel **)__pa_nodebug(mcs->mc_saved + i); mc_saved_tmp[i] = (struct microcode_intel *)__pa_nodebug(p); } } #endif static enum ucode_state -load_microcode(struct mc_saved_data *mc_saved_data, unsigned long *initrd, - unsigned long initrd_start, struct ucode_cpu_info *uci) +load_microcode(struct mc_saved_data *mcs, unsigned long *mc_ptrs, + unsigned long offset, struct ucode_cpu_info *uci) { struct microcode_intel *mc_saved_tmp[MAX_UCODE_COUNT]; - unsigned int count = mc_saved_data->mc_saved_count; + unsigned int count = mcs->num_saved; - if (!mc_saved_data->mc_saved) { - 
copy_initrd_ptrs(mc_saved_tmp, initrd, initrd_start, count); + if (!mcs->mc_saved) { + copy_ptrs(mc_saved_tmp, mc_ptrs, offset, count); return load_microcode_early(mc_saved_tmp, count, uci); } else { #ifdef CONFIG_X86_32 - microcode_phys(mc_saved_tmp, mc_saved_data); + microcode_phys(mc_saved_tmp, mcs); return load_microcode_early(mc_saved_tmp, count, uci); #else - return load_microcode_early(mc_saved_data->mc_saved, - count, uci); + return load_microcode_early(mcs->mc_saved, count, uci); #endif } } @@ -175,25 +178,25 @@ matching_model_microcode(struct microcode_header_intel *mc_header, } static int -save_microcode(struct mc_saved_data *mc_saved_data, +save_microcode(struct mc_saved_data *mcs, struct microcode_intel **mc_saved_src, - unsigned int mc_saved_count) + unsigned int num_saved) { int i, j; struct microcode_intel **saved_ptr; int ret; - if (!mc_saved_count) + if (!num_saved) return -EINVAL; /* * Copy new microcode data. */ - saved_ptr = kcalloc(mc_saved_count, sizeof(struct microcode_intel *), GFP_KERNEL); + saved_ptr = kcalloc(num_saved, sizeof(struct microcode_intel *), GFP_KERNEL); if (!saved_ptr) return -ENOMEM; - for (i = 0; i < mc_saved_count; i++) { + for (i = 0; i < num_saved; i++) { struct microcode_header_intel *mc_hdr; struct microcode_intel *mc; unsigned long size; @@ -207,20 +210,18 @@ save_microcode(struct mc_saved_data *mc_saved_data, mc_hdr = &mc->hdr; size = get_totalsize(mc_hdr); - saved_ptr[i] = kmalloc(size, GFP_KERNEL); + saved_ptr[i] = kmemdup(mc, size, GFP_KERNEL); if (!saved_ptr[i]) { ret = -ENOMEM; goto err; } - - memcpy(saved_ptr[i], mc, size); } /* * Point to newly saved microcode. */ - mc_saved_data->mc_saved = saved_ptr; - mc_saved_data->mc_saved_count = mc_saved_count; + mcs->mc_saved = saved_ptr; + mcs->num_saved = num_saved; return 0; @@ -284,22 +285,20 @@ static unsigned int _save_mc(struct microcode_intel **mc_saved, * BSP can stay in the platform. */ static enum ucode_state __init -get_matching_model_microcode(int cpu, unsigned long start, - void *data, size_t size, - struct mc_saved_data *mc_saved_data, - unsigned long *mc_saved_in_initrd, +get_matching_model_microcode(unsigned long start, void *data, size_t size, + struct mc_saved_data *mcs, unsigned long *mc_ptrs, struct ucode_cpu_info *uci) { - u8 *ucode_ptr = data; - unsigned int leftover = size; - enum ucode_state state = UCODE_OK; - unsigned int mc_size; - struct microcode_header_intel *mc_header; struct microcode_intel *mc_saved_tmp[MAX_UCODE_COUNT]; - unsigned int mc_saved_count = mc_saved_data->mc_saved_count; + struct microcode_header_intel *mc_header; + unsigned int num_saved = mcs->num_saved; + enum ucode_state state = UCODE_OK; + unsigned int leftover = size; + u8 *ucode_ptr = data; + unsigned int mc_size; int i; - while (leftover && mc_saved_count < ARRAY_SIZE(mc_saved_tmp)) { + while (leftover && num_saved < ARRAY_SIZE(mc_saved_tmp)) { if (leftover < sizeof(mc_header)) break; @@ -318,32 +317,31 @@ get_matching_model_microcode(int cpu, unsigned long start, * the platform, we need to find and save microcode patches * with the same family and model as the BSP. 
*/ - if (matching_model_microcode(mc_header, uci->cpu_sig.sig) != - UCODE_OK) { + if (matching_model_microcode(mc_header, uci->cpu_sig.sig) != UCODE_OK) { ucode_ptr += mc_size; continue; } - mc_saved_count = _save_mc(mc_saved_tmp, ucode_ptr, mc_saved_count); + num_saved = _save_mc(mc_saved_tmp, ucode_ptr, num_saved); ucode_ptr += mc_size; } if (leftover) { state = UCODE_ERROR; - goto out; + return state; } - if (mc_saved_count == 0) { + if (!num_saved) { state = UCODE_NFOUND; - goto out; + return state; } - for (i = 0; i < mc_saved_count; i++) - mc_saved_in_initrd[i] = (unsigned long)mc_saved_tmp[i] - start; + for (i = 0; i < num_saved; i++) + mc_ptrs[i] = (unsigned long)mc_saved_tmp[i] - start; + + mcs->num_saved = num_saved; - mc_saved_data->mc_saved_count = mc_saved_count; -out: return state; } @@ -373,7 +371,7 @@ static int collect_cpu_info_early(struct ucode_cpu_info *uci) native_rdmsr(MSR_IA32_PLATFORM_ID, val[0], val[1]); csig.pf = 1 << ((val[1] >> 18) & 7); } - native_wrmsr(MSR_IA32_UCODE_REV, 0, 0); + native_wrmsrl(MSR_IA32_UCODE_REV, 0); /* As documented in the SDM: Do a CPUID 1 here */ sync_core(); @@ -396,11 +394,11 @@ static void show_saved_mc(void) unsigned int sig, pf, rev, total_size, data_size, date; struct ucode_cpu_info uci; - if (mc_saved_data.mc_saved_count == 0) { + if (!mc_saved_data.num_saved) { pr_debug("no microcode data saved.\n"); return; } - pr_debug("Total microcode saved: %d\n", mc_saved_data.mc_saved_count); + pr_debug("Total microcode saved: %d\n", mc_saved_data.num_saved); collect_cpu_info_early(&uci); @@ -409,7 +407,7 @@ static void show_saved_mc(void) rev = uci.cpu_sig.rev; pr_debug("CPU: sig=0x%x, pf=0x%x, rev=0x%x\n", sig, pf, rev); - for (i = 0; i < mc_saved_data.mc_saved_count; i++) { + for (i = 0; i < mc_saved_data.num_saved; i++) { struct microcode_header_intel *mc_saved_header; struct extended_sigtable *ext_header; int ext_sigcount; @@ -465,7 +463,7 @@ int save_mc_for_early(u8 *mc) { struct microcode_intel *mc_saved_tmp[MAX_UCODE_COUNT]; unsigned int mc_saved_count_init; - unsigned int mc_saved_count; + unsigned int num_saved; struct microcode_intel **mc_saved; int ret = 0; int i; @@ -476,23 +474,23 @@ int save_mc_for_early(u8 *mc) */ mutex_lock(&x86_cpu_microcode_mutex); - mc_saved_count_init = mc_saved_data.mc_saved_count; - mc_saved_count = mc_saved_data.mc_saved_count; + mc_saved_count_init = mc_saved_data.num_saved; + num_saved = mc_saved_data.num_saved; mc_saved = mc_saved_data.mc_saved; - if (mc_saved && mc_saved_count) + if (mc_saved && num_saved) memcpy(mc_saved_tmp, mc_saved, - mc_saved_count * sizeof(struct microcode_intel *)); + num_saved * sizeof(struct microcode_intel *)); /* * Save the microcode patch mc in mc_save_tmp structure if it's a newer * version. */ - mc_saved_count = _save_mc(mc_saved_tmp, mc, mc_saved_count); + num_saved = _save_mc(mc_saved_tmp, mc, num_saved); /* * Save the mc_save_tmp in global mc_saved_data. 
*/ - ret = save_microcode(&mc_saved_data, mc_saved_tmp, mc_saved_count); + ret = save_microcode(&mc_saved_data, mc_saved_tmp, num_saved); if (ret) { pr_err("Cannot save microcode patch.\n"); goto out; @@ -536,7 +534,7 @@ static bool __init load_builtin_intel_microcode(struct cpio_data *cp) static __initdata char ucode_name[] = "kernel/x86/microcode/GenuineIntel.bin"; static __init enum ucode_state -scan_microcode(struct mc_saved_data *mc_saved_data, unsigned long *initrd, +scan_microcode(struct mc_saved_data *mcs, unsigned long *mc_ptrs, unsigned long start, unsigned long size, struct ucode_cpu_info *uci) { @@ -551,14 +549,18 @@ scan_microcode(struct mc_saved_data *mc_saved_data, unsigned long *initrd, cd.data = NULL; cd.size = 0; - cd = find_cpio_data(p, (void *)start, size, &offset); - if (!cd.data) { + /* try built-in microcode if no initrd */ + if (!size) { if (!load_builtin_intel_microcode(&cd)) return UCODE_ERROR; + } else { + cd = find_cpio_data(p, (void *)start, size, &offset); + if (!cd.data) + return UCODE_ERROR; } - return get_matching_model_microcode(0, start, cd.data, cd.size, - mc_saved_data, initrd, uci); + return get_matching_model_microcode(start, cd.data, cd.size, + mcs, mc_ptrs, uci); } /* @@ -567,14 +569,11 @@ scan_microcode(struct mc_saved_data *mc_saved_data, unsigned long *initrd, static void print_ucode_info(struct ucode_cpu_info *uci, unsigned int date) { - int cpu = smp_processor_id(); - - pr_info("CPU%d microcode updated early to revision 0x%x, date = %04x-%02x-%02x\n", - cpu, - uci->cpu_sig.rev, - date & 0xffff, - date >> 24, - (date >> 16) & 0xff); + pr_info_once("microcode updated early to revision 0x%x, date = %04x-%02x-%02x\n", + uci->cpu_sig.rev, + date & 0xffff, + date >> 24, + (date >> 16) & 0xff); } #ifdef CONFIG_X86_32 @@ -603,19 +602,19 @@ void show_ucode_info_early(void) */ static void print_ucode(struct ucode_cpu_info *uci) { - struct microcode_intel *mc_intel; + struct microcode_intel *mc; int *delay_ucode_info_p; int *current_mc_date_p; - mc_intel = uci->mc; - if (mc_intel == NULL) + mc = uci->mc; + if (!mc) return; delay_ucode_info_p = (int *)__pa_nodebug(&delay_ucode_info); current_mc_date_p = (int *)__pa_nodebug(¤t_mc_date); *delay_ucode_info_p = 1; - *current_mc_date_p = mc_intel->hdr.date; + *current_mc_date_p = mc->hdr.date; } #else @@ -630,37 +629,35 @@ static inline void flush_tlb_early(void) static inline void print_ucode(struct ucode_cpu_info *uci) { - struct microcode_intel *mc_intel; + struct microcode_intel *mc; - mc_intel = uci->mc; - if (mc_intel == NULL) + mc = uci->mc; + if (!mc) return; - print_ucode_info(uci, mc_intel->hdr.date); + print_ucode_info(uci, mc->hdr.date); } #endif static int apply_microcode_early(struct ucode_cpu_info *uci, bool early) { - struct microcode_intel *mc_intel; + struct microcode_intel *mc; unsigned int val[2]; - mc_intel = uci->mc; - if (mc_intel == NULL) + mc = uci->mc; + if (!mc) return 0; /* write microcode via MSR 0x79 */ - native_wrmsr(MSR_IA32_UCODE_WRITE, - (unsigned long) mc_intel->bits, - (unsigned long) mc_intel->bits >> 16 >> 16); - native_wrmsr(MSR_IA32_UCODE_REV, 0, 0); + native_wrmsrl(MSR_IA32_UCODE_WRITE, (unsigned long)mc->bits); + native_wrmsrl(MSR_IA32_UCODE_REV, 0); /* As documented in the SDM: Do a CPUID 1 here */ sync_core(); /* get the current revision from MSR 0x8B */ native_rdmsr(MSR_IA32_UCODE_REV, val[0], val[1]); - if (val[1] != mc_intel->hdr.rev) + if (val[1] != mc->hdr.rev) return -1; #ifdef CONFIG_X86_64 @@ -672,25 +669,26 @@ static int apply_microcode_early(struct 
ucode_cpu_info *uci, bool early) if (early) print_ucode(uci); else - print_ucode_info(uci, mc_intel->hdr.date); + print_ucode_info(uci, mc->hdr.date); return 0; } /* * This function converts microcode patch offsets previously stored in - * mc_saved_in_initrd to pointers and stores the pointers in mc_saved_data. + * mc_tmp_ptrs to pointers and stores the pointers in mc_saved_data. */ int __init save_microcode_in_initrd_intel(void) { - unsigned int count = mc_saved_data.mc_saved_count; + unsigned int count = mc_saved_data.num_saved; struct microcode_intel *mc_saved[MAX_UCODE_COUNT]; int ret = 0; - if (count == 0) + if (!count) return ret; - copy_initrd_ptrs(mc_saved, mc_saved_in_initrd, initrd_start, count); + copy_ptrs(mc_saved, mc_tmp_ptrs, get_initrd_start(), count); + ret = save_microcode(&mc_saved_data, mc_saved, count); if (ret) pr_err("Cannot save microcode patches from initrd.\n"); @@ -701,8 +699,7 @@ int __init save_microcode_in_initrd_intel(void) } static void __init -_load_ucode_intel_bsp(struct mc_saved_data *mc_saved_data, - unsigned long *initrd, +_load_ucode_intel_bsp(struct mc_saved_data *mcs, unsigned long *mc_ptrs, unsigned long start, unsigned long size) { struct ucode_cpu_info uci; @@ -710,11 +707,11 @@ _load_ucode_intel_bsp(struct mc_saved_data *mc_saved_data, collect_cpu_info_early(&uci); - ret = scan_microcode(mc_saved_data, initrd, start, size, &uci); + ret = scan_microcode(mcs, mc_ptrs, start, size, &uci); if (ret != UCODE_OK) return; - ret = load_microcode(mc_saved_data, initrd, start, &uci); + ret = load_microcode(mcs, mc_ptrs, start, &uci); if (ret != UCODE_OK) return; @@ -728,53 +725,49 @@ void __init load_ucode_intel_bsp(void) struct boot_params *p; p = (struct boot_params *)__pa_nodebug(&boot_params); - start = p->hdr.ramdisk_image; size = p->hdr.ramdisk_size; - _load_ucode_intel_bsp( - (struct mc_saved_data *)__pa_nodebug(&mc_saved_data), - (unsigned long *)__pa_nodebug(&mc_saved_in_initrd), - start, size); -#else - start = boot_params.hdr.ramdisk_image + PAGE_OFFSET; - size = boot_params.hdr.ramdisk_size; + /* + * Set start only if we have an initrd image. We cannot use initrd_start + * because it is not set that early yet. + */ + start = (size ? p->hdr.ramdisk_image : 0); - _load_ucode_intel_bsp(&mc_saved_data, mc_saved_in_initrd, start, size); + _load_ucode_intel_bsp((struct mc_saved_data *)__pa_nodebug(&mc_saved_data), + (unsigned long *)__pa_nodebug(&mc_tmp_ptrs), + start, size); +#else + size = boot_params.hdr.ramdisk_size; + start = (size ? 
boot_params.hdr.ramdisk_image + PAGE_OFFSET : 0); + + _load_ucode_intel_bsp(&mc_saved_data, mc_tmp_ptrs, start, size); #endif } void load_ucode_intel_ap(void) { - struct mc_saved_data *mc_saved_data_p; + unsigned long *mcs_tmp_p; + struct mc_saved_data *mcs_p; struct ucode_cpu_info uci; - unsigned long *mc_saved_in_initrd_p; - unsigned long initrd_start_addr; enum ucode_state ret; #ifdef CONFIG_X86_32 - unsigned long *initrd_start_p; - mc_saved_in_initrd_p = - (unsigned long *)__pa_nodebug(mc_saved_in_initrd); - mc_saved_data_p = (struct mc_saved_data *)__pa_nodebug(&mc_saved_data); - initrd_start_p = (unsigned long *)__pa_nodebug(&initrd_start); - initrd_start_addr = (unsigned long)__pa_nodebug(*initrd_start_p); + mcs_tmp_p = (unsigned long *)__pa_nodebug(mc_tmp_ptrs); + mcs_p = (struct mc_saved_data *)__pa_nodebug(&mc_saved_data); #else - mc_saved_data_p = &mc_saved_data; - mc_saved_in_initrd_p = mc_saved_in_initrd; - initrd_start_addr = initrd_start; + mcs_tmp_p = mc_tmp_ptrs; + mcs_p = &mc_saved_data; #endif /* * If there is no valid ucode previously saved in memory, no need to * update ucode on this AP. */ - if (mc_saved_data_p->mc_saved_count == 0) + if (!mcs_p->num_saved) return; collect_cpu_info_early(&uci); - ret = load_microcode(mc_saved_data_p, mc_saved_in_initrd_p, - initrd_start_addr, &uci); - + ret = load_microcode(mcs_p, mcs_tmp_p, get_initrd_start_addr(), &uci); if (ret != UCODE_OK) return; @@ -786,13 +779,13 @@ void reload_ucode_intel(void) struct ucode_cpu_info uci; enum ucode_state ret; - if (!mc_saved_data.mc_saved_count) + if (!mc_saved_data.num_saved) return; collect_cpu_info_early(&uci); ret = load_microcode_early(mc_saved_data.mc_saved, - mc_saved_data.mc_saved_count, &uci); + mc_saved_data.num_saved, &uci); if (ret != UCODE_OK) return; @@ -825,7 +818,7 @@ static int collect_cpu_info(int cpu_num, struct cpu_signature *csig) * return 0 - no update found * return 1 - found update */ -static int get_matching_mc(struct microcode_intel *mc_intel, int cpu) +static int get_matching_mc(struct microcode_intel *mc, int cpu) { struct cpu_signature cpu_sig; unsigned int csig, cpf, crev; @@ -836,39 +829,36 @@ static int get_matching_mc(struct microcode_intel *mc_intel, int cpu) cpf = cpu_sig.pf; crev = cpu_sig.rev; - return has_newer_microcode(mc_intel, csig, cpf, crev); + return has_newer_microcode(mc, csig, cpf, crev); } static int apply_microcode_intel(int cpu) { - struct microcode_intel *mc_intel; + struct microcode_intel *mc; struct ucode_cpu_info *uci; + struct cpuinfo_x86 *c; unsigned int val[2]; - int cpu_num = raw_smp_processor_id(); - struct cpuinfo_x86 *c = &cpu_data(cpu_num); - - uci = ucode_cpu_info + cpu; - mc_intel = uci->mc; /* We should bind the task to the CPU */ - BUG_ON(cpu_num != cpu); + if (WARN_ON(raw_smp_processor_id() != cpu)) + return -1; - if (mc_intel == NULL) + uci = ucode_cpu_info + cpu; + mc = uci->mc; + if (!mc) return 0; /* * Microcode on this CPU could be updated earlier. Only apply the - * microcode patch in mc_intel when it is newer than the one on this + * microcode patch in mc when it is newer than the one on this * CPU. 
*/ - if (get_matching_mc(mc_intel, cpu) == 0) + if (!get_matching_mc(mc, cpu)) return 0; /* write microcode via MSR 0x79 */ - wrmsr(MSR_IA32_UCODE_WRITE, - (unsigned long) mc_intel->bits, - (unsigned long) mc_intel->bits >> 16 >> 16); - wrmsr(MSR_IA32_UCODE_REV, 0, 0); + wrmsrl(MSR_IA32_UCODE_WRITE, (unsigned long)mc->bits); + wrmsrl(MSR_IA32_UCODE_REV, 0); /* As documented in the SDM: Do a CPUID 1 here */ sync_core(); @@ -876,16 +866,19 @@ static int apply_microcode_intel(int cpu) /* get the current revision from MSR 0x8B */ rdmsr(MSR_IA32_UCODE_REV, val[0], val[1]); - if (val[1] != mc_intel->hdr.rev) { + if (val[1] != mc->hdr.rev) { pr_err("CPU%d update to revision 0x%x failed\n", - cpu_num, mc_intel->hdr.rev); + cpu, mc->hdr.rev); return -1; } + pr_info("CPU%d updated to revision 0x%x, date = %04x-%02x-%02x\n", - cpu_num, val[1], - mc_intel->hdr.date & 0xffff, - mc_intel->hdr.date >> 24, - (mc_intel->hdr.date >> 16) & 0xff); + cpu, val[1], + mc->hdr.date & 0xffff, + mc->hdr.date >> 24, + (mc->hdr.date >> 16) & 0xff); + + c = &cpu_data(cpu); uci->cpu_sig.rev = val[1]; c->microcode = val[1]; diff --git a/arch/x86/kernel/cpu/microcode/intel_lib.c b/arch/x86/kernel/cpu/microcode/intel_lib.c index b96896bcbdaf..2ce1a7dc45b7 100644 --- a/arch/x86/kernel/cpu/microcode/intel_lib.c +++ b/arch/x86/kernel/cpu/microcode/intel_lib.c @@ -49,7 +49,7 @@ int microcode_sanity_check(void *mc, int print_err) unsigned long total_size, data_size, ext_table_size; struct microcode_header_intel *mc_header = mc; struct extended_sigtable *ext_header = NULL; - int sum, orig_sum, ext_sigcount = 0, i; + u32 sum, orig_sum, ext_sigcount = 0, i; struct extended_signature *ext_sig; total_size = get_totalsize(mc_header); @@ -57,69 +57,85 @@ int microcode_sanity_check(void *mc, int print_err) if (data_size + MC_HEADER_SIZE > total_size) { if (print_err) - pr_err("error! Bad data size in microcode data file\n"); + pr_err("Error: bad microcode data file size.\n"); return -EINVAL; } if (mc_header->ldrver != 1 || mc_header->hdrver != 1) { if (print_err) - pr_err("error! Unknown microcode update format\n"); + pr_err("Error: invalid/unknown microcode update format.\n"); return -EINVAL; } + ext_table_size = total_size - (MC_HEADER_SIZE + data_size); if (ext_table_size) { + u32 ext_table_sum = 0; + u32 *ext_tablep; + if ((ext_table_size < EXT_HEADER_SIZE) || ((ext_table_size - EXT_HEADER_SIZE) % EXT_SIGNATURE_SIZE)) { if (print_err) - pr_err("error! Small exttable size in microcode data file\n"); + pr_err("Error: truncated extended signature table.\n"); return -EINVAL; } + ext_header = mc + MC_HEADER_SIZE + data_size; if (ext_table_size != exttable_size(ext_header)) { if (print_err) - pr_err("error! Bad exttable size in microcode data file\n"); + pr_err("Error: extended signature table size mismatch.\n"); return -EFAULT; } + ext_sigcount = ext_header->count; - } - /* check extended table checksum */ - if (ext_table_size) { - int ext_table_sum = 0; - int *ext_tablep = (int *)ext_header; + /* + * Check extended table checksum: the sum of all dwords that + * comprise a valid table must be 0. + */ + ext_tablep = (u32 *)ext_header; - i = ext_table_size / DWSIZE; + i = ext_table_size / sizeof(u32); while (i--) ext_table_sum += ext_tablep[i]; + if (ext_table_sum) { if (print_err) - pr_warn("aborting, bad extended signature table checksum\n"); + pr_warn("Bad extended signature table checksum, aborting.\n"); return -EINVAL; } } - /* calculate the checksum */ + /* + * Calculate the checksum of update data and header. 
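As a quick aside on the rule the microcode_sanity_check() hunk above documents: the cksum word of a well-formed Intel update is chosen so that the u32 sum over header plus data wraps to exactly zero, and each extended signature carries its own cksum so that swapping its sig/pf/cksum triplet into that sum keeps it zero. The following is a minimal user-space C sketch of the zero-sum rule only, with made-up blob contents, not kernel code:

/* Standalone sketch: pick a checksum word so the whole buffer sums to 0. */
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

static uint32_t dword_sum(const uint32_t *p, size_t n)
{
	uint32_t sum = 0;

	while (n--)
		sum += p[n];	/* unsigned wraparound is the point */
	return sum;
}

int main(void)
{
	uint32_t blob[8] = { 0x00000001, 0x22222222, 0x33333333, 0x44444444,
			     0xdeadbeef, 0x01020304, 0x0a0b0c0d, 0 };

	blob[7] = 0U - dword_sum(blob, 7);	/* the "cksum" word */

	printf("cksum word: 0x%08x\n", (unsigned int)blob[7]);
	printf("total sum : 0x%08x (valid iff 0)\n",
	       (unsigned int)dword_sum(blob, 8));
	return 0;
}

Because each extended signature only swaps the sig/pf/cksum triplet into an otherwise unchanged (and already zero-sum) image, the per-signature check in the hunk reduces to comparing the header triplet sum against the extended triplet sum, which is exactly what the rewritten sum expression does.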
The checksum of + * valid update data and header including the extended signature table + * must be 0. + */ orig_sum = 0; - i = (MC_HEADER_SIZE + data_size) / DWSIZE; + i = (MC_HEADER_SIZE + data_size) / sizeof(u32); while (i--) - orig_sum += ((int *)mc)[i]; + orig_sum += ((u32 *)mc)[i]; + if (orig_sum) { if (print_err) - pr_err("aborting, bad checksum\n"); + pr_err("Bad microcode data checksum, aborting.\n"); return -EINVAL; } + if (!ext_table_size) return 0; - /* check extended signature checksum */ + + /* + * Check extended signature checksum: 0 => valid. + */ for (i = 0; i < ext_sigcount; i++) { ext_sig = (void *)ext_header + EXT_HEADER_SIZE + EXT_SIGNATURE_SIZE * i; - sum = orig_sum - - (mc_header->sig + mc_header->pf + mc_header->cksum) - + (ext_sig->sig + ext_sig->pf + ext_sig->cksum); + + sum = (mc_header->sig + mc_header->pf + mc_header->cksum) - + (ext_sig->sig + ext_sig->pf + ext_sig->cksum); if (sum) { if (print_err) - pr_err("aborting, bad checksum\n"); + pr_err("Bad extended signature checksum, aborting.\n"); return -EINVAL; } } diff --git a/arch/x86/kernel/cpu/mkcapflags.sh b/arch/x86/kernel/cpu/mkcapflags.sh index 3f20710a5b23..6988c74409a8 100644 --- a/arch/x86/kernel/cpu/mkcapflags.sh +++ b/arch/x86/kernel/cpu/mkcapflags.sh @@ -1,6 +1,6 @@ #!/bin/sh # -# Generate the x86_cap/bug_flags[] arrays from include/asm/cpufeature.h +# Generate the x86_cap/bug_flags[] arrays from include/asm/cpufeatures.h # IN=$1 @@ -49,8 +49,8 @@ dump_array() trap 'rm "$OUT"' EXIT ( - echo "#ifndef _ASM_X86_CPUFEATURE_H" - echo "#include " + echo "#ifndef _ASM_X86_CPUFEATURES_H" + echo "#include " echo "#endif" echo "" diff --git a/arch/x86/kernel/cpu/mtrr/main.c b/arch/x86/kernel/cpu/mtrr/main.c index ba80d68f683e..10f8d4796240 100644 --- a/arch/x86/kernel/cpu/mtrr/main.c +++ b/arch/x86/kernel/cpu/mtrr/main.c @@ -47,7 +47,7 @@ #include #include -#include +#include #include #include #include diff --git a/arch/x86/kernel/cpu/transmeta.c b/arch/x86/kernel/cpu/transmeta.c index e3b4d1841175..34178564be2a 100644 --- a/arch/x86/kernel/cpu/transmeta.c +++ b/arch/x86/kernel/cpu/transmeta.c @@ -1,6 +1,6 @@ #include #include -#include +#include #include #include "cpu.h" diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c index 837365f10912..621b501f8935 100644 --- a/arch/x86/kernel/e820.c +++ b/arch/x86/kernel/e820.c @@ -24,6 +24,7 @@ #include #include #include +#include /* * The e820 map is the map that gets modified e.g. with command line parameters diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c index d5804adfa6da..0b1b9abd4d5f 100644 --- a/arch/x86/kernel/fpu/core.c +++ b/arch/x86/kernel/fpu/core.c @@ -114,6 +114,10 @@ void __kernel_fpu_begin(void) kernel_fpu_disable(); if (fpu->fpregs_active) { + /* + * Ignore return value -- we don't care if reg state + * is clobbered. + */ copy_fpregs_to_fpstate(fpu); } else { this_cpu_write(fpu_fpregs_owner_ctx, NULL); @@ -189,8 +193,12 @@ void fpu__save(struct fpu *fpu) preempt_disable(); if (fpu->fpregs_active) { - if (!copy_fpregs_to_fpstate(fpu)) - fpregs_deactivate(fpu); + if (!copy_fpregs_to_fpstate(fpu)) { + if (use_eager_fpu()) + copy_kernel_to_fpregs(&fpu->state); + else + fpregs_deactivate(fpu); + } } preempt_enable(); } @@ -223,14 +231,15 @@ void fpstate_init(union fpregs_state *state) } EXPORT_SYMBOL_GPL(fpstate_init); -/* - * Copy the current task's FPU state to a new task's FPU context. - * - * In both the 'eager' and the 'lazy' case we save hardware registers - * directly to the destination buffer. 
- */ -static void fpu_copy(struct fpu *dst_fpu, struct fpu *src_fpu) +int fpu__copy(struct fpu *dst_fpu, struct fpu *src_fpu) { + dst_fpu->counter = 0; + dst_fpu->fpregs_active = 0; + dst_fpu->last_cpu = -1; + + if (!src_fpu->fpstate_active || !cpu_has_fpu) + return 0; + WARN_ON_FPU(src_fpu != ¤t->thread.fpu); /* @@ -243,10 +252,9 @@ static void fpu_copy(struct fpu *dst_fpu, struct fpu *src_fpu) /* * Save current FPU registers directly into the child * FPU context, without any memory-to-memory copying. - * - * If the FPU context got destroyed in the process (FNSAVE - * done on old CPUs) then copy it back into the source - * context and mark the current task for lazy restore. + * In lazy mode, if the FPU context isn't loaded into + * fpregs, CR0.TS will be set and do_device_not_available + * will load the FPU context. * * We have to do all this with preemption disabled, * mostly because of the FNSAVE case, because in that @@ -259,19 +267,13 @@ static void fpu_copy(struct fpu *dst_fpu, struct fpu *src_fpu) preempt_disable(); if (!copy_fpregs_to_fpstate(dst_fpu)) { memcpy(&src_fpu->state, &dst_fpu->state, xstate_size); - fpregs_deactivate(src_fpu); + + if (use_eager_fpu()) + copy_kernel_to_fpregs(&src_fpu->state); + else + fpregs_deactivate(src_fpu); } preempt_enable(); -} - -int fpu__copy(struct fpu *dst_fpu, struct fpu *src_fpu) -{ - dst_fpu->counter = 0; - dst_fpu->fpregs_active = 0; - dst_fpu->last_cpu = -1; - - if (src_fpu->fpstate_active && cpu_has_fpu) - fpu_copy(dst_fpu, src_fpu); return 0; } @@ -425,7 +427,7 @@ void fpu__clear(struct fpu *fpu) { WARN_ON_FPU(fpu != ¤t->thread.fpu); /* Almost certainly an anomaly */ - if (!use_eager_fpu()) { + if (!use_eager_fpu() || !static_cpu_has(X86_FEATURE_FPU)) { /* FPU state will be reallocated lazily at the first use. */ fpu__drop(fpu); } else { diff --git a/arch/x86/kernel/fpu/init.c b/arch/x86/kernel/fpu/init.c index bd08fb77073d..54c86fffbf9f 100644 --- a/arch/x86/kernel/fpu/init.c +++ b/arch/x86/kernel/fpu/init.c @@ -262,7 +262,10 @@ static void __init fpu__init_system_xstate_size_legacy(void) * not only saved the restores along the way, but we also have the * FPU ready to be used for the original task. * - * 'eager' switching is used on modern CPUs, there we switch the FPU + * 'lazy' is deprecated because it's almost never a performance win + * and it's much more complicated than 'eager'. + * + * 'eager' switching is by default on all CPUs, there we switch the FPU * state during every context switch, regardless of whether the task * has used FPU instructions in that time slice or not. This is done * because modern FPU context saving instructions are able to optimize @@ -273,7 +276,7 @@ static void __init fpu__init_system_xstate_size_legacy(void) * to use 'eager' restores, if we detect that a task is using the FPU * frequently. See the fpu->counter logic in fpu/internal.h for that. ] */ -static enum { AUTO, ENABLE, DISABLE } eagerfpu = AUTO; +static enum { ENABLE, DISABLE } eagerfpu = ENABLE; /* * Find supported xfeatures based on cpu features and command-line input. @@ -344,15 +347,9 @@ static void __init fpu__init_system_ctx_switch(void) */ static void __init fpu__init_parse_early_param(void) { - /* - * No need to check "eagerfpu=auto" again, since it is the - * initial default. 
- */ if (cmdline_find_option_bool(boot_command_line, "eagerfpu=off")) { eagerfpu = DISABLE; fpu__clear_eager_fpu_features(); - } else if (cmdline_find_option_bool(boot_command_line, "eagerfpu=on")) { - eagerfpu = ENABLE; } if (cmdline_find_option_bool(boot_command_line, "no387")) diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c index d425cda5ae6d..6e8354f5a593 100644 --- a/arch/x86/kernel/fpu/xstate.c +++ b/arch/x86/kernel/fpu/xstate.c @@ -51,6 +51,9 @@ void fpu__xstate_clear_all_cpu_caps(void) setup_clear_cpu_cap(X86_FEATURE_AVX512PF); setup_clear_cpu_cap(X86_FEATURE_AVX512ER); setup_clear_cpu_cap(X86_FEATURE_AVX512CD); + setup_clear_cpu_cap(X86_FEATURE_AVX512DQ); + setup_clear_cpu_cap(X86_FEATURE_AVX512BW); + setup_clear_cpu_cap(X86_FEATURE_AVX512VL); setup_clear_cpu_cap(X86_FEATURE_MPX); setup_clear_cpu_cap(X86_FEATURE_XGETBV1); } diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c index 05c9e3f5b6d7..702547ce33c9 100644 --- a/arch/x86/kernel/ftrace.c +++ b/arch/x86/kernel/ftrace.c @@ -697,9 +697,8 @@ static inline void tramp_free(void *tramp) { } #endif /* Defined as markers to the end of the ftrace default trampolines */ -extern void ftrace_caller_end(void); extern void ftrace_regs_caller_end(void); -extern void ftrace_return(void); +extern void ftrace_epilogue(void); extern void ftrace_caller_op_ptr(void); extern void ftrace_regs_caller_op_ptr(void); @@ -746,7 +745,7 @@ create_trampoline(struct ftrace_ops *ops, unsigned int *tramp_size) op_offset = (unsigned long)ftrace_regs_caller_op_ptr; } else { start_offset = (unsigned long)ftrace_caller; - end_offset = (unsigned long)ftrace_caller_end; + end_offset = (unsigned long)ftrace_epilogue; op_offset = (unsigned long)ftrace_caller_op_ptr; } @@ -754,7 +753,7 @@ create_trampoline(struct ftrace_ops *ops, unsigned int *tramp_size) /* * Allocate enough size to store the ftrace_caller code, - * the jmp to ftrace_return, as well as the address of + * the jmp to ftrace_epilogue, as well as the address of * the ftrace_ops this trampoline is used for. 
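For readers less familiar with the ftrace trampoline code being renamed here: the allocation in the hunk that follows lays the trampoline out as the copied ftrace_caller..ftrace_epilogue body, then a jmp back to ftrace_epilogue, then (as implied by the allocation size) the ftrace_ops pointer. A rough user-space sketch of that size/offset arithmetic; the symbol distances are made-up assumptions, only the MCOUNT_INSN_SIZE value of 5 reflects x86:

/*
 * Layout sketch (illustrative only):
 *
 *   trampoline: [ ftrace_caller .. ftrace_epilogue copy | jmp ftrace_epilogue | ftrace_ops * ]
 */
#include <stdio.h>

#define MCOUNT_INSN_SIZE 5		/* x86 rel32 call/jmp */

int main(void)
{
	unsigned long start_offset = 0x1000;	/* ftrace_caller   (assumed) */
	unsigned long end_offset   = 0x10a0;	/* ftrace_epilogue (assumed) */
	unsigned long size = end_offset - start_offset;

	printf("copied body : %lu bytes\n", size);
	printf("jmp insn at : trampoline + %lu\n", size);
	printf("ops ptr at  : trampoline + %lu\n", size + MCOUNT_INSN_SIZE);
	printf("allocation  : %lu bytes\n",
	       size + MCOUNT_INSN_SIZE + sizeof(void *));
	return 0;
}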
*/ trampoline = alloc_tramp(size + MCOUNT_INSN_SIZE + sizeof(void *)); @@ -772,8 +771,8 @@ create_trampoline(struct ftrace_ops *ops, unsigned int *tramp_size) ip = (unsigned long)trampoline + size; - /* The trampoline ends with a jmp to ftrace_return */ - jmp = ftrace_jmp_replace(ip, (unsigned long)ftrace_return); + /* The trampoline ends with a jmp to ftrace_epilogue */ + jmp = ftrace_jmp_replace(ip, (unsigned long)ftrace_epilogue); memcpy(trampoline + size, jmp, MCOUNT_INSN_SIZE); /* diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c index 2c0f3407bd1f..1f4422d5c8d0 100644 --- a/arch/x86/kernel/head64.c +++ b/arch/x86/kernel/head64.c @@ -40,13 +40,8 @@ pmdval_t early_pmd_flags = __PAGE_KERNEL_LARGE & ~(_PAGE_GLOBAL | _PAGE_NX); /* Wipe all early page tables except for the kernel symbol map */ static void __init reset_early_page_tables(void) { - unsigned long i; - - for (i = 0; i < PTRS_PER_PGD-1; i++) - early_level4_pgt[i].pgd = 0; - + memset(early_level4_pgt, 0, sizeof(pgd_t)*(PTRS_PER_PGD-1)); next_early_pgt = 0; - write_cr3(__pa_nodebug(early_level4_pgt)); } @@ -54,7 +49,6 @@ static void __init reset_early_page_tables(void) int __init early_make_pgtable(unsigned long address) { unsigned long physaddr = address - __PAGE_OFFSET; - unsigned long i; pgdval_t pgd, *pgd_p; pudval_t pud, *pud_p; pmdval_t pmd, *pmd_p; @@ -81,8 +75,7 @@ again: } pud_p = (pudval_t *)early_dynamic_pgts[next_early_pgt++]; - for (i = 0; i < PTRS_PER_PUD; i++) - pud_p[i] = 0; + memset(pud_p, 0, sizeof(*pud_p) * PTRS_PER_PUD); *pgd_p = (pgdval_t)pud_p - __START_KERNEL_map + phys_base + _KERNPG_TABLE; } pud_p += pud_index(address); @@ -97,8 +90,7 @@ again: } pmd_p = (pmdval_t *)early_dynamic_pgts[next_early_pgt++]; - for (i = 0; i < PTRS_PER_PMD; i++) - pmd_p[i] = 0; + memset(pmd_p, 0, sizeof(*pmd_p) * PTRS_PER_PMD); *pud_p = (pudval_t)pmd_p - __START_KERNEL_map + phys_base + _KERNPG_TABLE; } pmd = (physaddr & PMD_MASK) + early_pmd_flags; diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S index 6bc9ae24b6d2..54cdbd2003fe 100644 --- a/arch/x86/kernel/head_32.S +++ b/arch/x86/kernel/head_32.S @@ -19,7 +19,7 @@ #include #include #include -#include +#include #include #include #include @@ -389,6 +389,12 @@ default_entry: /* Make changes effective */ wrmsr + /* + * And make sure that all the mappings we set up have NX set from + * the beginning. + */ + orl $(1 << (_PAGE_BIT_NX - 32)), pa(__supported_pte_mask + 4) + enable_paging: /* diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S index ffdc0e860390..22fbf9df61bb 100644 --- a/arch/x86/kernel/head_64.S +++ b/arch/x86/kernel/head_64.S @@ -38,7 +38,6 @@ #define pud_index(x) (((x) >> PUD_SHIFT) & (PTRS_PER_PUD-1)) L4_PAGE_OFFSET = pgd_index(__PAGE_OFFSET) -L3_PAGE_OFFSET = pud_index(__PAGE_OFFSET) L4_START_KERNEL = pgd_index(__START_KERNEL_map) L3_START_KERNEL = pud_index(__START_KERNEL_map) @@ -76,9 +75,7 @@ startup_64: subq $_text - __START_KERNEL_map, %rbp /* Is the address not 2M aligned? 
*/ - movq %rbp, %rax - andl $~PMD_PAGE_MASK, %eax - testl %eax, %eax + testl $~PMD_PAGE_MASK, %ebp jnz bad_address /* diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c index b8e6ff5cd5d0..be0ebbb6d1d1 100644 --- a/arch/x86/kernel/hpet.c +++ b/arch/x86/kernel/hpet.c @@ -12,6 +12,7 @@ #include #include +#include #include #include #include diff --git a/arch/x86/kernel/mcount_64.S b/arch/x86/kernel/mcount_64.S index 87e1762e2bca..ed48a9f465f8 100644 --- a/arch/x86/kernel/mcount_64.S +++ b/arch/x86/kernel/mcount_64.S @@ -168,12 +168,14 @@ GLOBAL(ftrace_call) restore_mcount_regs /* - * The copied trampoline must call ftrace_return as it + * The copied trampoline must call ftrace_epilogue as it * still may need to call the function graph tracer. + * + * The code up to this label is copied into trampolines so + * think twice before adding any new code or changing the + * layout here. */ -GLOBAL(ftrace_caller_end) - -GLOBAL(ftrace_return) +GLOBAL(ftrace_epilogue) #ifdef CONFIG_FUNCTION_GRAPH_TRACER GLOBAL(ftrace_graph_call) @@ -244,14 +246,14 @@ GLOBAL(ftrace_regs_call) popfq /* - * As this jmp to ftrace_return can be a short jump + * As this jmp to ftrace_epilogue can be a short jump * it must not be copied into the trampoline. * The trampoline will add the code to jump * to the return. */ GLOBAL(ftrace_regs_caller_end) - jmp ftrace_return + jmp ftrace_epilogue END(ftrace_regs_caller) diff --git a/arch/x86/kernel/msr.c b/arch/x86/kernel/msr.c index 64f9616f93f1..7f3550acde1b 100644 --- a/arch/x86/kernel/msr.c +++ b/arch/x86/kernel/msr.c @@ -40,7 +40,7 @@ #include #include -#include +#include #include static struct class *msr_class; diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index 9decee2bfdbe..2915d54e9dd5 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -57,6 +57,9 @@ __visible DEFINE_PER_CPU_SHARED_ALIGNED(struct tss_struct, cpu_tss) = { */ .io_bitmap = { [0 ... IO_BITMAP_LONGS] = ~0 }, #endif +#ifdef CONFIG_X86_32 + .SYSENTER_stack_canary = STACK_END_MAGIC, +#endif }; EXPORT_PER_CPU_SYMBOL(cpu_tss); diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c index cb6282c3638f..548ddf7d6fd2 100644 --- a/arch/x86/kernel/signal.c +++ b/arch/x86/kernel/signal.c @@ -61,7 +61,38 @@ regs->seg = GET_SEG(seg) | 3; \ } while (0) -int restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *sc) +#ifdef CONFIG_X86_64 +/* + * If regs->ss will cause an IRET fault, change it. Otherwise leave it + * alone. Using this generally makes no sense unless + * user_64bit_mode(regs) would return true. + */ +static void force_valid_ss(struct pt_regs *regs) +{ + u32 ar; + asm volatile ("lar %[old_ss], %[ar]\n\t" + "jz 1f\n\t" /* If invalid: */ + "xorl %[ar], %[ar]\n\t" /* set ar = 0 */ + "1:" + : [ar] "=r" (ar) + : [old_ss] "rm" ((u16)regs->ss)); + + /* + * For a valid 64-bit user context, we need DPL 3, type + * read-write data or read-write exp-down data, and S and P + * set. We can't use VERW because VERW doesn't check the + * P bit. 
+ */ + ar &= AR_DPL_MASK | AR_S | AR_P | AR_TYPE_MASK; + if (ar != (AR_DPL3 | AR_S | AR_P | AR_TYPE_RWDATA) && + ar != (AR_DPL3 | AR_S | AR_P | AR_TYPE_RWDATA_EXPDOWN)) + regs->ss = __USER_DS; +} +#endif + +static int restore_sigcontext(struct pt_regs *regs, + struct sigcontext __user *sc, + unsigned long uc_flags) { unsigned long buf_val; void __user *buf; @@ -94,15 +125,18 @@ int restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *sc) COPY(r15); #endif /* CONFIG_X86_64 */ -#ifdef CONFIG_X86_32 COPY_SEG_CPL3(cs); COPY_SEG_CPL3(ss); -#else /* !CONFIG_X86_32 */ - /* Kernel saves and restores only the CS segment register on signals, - * which is the bare minimum needed to allow mixed 32/64-bit code. - * App's signal handler can save/restore other segments if needed. */ - COPY_SEG_CPL3(cs); -#endif /* CONFIG_X86_32 */ + +#ifdef CONFIG_X86_64 + /* + * Fix up SS if needed for the benefit of old DOSEMU and + * CRIU. + */ + if (unlikely(!(uc_flags & UC_STRICT_RESTORE_SS) && + user_64bit_mode(regs))) + force_valid_ss(regs); +#endif get_user_ex(tmpflags, &sc->flags); regs->flags = (regs->flags & ~FIX_EFLAGS) | (tmpflags & FIX_EFLAGS); @@ -165,6 +199,7 @@ int setup_sigcontext(struct sigcontext __user *sc, void __user *fpstate, put_user_ex(regs->cs, &sc->cs); put_user_ex(0, &sc->gs); put_user_ex(0, &sc->fs); + put_user_ex(regs->ss, &sc->ss); #endif /* CONFIG_X86_32 */ put_user_ex(fpstate, &sc->fpstate); @@ -403,6 +438,21 @@ static int __setup_rt_frame(int sig, struct ksignal *ksig, return 0; } #else /* !CONFIG_X86_32 */ +static unsigned long frame_uc_flags(struct pt_regs *regs) +{ + unsigned long flags; + + if (cpu_has_xsave) + flags = UC_FP_XSTATE | UC_SIGCONTEXT_SS; + else + flags = UC_SIGCONTEXT_SS; + + if (likely(user_64bit_mode(regs))) + flags |= UC_STRICT_RESTORE_SS; + + return flags; +} + static int __setup_rt_frame(int sig, struct ksignal *ksig, sigset_t *set, struct pt_regs *regs) { @@ -422,10 +472,7 @@ static int __setup_rt_frame(int sig, struct ksignal *ksig, put_user_try { /* Create the ucontext. */ - if (cpu_has_xsave) - put_user_ex(UC_FP_XSTATE, &frame->uc.uc_flags); - else - put_user_ex(0, &frame->uc.uc_flags); + put_user_ex(frame_uc_flags(regs), &frame->uc.uc_flags); put_user_ex(0, &frame->uc.uc_link); save_altstack_ex(&frame->uc.uc_stack, regs->sp); @@ -459,10 +506,28 @@ static int __setup_rt_frame(int sig, struct ksignal *ksig, regs->sp = (unsigned long)frame; - /* Set up the CS register to run signal handlers in 64-bit mode, - even if the handler happens to be interrupting 32-bit code. */ + /* + * Set up the CS and SS registers to run signal handlers in + * 64-bit mode, even if the handler happens to be interrupting + * 32-bit or 16-bit code. + * + * SS is subtle. In 64-bit mode, we don't need any particular + * SS descriptor, but we do need SS to be valid. It's possible + * that the old SS is entirely bogus -- this can happen if the + * signal we're trying to deliver is #GP or #SS caused by a bad + * SS value. We also have a compatbility issue here: DOSEMU + * relies on the contents of the SS register indicating the + * SS value at the time of the signal, even though that code in + * DOSEMU predates sigreturn's ability to restore SS. (DOSEMU + * avoids relying on sigreturn to restore SS; instead it uses + * a trampoline.) So we do our best: if the old SS was valid, + * we keep it. Otherwise we replace it. 
+ */ regs->cs = __USER_CS; + if (unlikely(regs->ss != __USER_DS)) + force_valid_ss(regs); + return 0; } #endif /* CONFIG_X86_32 */ @@ -489,10 +554,7 @@ static int x32_setup_rt_frame(struct ksignal *ksig, put_user_try { /* Create the ucontext. */ - if (cpu_has_xsave) - put_user_ex(UC_FP_XSTATE, &frame->uc.uc_flags); - else - put_user_ex(0, &frame->uc.uc_flags); + put_user_ex(frame_uc_flags(regs), &frame->uc.uc_flags); put_user_ex(0, &frame->uc.uc_link); compat_save_altstack_ex(&frame->uc.uc_stack, regs->sp); put_user_ex(0, &frame->uc.uc__pad0); @@ -554,7 +616,11 @@ asmlinkage unsigned long sys_sigreturn(void) set_current_blocked(&set); - if (restore_sigcontext(regs, &frame->sc)) + /* + * x86_32 has no uc_flags bits relevant to restore_sigcontext. + * Save a few cycles by skipping the __get_user. + */ + if (restore_sigcontext(regs, &frame->sc, 0)) goto badframe; return regs->ax; @@ -570,16 +636,19 @@ asmlinkage long sys_rt_sigreturn(void) struct pt_regs *regs = current_pt_regs(); struct rt_sigframe __user *frame; sigset_t set; + unsigned long uc_flags; frame = (struct rt_sigframe __user *)(regs->sp - sizeof(long)); if (!access_ok(VERIFY_READ, frame, sizeof(*frame))) goto badframe; if (__copy_from_user(&set, &frame->uc.uc_sigmask, sizeof(set))) goto badframe; + if (__get_user(uc_flags, &frame->uc.uc_flags)) + goto badframe; set_current_blocked(&set); - if (restore_sigcontext(regs, &frame->uc.uc_mcontext)) + if (restore_sigcontext(regs, &frame->uc.uc_mcontext, uc_flags)) goto badframe; if (restore_altstack(&frame->uc.uc_stack)) @@ -692,12 +761,15 @@ handle_signal(struct ksignal *ksig, struct pt_regs *regs) static inline unsigned long get_nr_restart_syscall(const struct pt_regs *regs) { -#if defined(CONFIG_X86_32) || !defined(CONFIG_X86_64) +#ifdef CONFIG_X86_64 + if (is_ia32_task()) + return __NR_ia32_restart_syscall; +#endif +#ifdef CONFIG_X86_X32_ABI + return __NR_restart_syscall | (regs->orig_ax & __X32_SYSCALL_BIT); +#else return __NR_restart_syscall; -#else /* !CONFIG_X86_32 && CONFIG_X86_64 */ - return test_thread_flag(TIF_IA32) ? 
__NR_ia32_restart_syscall : - __NR_restart_syscall | (regs->orig_ax & __X32_SYSCALL_BIT); -#endif /* CONFIG_X86_32 || !CONFIG_X86_64 */ +#endif } /* @@ -763,6 +835,7 @@ asmlinkage long sys32_x32_rt_sigreturn(void) struct pt_regs *regs = current_pt_regs(); struct rt_sigframe_x32 __user *frame; sigset_t set; + unsigned long uc_flags; frame = (struct rt_sigframe_x32 __user *)(regs->sp - 8); @@ -770,10 +843,12 @@ asmlinkage long sys32_x32_rt_sigreturn(void) goto badframe; if (__copy_from_user(&set, &frame->uc.uc_sigmask, sizeof(set))) goto badframe; + if (__get_user(uc_flags, &frame->uc.uc_flags)) + goto badframe; set_current_blocked(&set); - if (restore_sigcontext(regs, &frame->uc.uc_mcontext)) + if (restore_sigcontext(regs, &frame->uc.uc_mcontext, uc_flags)) goto badframe; if (compat_restore_altstack(&frame->uc.uc_stack)) diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 3bf1e0b5f827..643dbdccf4bc 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -256,7 +256,7 @@ static void notrace start_secondary(void *unused) x86_cpuinit.setup_percpu_clockev(); wmb(); - cpu_startup_entry(CPUHP_ONLINE); + cpu_startup_entry(CPUHP_AP_ONLINE_IDLE); } int topology_update_package_map(unsigned int apicid, unsigned int cpu) diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index 211c11c7bba4..06cbe25861f1 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -83,32 +83,18 @@ gate_desc idt_table[NR_VECTORS] __page_aligned_bss; DECLARE_BITMAP(used_vectors, NR_VECTORS); EXPORT_SYMBOL_GPL(used_vectors); -static inline void conditional_sti(struct pt_regs *regs) +static inline void cond_local_irq_enable(struct pt_regs *regs) { if (regs->flags & X86_EFLAGS_IF) local_irq_enable(); } -static inline void preempt_conditional_sti(struct pt_regs *regs) -{ - preempt_count_inc(); - if (regs->flags & X86_EFLAGS_IF) - local_irq_enable(); -} - -static inline void conditional_cli(struct pt_regs *regs) +static inline void cond_local_irq_disable(struct pt_regs *regs) { if (regs->flags & X86_EFLAGS_IF) local_irq_disable(); } -static inline void preempt_conditional_cli(struct pt_regs *regs) -{ - if (regs->flags & X86_EFLAGS_IF) - local_irq_disable(); - preempt_count_dec(); -} - void ist_enter(struct pt_regs *regs) { if (user_mode(regs)) { @@ -262,7 +248,6 @@ do_trap(int trapnr, int signr, char *str, struct pt_regs *regs, tsk->thread.error_code = error_code; tsk->thread.trap_nr = trapnr; -#ifdef CONFIG_X86_64 if (show_unhandled_signals && unhandled_signal(tsk, signr) && printk_ratelimit()) { pr_info("%s[%d] trap %s ip:%lx sp:%lx error:%lx", @@ -271,7 +256,6 @@ do_trap(int trapnr, int signr, char *str, struct pt_regs *regs, print_vma_addr(" in ", regs->ip); pr_cont("\n"); } -#endif force_sig_info(signr, info ?: SEND_SIG_PRIV, tsk); } @@ -286,7 +270,7 @@ static void do_error_trap(struct pt_regs *regs, long error_code, char *str, if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) != NOTIFY_STOP) { - conditional_sti(regs); + cond_local_irq_enable(regs); do_trap(trapnr, signr, str, regs, error_code, fill_trap_info(regs, signr, trapnr, &info)); } @@ -368,7 +352,7 @@ dotraplinkage void do_bounds(struct pt_regs *regs, long error_code) if (notify_die(DIE_TRAP, "bounds", regs, error_code, X86_TRAP_BR, SIGSEGV) == NOTIFY_STOP) return; - conditional_sti(regs); + cond_local_irq_enable(regs); if (!user_mode(regs)) die("bounds", regs, error_code); @@ -443,7 +427,7 @@ do_general_protection(struct pt_regs *regs, long error_code) struct task_struct *tsk; 
RCU_LOCKDEP_WARN(!rcu_is_watching(), "entry code didn't wake RCU"); - conditional_sti(regs); + cond_local_irq_enable(regs); if (v8086_mode(regs)) { local_irq_enable(); @@ -517,9 +501,11 @@ dotraplinkage void notrace do_int3(struct pt_regs *regs, long error_code) * as we may switch to the interrupt stack. */ debug_stack_usage_inc(); - preempt_conditional_sti(regs); + preempt_disable(); + cond_local_irq_enable(regs); do_trap(X86_TRAP_BP, SIGTRAP, "int3", regs, error_code, NULL); - preempt_conditional_cli(regs); + cond_local_irq_disable(regs); + preempt_enable_no_resched(); debug_stack_usage_dec(); exit: ist_exit(regs); @@ -571,6 +557,29 @@ struct bad_iret_stack *fixup_bad_iret(struct bad_iret_stack *s) NOKPROBE_SYMBOL(fixup_bad_iret); #endif +static bool is_sysenter_singlestep(struct pt_regs *regs) +{ + /* + * We don't try for precision here. If we're anywhere in the region of + * code that can be single-stepped in the SYSENTER entry path, then + * assume that this is a useless single-step trap due to SYSENTER + * being invoked with TF set. (We don't know in advance exactly + * which instructions will be hit because BTF could plausibly + * be set.) + */ +#ifdef CONFIG_X86_32 + return (regs->ip - (unsigned long)__begin_SYSENTER_singlestep_region) < + (unsigned long)__end_SYSENTER_singlestep_region - + (unsigned long)__begin_SYSENTER_singlestep_region; +#elif defined(CONFIG_IA32_EMULATION) + return (regs->ip - (unsigned long)entry_SYSENTER_compat) < + (unsigned long)__end_entry_SYSENTER_compat - + (unsigned long)entry_SYSENTER_compat; +#else + return false; +#endif +} + /* * Our handling of the processor debug registers is non-trivial. * We do not clear them on entry and exit from the kernel. Therefore @@ -605,10 +614,41 @@ dotraplinkage void do_debug(struct pt_regs *regs, long error_code) ist_enter(regs); get_debugreg(dr6, 6); + /* + * The Intel SDM says: + * + * Certain debug exceptions may clear bits 0-3. The remaining + * contents of the DR6 register are never cleared by the + * processor. To avoid confusion in identifying debug + * exceptions, debug handlers should clear the register before + * returning to the interrupted task. + * + * Keep it simple: clear DR6 immediately. + */ + set_debugreg(0, 6); /* Filter out all the reserved bits which are preset to 1 */ dr6 &= ~DR6_RESERVED; + /* + * The SDM says "The processor clears the BTF flag when it + * generates a debug exception." Clear TIF_BLOCKSTEP to keep + * TIF_BLOCKSTEP in sync with the hardware BTF flag. + */ + clear_tsk_thread_flag(tsk, TIF_BLOCKSTEP); + + if (unlikely(!user_mode(regs) && (dr6 & DR_STEP) && + is_sysenter_singlestep(regs))) { + dr6 &= ~DR_STEP; + if (!dr6) + goto exit; + /* + * else we might have gotten a single-step trap and hit a + * watchpoint at the same time, in which case we should fall + * through and handle the watchpoint. + */ + } + /* * If dr6 has no reason to give us about the origin of this trap, * then it's very likely the result of an icebp/int01 trap. @@ -617,18 +657,10 @@ dotraplinkage void do_debug(struct pt_regs *regs, long error_code) if (!dr6 && user_mode(regs)) user_icebp = 1; - /* Catch kmemcheck conditions first of all! */ + /* Catch kmemcheck conditions! */ if ((dr6 & DR_STEP) && kmemcheck_trap(regs)) goto exit; - /* DR6 may or may not be cleared by the CPU */ - set_debugreg(0, 6); - - /* - * The processor cleared BTF, so don't mark that we need it set. 
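One detail of is_sysenter_singlestep() above worth spelling out: because the subtraction is done on unsigned values, the expression (ip - begin) < (end - begin) tests begin <= ip < end in a single compare; an ip below begin wraps around to a huge value and fails the test. A tiny user-space C sketch with made-up addresses:

/* Sketch of the single-compare range check (addresses are made up). */
#include <stdbool.h>
#include <stdio.h>

static bool in_region(unsigned long ip, unsigned long begin, unsigned long end)
{
	return (ip - begin) < (end - begin);
}

int main(void)
{
	unsigned long begin = 0x1000, end = 0x1080;

	printf("%d\n", in_region(0x1000, begin, end));	/* 1: first byte    */
	printf("%d\n", in_region(0x1040, begin, end));	/* 1: inside        */
	printf("%d\n", in_region(0x1080, begin, end));	/* 0: one past end  */
	printf("%d\n", in_region(0x0fff, begin, end));	/* 0: below, wraps  */
	return 0;
}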
- */ - clear_tsk_thread_flag(tsk, TIF_BLOCKSTEP); - /* Store the virtualized DR6 value */ tsk->thread.debugreg6 = dr6; @@ -648,24 +680,25 @@ dotraplinkage void do_debug(struct pt_regs *regs, long error_code) debug_stack_usage_inc(); /* It's safe to allow irq's after DR6 has been saved */ - preempt_conditional_sti(regs); + preempt_disable(); + cond_local_irq_enable(regs); if (v8086_mode(regs)) { handle_vm86_trap((struct kernel_vm86_regs *) regs, error_code, X86_TRAP_DB); - preempt_conditional_cli(regs); + cond_local_irq_disable(regs); + preempt_enable_no_resched(); debug_stack_usage_dec(); goto exit; } - /* - * Single-stepping through system calls: ignore any exceptions in - * kernel space, but re-enable TF when returning to user mode. - * - * We already checked v86 mode above, so we can check for kernel mode - * by just checking the CPL of CS. - */ - if ((dr6 & DR_STEP) && !user_mode(regs)) { + if (WARN_ON_ONCE((dr6 & DR_STEP) && !user_mode(regs))) { + /* + * Historical junk that used to handle SYSENTER single-stepping. + * This should be unreachable now. If we survive for a while + * without anyone hitting this warning, we'll turn this into + * an oops. + */ tsk->thread.debugreg6 &= ~DR_STEP; set_tsk_thread_flag(tsk, TIF_SINGLESTEP); regs->flags &= ~X86_EFLAGS_TF; @@ -673,10 +706,19 @@ dotraplinkage void do_debug(struct pt_regs *regs, long error_code) si_code = get_si_code(tsk->thread.debugreg6); if (tsk->thread.debugreg6 & (DR_STEP | DR_TRAP_BITS) || user_icebp) send_sigtrap(tsk, regs, error_code, si_code); - preempt_conditional_cli(regs); + cond_local_irq_disable(regs); + preempt_enable_no_resched(); debug_stack_usage_dec(); exit: +#if defined(CONFIG_X86_32) + /* + * This is the most likely code path that involves non-trivial use + * of the SYSENTER stack. Check that we haven't overrun it. 
+ */ + WARN(this_cpu_read(cpu_tss.SYSENTER_stack_canary) != STACK_END_MAGIC, + "Overran or corrupted SYSENTER stack\n"); +#endif ist_exit(regs); } NOKPROBE_SYMBOL(do_debug); @@ -696,7 +738,7 @@ static void math_error(struct pt_regs *regs, int error_code, int trapnr) if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, SIGFPE) == NOTIFY_STOP) return; - conditional_sti(regs); + cond_local_irq_enable(regs); if (!user_mode(regs)) { if (!fixup_exception(regs, trapnr)) { @@ -743,20 +785,19 @@ do_simd_coprocessor_error(struct pt_regs *regs, long error_code) dotraplinkage void do_spurious_interrupt_bug(struct pt_regs *regs, long error_code) { - conditional_sti(regs); + cond_local_irq_enable(regs); } dotraplinkage void do_device_not_available(struct pt_regs *regs, long error_code) { RCU_LOCKDEP_WARN(!rcu_is_watching(), "entry code didn't wake RCU"); - BUG_ON(use_eager_fpu()); #ifdef CONFIG_MATH_EMULATION - if (read_cr0() & X86_CR0_EM) { + if (!boot_cpu_has(X86_FEATURE_FPU) && (read_cr0() & X86_CR0_EM)) { struct math_emu_info info = { }; - conditional_sti(regs); + cond_local_irq_enable(regs); info.regs = regs; math_emulate(&info); @@ -765,7 +806,7 @@ do_device_not_available(struct pt_regs *regs, long error_code) #endif fpu__restore(¤t->thread.fpu); /* interrupts still off */ #ifdef CONFIG_X86_32 - conditional_sti(regs); + cond_local_irq_enable(regs); #endif } NOKPROBE_SYMBOL(do_device_not_available); @@ -868,7 +909,7 @@ void __init trap_init(void) #endif #ifdef CONFIG_X86_32 - set_system_trap_gate(IA32_SYSCALL_VECTOR, entry_INT80_32); + set_system_intr_gate(IA32_SYSCALL_VECTOR, entry_INT80_32); set_bit(IA32_SYSCALL_VECTOR, used_vectors); #endif diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c index 3d743da828d3..56380440d862 100644 --- a/arch/x86/kernel/tsc.c +++ b/arch/x86/kernel/tsc.c @@ -43,6 +43,11 @@ static DEFINE_STATIC_KEY_FALSE(__use_tsc); int tsc_clocksource_reliable; +static u32 art_to_tsc_numerator; +static u32 art_to_tsc_denominator; +static u64 art_to_tsc_offset; +struct clocksource *art_related_clocksource; + /* * Use a ring-buffer like data structure, where a writer advances the head by * writing a new data entry and a reader advances the tail when it observes a @@ -964,6 +969,37 @@ core_initcall(cpufreq_tsc); #endif /* CONFIG_CPU_FREQ */ +#define ART_CPUID_LEAF (0x15) +#define ART_MIN_DENOMINATOR (1) + + +/* + * If ART is present detect the numerator:denominator to convert to TSC + */ +static void detect_art(void) +{ + unsigned int unused[2]; + + if (boot_cpu_data.cpuid_level < ART_CPUID_LEAF) + return; + + cpuid(ART_CPUID_LEAF, &art_to_tsc_denominator, + &art_to_tsc_numerator, unused, unused+1); + + /* Don't enable ART in a VM, non-stop TSC required */ + if (boot_cpu_has(X86_FEATURE_HYPERVISOR) || + !boot_cpu_has(X86_FEATURE_NONSTOP_TSC) || + art_to_tsc_denominator < ART_MIN_DENOMINATOR) + return; + + if (rdmsrl_safe(MSR_IA32_TSC_ADJUST, &art_to_tsc_offset)) + return; + + /* Make this sticky over multiple CPU init calls */ + setup_force_cpu_cap(X86_FEATURE_ART); +} + + /* clocksource code */ static struct clocksource clocksource_tsc; @@ -1071,6 +1107,25 @@ int unsynchronized_tsc(void) return 0; } +/* + * Convert ART to TSC given numerator/denominator found in detect_art() + */ +struct system_counterval_t convert_art_to_tsc(cycle_t art) +{ + u64 tmp, res, rem; + + rem = do_div(art, art_to_tsc_denominator); + + res = art * art_to_tsc_numerator; + tmp = rem * art_to_tsc_numerator; + + do_div(tmp, art_to_tsc_denominator); + res += tmp + art_to_tsc_offset; + + return (struct 
system_counterval_t) {.cs = art_related_clocksource, + .cycles = res}; +} +EXPORT_SYMBOL(convert_art_to_tsc); static void tsc_refine_calibration_work(struct work_struct *work); static DECLARE_DELAYED_WORK(tsc_irqwork, tsc_refine_calibration_work); @@ -1142,6 +1197,8 @@ static void tsc_refine_calibration_work(struct work_struct *work) (unsigned long)tsc_khz % 1000); out: + if (boot_cpu_has(X86_FEATURE_ART)) + art_related_clocksource = &clocksource_tsc; clocksource_register_khz(&clocksource_tsc, tsc_khz); } @@ -1235,6 +1292,8 @@ void __init tsc_init(void) mark_tsc_unstable("TSCs unsynchronized"); check_system_tsc_reliable(); + + detect_art(); } #ifdef CONFIG_SMP @@ -1246,14 +1305,14 @@ void __init tsc_init(void) */ unsigned long calibrate_delay_is_known(void) { - int i, cpu = smp_processor_id(); + int sibling, cpu = smp_processor_id(); if (!tsc_disabled && !cpu_has(&cpu_data(cpu), X86_FEATURE_CONSTANT_TSC)) return 0; - for_each_online_cpu(i) - if (cpu_data(i).phys_proc_id == cpu_data(cpu).phys_proc_id) - return cpu_data(i).loops_per_jiffy; + sibling = cpumask_any_but(topology_core_cpumask(cpu), cpu); + if (sibling < nr_cpu_ids) + return cpu_data(sibling).loops_per_jiffy; return 0; } #endif diff --git a/arch/x86/kernel/verify_cpu.S b/arch/x86/kernel/verify_cpu.S index 07efb35ee4bc..014ea59aa153 100644 --- a/arch/x86/kernel/verify_cpu.S +++ b/arch/x86/kernel/verify_cpu.S @@ -30,7 +30,7 @@ * appropriately. Either display a message or halt. */ -#include +#include #include verify_cpu: diff --git a/arch/x86/kernel/vm86_32.c b/arch/x86/kernel/vm86_32.c index e574b8546518..3dce1ca0a653 100644 --- a/arch/x86/kernel/vm86_32.c +++ b/arch/x86/kernel/vm86_32.c @@ -362,7 +362,7 @@ static long do_sys_vm86(struct vm86plus_struct __user *user_vm86, bool plus) /* make room for real-mode segments */ tsk->thread.sp0 += 16; - if (static_cpu_has_safe(X86_FEATURE_SEP)) + if (static_cpu_has(X86_FEATURE_SEP)) tsk->thread.sysenter_cs = 0; load_sp0(tss, &tsk->thread); diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S index fe133b710bef..5af9958cbdb6 100644 --- a/arch/x86/kernel/vmlinux.lds.S +++ b/arch/x86/kernel/vmlinux.lds.S @@ -192,6 +192,17 @@ SECTIONS :init #endif + /* + * Section for code used exclusively before alternatives are run. All + * references to such code must be patched out by alternatives, normally + * by using X86_FEATURE_ALWAYS CPU feature bit. + * + * See static_cpu_has() for an example. + */ + .altinstr_aux : AT(ADDR(.altinstr_aux) - LOAD_OFFSET) { + *(.altinstr_aux) + } + INIT_DATA_SECTION(16) .x86_cpu_dev.init : AT(ADDR(.x86_cpu_dev.init) - LOAD_OFFSET) { diff --git a/arch/x86/lib/clear_page_64.S b/arch/x86/lib/clear_page_64.S index a2fe51b00cce..65be7cfaf947 100644 --- a/arch/x86/lib/clear_page_64.S +++ b/arch/x86/lib/clear_page_64.S @@ -1,5 +1,5 @@ #include -#include +#include #include /* diff --git a/arch/x86/lib/cmdline.c b/arch/x86/lib/cmdline.c index 422db000d727..5cc78bf57232 100644 --- a/arch/x86/lib/cmdline.c +++ b/arch/x86/lib/cmdline.c @@ -21,12 +21,16 @@ static inline int myisspace(u8 c) * @option: option string to look for * * Returns the position of that @option (starts counting with 1) - * or 0 on not found. + * or 0 on not found. @option will only be found if it is found + * as an entire word in @cmdline. For instance, if @option="car" + * then a cmdline which contains "cart" will not match. 
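Since this kernel-doc comment is the main statement of the new whole-word matching semantics, here is a minimal user-space illustration of just that rule. It uses standard C string functions rather than the kernel's state machine, and for brevity treats a single space as the only separator:

/* Sketch of the "option must match a whole word" rule described above. */
#include <stdbool.h>
#include <stdio.h>
#include <string.h>

static bool find_option_bool(const char *cmdline, const char *option)
{
	size_t optlen = strlen(option);
	const char *p = cmdline;

	while ((p = strstr(p, option)) != NULL) {
		bool starts_word = (p == cmdline) || (p[-1] == ' ');
		bool ends_word   = (p[optlen] == '\0') || (p[optlen] == ' ');

		if (starts_word && ends_word)
			return true;
		p += optlen;
	}
	return false;
}

int main(void)
{
	const char *cmdline = "root=/dev/sda1 cart quiet";

	printf("car  : %d\n", find_option_bool(cmdline, "car"));	/* 0 */
	printf("cart : %d\n", find_option_bool(cmdline, "cart"));	/* 1 */
	printf("quiet: %d\n", find_option_bool(cmdline, "quiet"));	/* 1 */
	return 0;
}

The kernel version implements the same behaviour with an explicit character-by-character state machine so that, as the hunk notes, the scan can be bounded by max_cmdline_size instead of relying on a terminating NUL.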
*/ -int cmdline_find_option_bool(const char *cmdline, const char *option) +static int +__cmdline_find_option_bool(const char *cmdline, int max_cmdline_size, + const char *option) { char c; - int len, pos = 0, wstart = 0; + int pos = 0, wstart = 0; const char *opptr = NULL; enum { st_wordstart = 0, /* Start of word/after whitespace */ @@ -37,11 +41,11 @@ int cmdline_find_option_bool(const char *cmdline, const char *option) if (!cmdline) return -1; /* No command line */ - len = min_t(int, strlen(cmdline), COMMAND_LINE_SIZE); - if (!len) - return 0; - - while (len--) { + /* + * This 'pos' check ensures we do not overrun + * a non-NULL-terminated 'cmdline' + */ + while (pos < max_cmdline_size) { c = *(char *)cmdline++; pos++; @@ -58,18 +62,35 @@ int cmdline_find_option_bool(const char *cmdline, const char *option) /* fall through */ case st_wordcmp: - if (!*opptr) + if (!*opptr) { + /* + * We matched all the way to the end of the + * option we were looking for. If the + * command-line has a space _or_ ends, then + * we matched! + */ if (!c || myisspace(c)) return wstart; - else - state = st_wordskip; - else if (!c) + /* + * We hit the end of the option, but _not_ + * the end of a word on the cmdline. Not + * a match. + */ + } else if (!c) { + /* + * Hit the NULL terminator on the end of + * cmdline. + */ return 0; - else if (c != *opptr++) - state = st_wordskip; - else if (!len) /* last word and is matching */ - return wstart; - break; + } else if (c == *opptr++) { + /* + * We are currently matching, so continue + * to the next character on the cmdline. + */ + break; + } + state = st_wordskip; + /* fall through */ case st_wordskip: if (!c) @@ -82,3 +103,8 @@ int cmdline_find_option_bool(const char *cmdline, const char *option) return 0; /* Buffer overrun */ } + +int cmdline_find_option_bool(const char *cmdline, const char *option) +{ + return __cmdline_find_option_bool(cmdline, COMMAND_LINE_SIZE, option); +} diff --git a/arch/x86/lib/copy_page_64.S b/arch/x86/lib/copy_page_64.S index 009f98216b7e..24ef1c2104d4 100644 --- a/arch/x86/lib/copy_page_64.S +++ b/arch/x86/lib/copy_page_64.S @@ -1,7 +1,7 @@ /* Written 2003 by Andi Kleen, based on a kernel by Evandro Menezes */ #include -#include +#include #include /* diff --git a/arch/x86/lib/copy_user_64.S b/arch/x86/lib/copy_user_64.S index 27f89c79a44b..2b0ef26da0bd 100644 --- a/arch/x86/lib/copy_user_64.S +++ b/arch/x86/lib/copy_user_64.S @@ -10,7 +10,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/arch/x86/lib/memcpy_64.S b/arch/x86/lib/memcpy_64.S index 7d37641ada5b..cbb8ee5830ff 100644 --- a/arch/x86/lib/memcpy_64.S +++ b/arch/x86/lib/memcpy_64.S @@ -1,7 +1,7 @@ /* Copyright 2002 Andi Kleen */ #include -#include +#include #include /* diff --git a/arch/x86/lib/memmove_64.S b/arch/x86/lib/memmove_64.S index ca2afdd6d98e..90ce01bee00c 100644 --- a/arch/x86/lib/memmove_64.S +++ b/arch/x86/lib/memmove_64.S @@ -6,7 +6,7 @@ * - Copyright 2011 Fenghua Yu */ #include -#include +#include #include #undef memmove diff --git a/arch/x86/lib/memset_64.S b/arch/x86/lib/memset_64.S index 2661fad05827..c9c81227ea37 100644 --- a/arch/x86/lib/memset_64.S +++ b/arch/x86/lib/memset_64.S @@ -1,7 +1,7 @@ /* Copyright 2002 Andi Kleen, SuSE Labs */ #include -#include +#include #include .weak memset diff --git a/arch/x86/mm/dump_pagetables.c b/arch/x86/mm/dump_pagetables.c index 4a6f1d9b5106..99bfb192803f 100644 --- a/arch/x86/mm/dump_pagetables.c +++ b/arch/x86/mm/dump_pagetables.c @@ -358,20 +358,19 @@ static void 
walk_pud_level(struct seq_file *m, struct pg_state *st, pgd_t addr, #define pgd_none(a) pud_none(__pud(pgd_val(a))) #endif -#ifdef CONFIG_X86_64 static inline bool is_hypervisor_range(int idx) { +#ifdef CONFIG_X86_64 /* * ffff800000000000 - ffff87ffffffffff is reserved for * the hypervisor. */ - return paravirt_enabled() && - (idx >= pgd_index(__PAGE_OFFSET) - 16) && - (idx < pgd_index(__PAGE_OFFSET)); -} + return (idx >= pgd_index(__PAGE_OFFSET) - 16) && + (idx < pgd_index(__PAGE_OFFSET)); #else -static inline bool is_hypervisor_range(int idx) { return false; } + return false; #endif +} static void ptdump_walk_pgd_level_core(struct seq_file *m, pgd_t *pgd, bool checkwx) diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index 2ebfbaf61142..bd7a9b9e2e14 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c @@ -388,7 +388,6 @@ repeat: } pte_t *kmap_pte; -pgprot_t kmap_prot; static inline pte_t *kmap_get_fixmap_pte(unsigned long vaddr) { @@ -405,8 +404,6 @@ static void __init kmap_init(void) */ kmap_vstart = __fix_to_virt(FIX_KMAP_BEGIN); kmap_pte = kmap_get_fixmap_pte(kmap_vstart); - - kmap_prot = PAGE_KERNEL; } #ifdef CONFIG_HIGHMEM diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index a40b755c67e3..214afda97911 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -53,6 +53,7 @@ #include #include #include +#include #include #include "mm_internal.h" @@ -1203,26 +1204,13 @@ int kern_addr_valid(unsigned long addr) static unsigned long probe_memory_block_size(void) { - /* start from 2g */ - unsigned long bz = 1UL<<31; + unsigned long bz = MIN_MEMORY_BLOCK_SIZE; - if (totalram_pages >= (64ULL << (30 - PAGE_SHIFT))) { - pr_info("Using 2GB memory block size for large-memory system\n"); - return 2UL * 1024 * 1024 * 1024; - } + /* if system is UV or has 64GB of RAM or more, use large blocks */ + if (is_uv_system() || ((max_pfn << PAGE_SHIFT) >= (64UL << 30))) + bz = 2UL << 30; /* 2GB */ - /* less than 64g installed */ - if ((max_pfn << PAGE_SHIFT) < (16UL << 32)) - return MIN_MEMORY_BLOCK_SIZE; - - /* get the tail size */ - while (bz > MIN_MEMORY_BLOCK_SIZE) { - if (!((max_pfn << PAGE_SHIFT) & (bz - 1))) - break; - bz >>= 1; - } - - printk(KERN_DEBUG "memory block size : %ldMB\n", bz >> 20); + pr_info("x86/mm: Memory block size: %ldMB\n", bz >> 20); return bz; } diff --git a/arch/x86/mm/kasan_init_64.c b/arch/x86/mm/kasan_init_64.c index d470cf219a2d..1b1110fa0057 100644 --- a/arch/x86/mm/kasan_init_64.c +++ b/arch/x86/mm/kasan_init_64.c @@ -120,11 +120,22 @@ void __init kasan_init(void) kasan_populate_zero_shadow(kasan_mem_to_shadow((void *)MODULES_END), (void *)KASAN_SHADOW_END); - memset(kasan_zero_page, 0, PAGE_SIZE); - load_cr3(init_level4_pgt); __flush_tlb_all(); - init_task.kasan_depth = 0; + /* + * kasan_zero_page has been used as early shadow memory, thus it may + * contain some garbage. Now we can clear and write protect it, since + * after the TLB flush no one should write to it. + */ + memset(kasan_zero_page, 0, PAGE_SIZE); + for (i = 0; i < PTRS_PER_PTE; i++) { + pte_t pte = __pte(__pa(kasan_zero_page) | __PAGE_KERNEL_RO); + set_pte(&kasan_zero_pte[i], pte); + } + /* Flush TLBs again to be sure that write protection applied. 
*/ + __flush_tlb_all(); + + init_task.kasan_depth = 0; pr_info("KernelAddressSanitizer initialized\n"); } diff --git a/arch/x86/mm/kmmio.c b/arch/x86/mm/kmmio.c index 637ab34ed632..ddb2244b06a1 100644 --- a/arch/x86/mm/kmmio.c +++ b/arch/x86/mm/kmmio.c @@ -33,7 +33,7 @@ struct kmmio_fault_page { struct list_head list; struct kmmio_fault_page *release_next; - unsigned long page; /* location of the fault page */ + unsigned long addr; /* the requested address */ pteval_t old_presence; /* page presence prior to arming */ bool armed; @@ -70,9 +70,16 @@ unsigned int kmmio_count; static struct list_head kmmio_page_table[KMMIO_PAGE_TABLE_SIZE]; static LIST_HEAD(kmmio_probes); -static struct list_head *kmmio_page_list(unsigned long page) +static struct list_head *kmmio_page_list(unsigned long addr) { - return &kmmio_page_table[hash_long(page, KMMIO_PAGE_HASH_BITS)]; + unsigned int l; + pte_t *pte = lookup_address(addr, &l); + + if (!pte) + return NULL; + addr &= page_level_mask(l); + + return &kmmio_page_table[hash_long(addr, KMMIO_PAGE_HASH_BITS)]; } /* Accessed per-cpu */ @@ -98,15 +105,19 @@ static struct kmmio_probe *get_kmmio_probe(unsigned long addr) } /* You must be holding RCU read lock. */ -static struct kmmio_fault_page *get_kmmio_fault_page(unsigned long page) +static struct kmmio_fault_page *get_kmmio_fault_page(unsigned long addr) { struct list_head *head; struct kmmio_fault_page *f; + unsigned int l; + pte_t *pte = lookup_address(addr, &l); - page &= PAGE_MASK; - head = kmmio_page_list(page); + if (!pte) + return NULL; + addr &= page_level_mask(l); + head = kmmio_page_list(addr); list_for_each_entry_rcu(f, head, list) { - if (f->page == page) + if (f->addr == addr) return f; } return NULL; @@ -137,10 +148,10 @@ static void clear_pte_presence(pte_t *pte, bool clear, pteval_t *old) static int clear_page_presence(struct kmmio_fault_page *f, bool clear) { unsigned int level; - pte_t *pte = lookup_address(f->page, &level); + pte_t *pte = lookup_address(f->addr, &level); if (!pte) { - pr_err("no pte for page 0x%08lx\n", f->page); + pr_err("no pte for addr 0x%08lx\n", f->addr); return -1; } @@ -156,7 +167,7 @@ static int clear_page_presence(struct kmmio_fault_page *f, bool clear) return -1; } - __flush_tlb_one(f->page); + __flush_tlb_one(f->addr); return 0; } @@ -176,12 +187,12 @@ static int arm_kmmio_fault_page(struct kmmio_fault_page *f) int ret; WARN_ONCE(f->armed, KERN_ERR pr_fmt("kmmio page already armed.\n")); if (f->armed) { - pr_warning("double-arm: page 0x%08lx, ref %d, old %d\n", - f->page, f->count, !!f->old_presence); + pr_warning("double-arm: addr 0x%08lx, ref %d, old %d\n", + f->addr, f->count, !!f->old_presence); } ret = clear_page_presence(f, true); - WARN_ONCE(ret < 0, KERN_ERR pr_fmt("arming 0x%08lx failed.\n"), - f->page); + WARN_ONCE(ret < 0, KERN_ERR pr_fmt("arming at 0x%08lx failed.\n"), + f->addr); f->armed = true; return ret; } @@ -191,7 +202,7 @@ static void disarm_kmmio_fault_page(struct kmmio_fault_page *f) { int ret = clear_page_presence(f, false); WARN_ONCE(ret < 0, - KERN_ERR "kmmio disarming 0x%08lx failed.\n", f->page); + KERN_ERR "kmmio disarming at 0x%08lx failed.\n", f->addr); f->armed = false; } @@ -215,6 +226,12 @@ int kmmio_handler(struct pt_regs *regs, unsigned long addr) struct kmmio_context *ctx; struct kmmio_fault_page *faultpage; int ret = 0; /* default to fault not handled */ + unsigned long page_base = addr; + unsigned int l; + pte_t *pte = lookup_address(addr, &l); + if (!pte) + return -EINVAL; + page_base &= page_level_mask(l); /* * Preemption 
is now disabled to prevent process switch during @@ -227,7 +244,7 @@ int kmmio_handler(struct pt_regs *regs, unsigned long addr) preempt_disable(); rcu_read_lock(); - faultpage = get_kmmio_fault_page(addr); + faultpage = get_kmmio_fault_page(page_base); if (!faultpage) { /* * Either this page fault is not caused by kmmio, or @@ -239,7 +256,7 @@ int kmmio_handler(struct pt_regs *regs, unsigned long addr) ctx = &get_cpu_var(kmmio_ctx); if (ctx->active) { - if (addr == ctx->addr) { + if (page_base == ctx->addr) { /* * A second fault on the same page means some other * condition needs handling by do_page_fault(), the @@ -267,9 +284,9 @@ int kmmio_handler(struct pt_regs *regs, unsigned long addr) ctx->active++; ctx->fpage = faultpage; - ctx->probe = get_kmmio_probe(addr); + ctx->probe = get_kmmio_probe(page_base); ctx->saved_flags = (regs->flags & (X86_EFLAGS_TF | X86_EFLAGS_IF)); - ctx->addr = addr; + ctx->addr = page_base; if (ctx->probe && ctx->probe->pre_handler) ctx->probe->pre_handler(ctx->probe, regs, addr); @@ -354,12 +371,11 @@ out: } /* You must be holding kmmio_lock. */ -static int add_kmmio_fault_page(unsigned long page) +static int add_kmmio_fault_page(unsigned long addr) { struct kmmio_fault_page *f; - page &= PAGE_MASK; - f = get_kmmio_fault_page(page); + f = get_kmmio_fault_page(addr); if (f) { if (!f->count) arm_kmmio_fault_page(f); @@ -372,26 +388,25 @@ static int add_kmmio_fault_page(unsigned long page) return -1; f->count = 1; - f->page = page; + f->addr = addr; if (arm_kmmio_fault_page(f)) { kfree(f); return -1; } - list_add_rcu(&f->list, kmmio_page_list(f->page)); + list_add_rcu(&f->list, kmmio_page_list(f->addr)); return 0; } /* You must be holding kmmio_lock. */ -static void release_kmmio_fault_page(unsigned long page, +static void release_kmmio_fault_page(unsigned long addr, struct kmmio_fault_page **release_list) { struct kmmio_fault_page *f; - page &= PAGE_MASK; - f = get_kmmio_fault_page(page); + f = get_kmmio_fault_page(addr); if (!f) return; @@ -420,18 +435,27 @@ int register_kmmio_probe(struct kmmio_probe *p) int ret = 0; unsigned long size = 0; const unsigned long size_lim = p->len + (p->addr & ~PAGE_MASK); + unsigned int l; + pte_t *pte; spin_lock_irqsave(&kmmio_lock, flags); if (get_kmmio_probe(p->addr)) { ret = -EEXIST; goto out; } + + pte = lookup_address(p->addr, &l); + if (!pte) { + ret = -EINVAL; + goto out; + } + kmmio_count++; list_add_rcu(&p->list, &kmmio_probes); while (size < size_lim) { if (add_kmmio_fault_page(p->addr + size)) pr_err("Unable to set page fault.\n"); - size += PAGE_SIZE; + size += page_level_size(l); } out: spin_unlock_irqrestore(&kmmio_lock, flags); @@ -506,11 +530,17 @@ void unregister_kmmio_probe(struct kmmio_probe *p) const unsigned long size_lim = p->len + (p->addr & ~PAGE_MASK); struct kmmio_fault_page *release_list = NULL; struct kmmio_delayed_release *drelease; + unsigned int l; + pte_t *pte; + + pte = lookup_address(p->addr, &l); + if (!pte) + return; spin_lock_irqsave(&kmmio_lock, flags); while (size < size_lim) { release_kmmio_fault_page(p->addr + size, &release_list); - size += PAGE_SIZE; + size += page_level_size(l); } list_del_rcu(&p->list); kmmio_count--; diff --git a/arch/x86/mm/mmap.c b/arch/x86/mm/mmap.c index 72bb52f93c3d..d2dc0438d654 100644 --- a/arch/x86/mm/mmap.c +++ b/arch/x86/mm/mmap.c @@ -93,18 +93,6 @@ static unsigned long mmap_base(unsigned long rnd) return PAGE_ALIGN(TASK_SIZE - gap - rnd); } -/* - * Bottom-up (legacy) layout on X86_32 did not support randomization, X86_64 - * does, but not when emulating 
X86_32 - */ -static unsigned long mmap_legacy_base(unsigned long rnd) -{ - if (mmap_is_ia32()) - return TASK_UNMAPPED_BASE; - else - return TASK_UNMAPPED_BASE + rnd; -} - /* * This function, called very early during the creation of a new * process VM image, sets up which VM layout function to use: @@ -116,7 +104,7 @@ void arch_pick_mmap_layout(struct mm_struct *mm) if (current->flags & PF_RANDOMIZE) random_factor = arch_mmap_rnd(); - mm->mmap_legacy_base = mmap_legacy_base(random_factor); + mm->mmap_legacy_base = TASK_UNMAPPED_BASE + random_factor; if (mmap_is_legacy()) { mm->mmap_base = mm->mmap_legacy_base; diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c index d04f8094bc23..f70c1ff46125 100644 --- a/arch/x86/mm/numa.c +++ b/arch/x86/mm/numa.c @@ -465,46 +465,67 @@ static bool __init numa_meminfo_cover_memory(const struct numa_meminfo *mi) return true; } +/* + * Mark all currently memblock-reserved physical memory (which covers the + * kernel's own memory ranges) as hot-unswappable. + */ static void __init numa_clear_kernel_node_hotplug(void) { - int i, nid; - nodemask_t numa_kernel_nodes = NODE_MASK_NONE; - phys_addr_t start, end; - struct memblock_region *r; + nodemask_t reserved_nodemask = NODE_MASK_NONE; + struct memblock_region *mb_region; + int i; /* + * We have to do some preprocessing of memblock regions, to + * make them suitable for reservation. + * * At this time, all memory regions reserved by memblock are - * used by the kernel. Set the nid in memblock.reserved will - * mark out all the nodes the kernel resides in. + * used by the kernel, but those regions are not split up + * along node boundaries yet, and don't necessarily have their + * node ID set yet either. + * + * So iterate over all memory known to the x86 architecture, + * and use those ranges to set the nid in memblock.reserved. + * This will split up the memblock regions along node + * boundaries and will set the node IDs as well. */ for (i = 0; i < numa_meminfo.nr_blks; i++) { - struct numa_memblk *mb = &numa_meminfo.blk[i]; + struct numa_memblk *mb = numa_meminfo.blk + i; + int ret; - memblock_set_node(mb->start, mb->end - mb->start, - &memblock.reserved, mb->nid); + ret = memblock_set_node(mb->start, mb->end - mb->start, &memblock.reserved, mb->nid); + WARN_ON_ONCE(ret); } /* - * Mark all kernel nodes. + * Now go over all reserved memblock regions, to construct a + * node mask of all kernel reserved memory areas. * - * When booting with mem=nn[kMG] or in a kdump kernel, numa_meminfo - * may not include all the memblock.reserved memory ranges because - * trim_snb_memory() reserves specific pages for Sandy Bridge graphics. + * [ Note, when booting with mem=nn[kMG] or in a kdump kernel, + * numa_meminfo might not include all memblock.reserved + * memory ranges, because quirks such as trim_snb_memory() + * reserve specific pages for Sandy Bridge graphics. ] */ - for_each_memblock(reserved, r) - if (r->nid != MAX_NUMNODES) - node_set(r->nid, numa_kernel_nodes); + for_each_memblock(reserved, mb_region) { + if (mb_region->nid != MAX_NUMNODES) + node_set(mb_region->nid, reserved_nodemask); + } - /* Clear MEMBLOCK_HOTPLUG flag for memory in kernel nodes. */ + /* + * Finally, clear the MEMBLOCK_HOTPLUG flag for all memory + * belonging to the reserved node mask. 
+ * + * Note that this will include memory regions that reside + * on nodes that contain kernel memory - entire nodes + * become hot-unpluggable: + */ for (i = 0; i < numa_meminfo.nr_blks; i++) { - nid = numa_meminfo.blk[i].nid; - if (!node_isset(nid, numa_kernel_nodes)) + struct numa_memblk *mb = numa_meminfo.blk + i; + + if (!node_isset(mb->nid, reserved_nodemask)) continue; - start = numa_meminfo.blk[i].start; - end = numa_meminfo.blk[i].end; - - memblock_clear_hotplug(start, end - start); + memblock_clear_hotplug(mb->start, mb->end - mb->start); } } diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c index 1c37e650acac..007ebe2d8157 100644 --- a/arch/x86/mm/pageattr.c +++ b/arch/x86/mm/pageattr.c @@ -1128,8 +1128,10 @@ static int __cpa_process_fault(struct cpa_data *cpa, unsigned long vaddr, /* * Ignore all non primary paths. */ - if (!primary) + if (!primary) { + cpa->numpages = 1; return 0; + } /* * Ignore the NULL PTE for kernel identity mapping, as it is expected diff --git a/arch/x86/mm/pat.c b/arch/x86/mm/pat.c index f4ae536b0914..04e2e7144bee 100644 --- a/arch/x86/mm/pat.c +++ b/arch/x86/mm/pat.c @@ -943,7 +943,7 @@ int track_pfn_remap(struct vm_area_struct *vma, pgprot_t *prot, return -EINVAL; } - *prot = __pgprot((pgprot_val(vma->vm_page_prot) & (~_PAGE_CACHE_MASK)) | + *prot = __pgprot((pgprot_val(*prot) & (~_PAGE_CACHE_MASK)) | cachemode2protval(pcm)); return 0; @@ -959,7 +959,7 @@ int track_pfn_insert(struct vm_area_struct *vma, pgprot_t *prot, /* Set prot based on lookup */ pcm = lookup_memtype(pfn_t_to_phys(pfn)); - *prot = __pgprot((pgprot_val(vma->vm_page_prot) & (~_PAGE_CACHE_MASK)) | + *prot = __pgprot((pgprot_val(*prot) & (~_PAGE_CACHE_MASK)) | cachemode2protval(pcm)); return 0; diff --git a/arch/x86/mm/setup_nx.c b/arch/x86/mm/setup_nx.c index 92e2eacb3321..8bea84724a7d 100644 --- a/arch/x86/mm/setup_nx.c +++ b/arch/x86/mm/setup_nx.c @@ -4,6 +4,7 @@ #include #include +#include static int disable_nx; @@ -31,9 +32,8 @@ early_param("noexec", noexec_setup); void x86_configure_nx(void) { - if (boot_cpu_has(X86_FEATURE_NX) && !disable_nx) - __supported_pte_mask |= _PAGE_NX; - else + /* If disable_nx is set, clear NX on all new mappings going forward. */ + if (disable_nx) __supported_pte_mask &= ~_PAGE_NX; } diff --git a/arch/x86/oprofile/op_model_amd.c b/arch/x86/oprofile/op_model_amd.c index 50d86c0e9ba4..660a83c8287b 100644 --- a/arch/x86/oprofile/op_model_amd.c +++ b/arch/x86/oprofile/op_model_amd.c @@ -24,7 +24,6 @@ #include #include #include -#include #include "op_x86_model.h" #include "op_counter.h" diff --git a/arch/x86/platform/geode/alix.c b/arch/x86/platform/geode/alix.c index 76b6632d3143..1865c196f136 100644 --- a/arch/x86/platform/geode/alix.c +++ b/arch/x86/platform/geode/alix.c @@ -21,7 +21,7 @@ #include #include #include -#include +#include #include #include #include @@ -35,6 +35,11 @@ #define BIOS_SIGNATURE_COREBOOT 0x500 #define BIOS_REGION_SIZE 0x10000 +/* + * This driver is not modular, but to keep back compatibility + * with existing use cases, continuing with module_param is + * the easiest way forward. 
+ */ static bool force = 0; module_param(force, bool, 0444); /* FIXME: Award bios is not automatically detected as Alix platform */ @@ -192,9 +197,4 @@ static int __init alix_init(void) return 0; } - -module_init(alix_init); - -MODULE_AUTHOR("Ed Wildgoose "); -MODULE_DESCRIPTION("PCEngines ALIX System Setup"); -MODULE_LICENSE("GPL"); +device_initcall(alix_init); diff --git a/arch/x86/platform/geode/geos.c b/arch/x86/platform/geode/geos.c index aa733fba2471..4fcdb91318a0 100644 --- a/arch/x86/platform/geode/geos.c +++ b/arch/x86/platform/geode/geos.c @@ -19,7 +19,6 @@ #include #include #include -#include #include #include #include @@ -120,9 +119,4 @@ static int __init geos_init(void) return 0; } - -module_init(geos_init); - -MODULE_AUTHOR("Philip Prindeville "); -MODULE_DESCRIPTION("Traverse Technologies Geos System Setup"); -MODULE_LICENSE("GPL"); +device_initcall(geos_init); diff --git a/arch/x86/platform/geode/net5501.c b/arch/x86/platform/geode/net5501.c index 927e38c0089f..a2f6b982a729 100644 --- a/arch/x86/platform/geode/net5501.c +++ b/arch/x86/platform/geode/net5501.c @@ -20,7 +20,6 @@ #include #include #include -#include #include #include #include @@ -146,9 +145,4 @@ static int __init net5501_init(void) return 0; } - -module_init(net5501_init); - -MODULE_AUTHOR("Philip Prindeville "); -MODULE_DESCRIPTION("Soekris net5501 System Setup"); -MODULE_LICENSE("GPL"); +device_initcall(net5501_init); diff --git a/arch/x86/platform/intel-mid/mfld.c b/arch/x86/platform/intel-mid/mfld.c index 23381d2174ae..1eb47b6298c2 100644 --- a/arch/x86/platform/intel-mid/mfld.c +++ b/arch/x86/platform/intel-mid/mfld.c @@ -52,10 +52,7 @@ static unsigned long __init mfld_calibrate_tsc(void) /* mark tsc clocksource as reliable */ set_cpu_cap(&boot_cpu_data, X86_FEATURE_TSC_RELIABLE); - if (fast_calibrate) - return fast_calibrate; - - return 0; + return fast_calibrate; } static void __init penwell_arch_setup(void) diff --git a/arch/x86/platform/intel-mid/mrfl.c b/arch/x86/platform/intel-mid/mrfl.c index aaca91753d32..bd1adc621781 100644 --- a/arch/x86/platform/intel-mid/mrfl.c +++ b/arch/x86/platform/intel-mid/mrfl.c @@ -81,10 +81,7 @@ static unsigned long __init tangier_calibrate_tsc(void) /* mark tsc clocksource as reliable */ set_cpu_cap(&boot_cpu_data, X86_FEATURE_TSC_RELIABLE); - if (fast_calibrate) - return fast_calibrate; - - return 0; + return fast_calibrate; } static void __init tangier_arch_setup(void) diff --git a/arch/x86/platform/intel-quark/imr.c b/arch/x86/platform/intel-quark/imr.c index bfadcd0f4944..17d6d2296e4d 100644 --- a/arch/x86/platform/intel-quark/imr.c +++ b/arch/x86/platform/intel-quark/imr.c @@ -1,5 +1,5 @@ /** - * imr.c + * imr.c -- Intel Isolated Memory Region driver * * Copyright(c) 2013 Intel Corporation. * Copyright(c) 2015 Bryan O'Donoghue @@ -31,7 +31,6 @@ #include #include #include -#include #include struct imr_device { @@ -135,11 +134,9 @@ static int imr_read(struct imr_device *idev, u32 imr_id, struct imr_regs *imr) * @idev: pointer to imr_device structure. * @imr_id: IMR entry to write. * @imr: IMR structure representing address and access masks. - * @lock: indicates if the IMR lock bit should be applied. * @return: 0 on success or error code passed from mbi_iosf on failure. 
*/ -static int imr_write(struct imr_device *idev, u32 imr_id, - struct imr_regs *imr, bool lock) +static int imr_write(struct imr_device *idev, u32 imr_id, struct imr_regs *imr) { unsigned long flags; u32 reg = imr_id * IMR_NUM_REGS + idev->reg_base; @@ -163,15 +160,6 @@ static int imr_write(struct imr_device *idev, u32 imr_id, if (ret) goto failed; - /* Lock bit must be set separately to addr_lo address bits. */ - if (lock) { - imr->addr_lo |= IMR_LOCK; - ret = iosf_mbi_write(QRK_MBI_UNIT_MM, MBI_REG_WRITE, - reg - IMR_NUM_REGS, imr->addr_lo); - if (ret) - goto failed; - } - local_irq_restore(flags); return 0; failed: @@ -269,17 +257,6 @@ static int imr_debugfs_register(struct imr_device *idev) return PTR_ERR_OR_ZERO(idev->file); } -/** - * imr_debugfs_unregister - unregister debugfs hooks. - * - * @idev: pointer to imr_device structure. - * @return: - */ -static void imr_debugfs_unregister(struct imr_device *idev) -{ - debugfs_remove(idev->file); -} - /** * imr_check_params - check passed address range IMR alignment and non-zero size * @@ -334,11 +311,10 @@ static inline int imr_address_overlap(phys_addr_t addr, struct imr_regs *imr) * @size: physical size of region in bytes must be aligned to 1KiB. * @read_mask: read access mask. * @write_mask: write access mask. - * @lock: indicates whether or not to permanently lock this region. * @return: zero on success or negative value indicating error. */ int imr_add_range(phys_addr_t base, size_t size, - unsigned int rmask, unsigned int wmask, bool lock) + unsigned int rmask, unsigned int wmask) { phys_addr_t end; unsigned int i; @@ -411,7 +387,7 @@ int imr_add_range(phys_addr_t base, size_t size, imr.rmask = rmask; imr.wmask = wmask; - ret = imr_write(idev, reg, &imr, lock); + ret = imr_write(idev, reg, &imr); if (ret < 0) { /* * In the highly unlikely event iosf_mbi_write failed @@ -422,7 +398,7 @@ int imr_add_range(phys_addr_t base, size_t size, imr.addr_hi = 0; imr.rmask = IMR_READ_ACCESS_ALL; imr.wmask = IMR_WRITE_ACCESS_ALL; - imr_write(idev, reg, &imr, false); + imr_write(idev, reg, &imr); } failed: mutex_unlock(&idev->lock); @@ -518,7 +494,7 @@ static int __imr_remove_range(int reg, phys_addr_t base, size_t size) imr.rmask = IMR_READ_ACCESS_ALL; imr.wmask = IMR_WRITE_ACCESS_ALL; - ret = imr_write(idev, reg, &imr, false); + ret = imr_write(idev, reg, &imr); failed: mutex_unlock(&idev->lock); @@ -599,7 +575,7 @@ static void __init imr_fixup_memmap(struct imr_device *idev) * We don't round up @size since it is already PAGE_SIZE aligned. * See vmlinux.lds.S for details. */ - ret = imr_add_range(base, size, IMR_CPU, IMR_CPU, false); + ret = imr_add_range(base, size, IMR_CPU, IMR_CPU); if (ret < 0) { pr_err("unable to setup IMR for kernel: %zu KiB (%lx - %lx)\n", size / 1024, start, end); @@ -614,7 +590,6 @@ static const struct x86_cpu_id imr_ids[] __initconst = { { X86_VENDOR_INTEL, 5, 9 }, /* Intel Quark SoC X1000. */ {} }; -MODULE_DEVICE_TABLE(x86cpu, imr_ids); /** * imr_init - entry point for IMR driver. @@ -640,22 +615,4 @@ static int __init imr_init(void) imr_fixup_memmap(idev); return 0; } - -/** - * imr_exit - exit point for IMR code. - * - * Deregisters debugfs, leave IMR state as-is. 
- * - * return: - */ -static void __exit imr_exit(void) -{ - imr_debugfs_unregister(&imr_dev); -} - -module_init(imr_init); -module_exit(imr_exit); - -MODULE_AUTHOR("Bryan O'Donoghue "); -MODULE_DESCRIPTION("Intel Isolated Memory Region driver"); -MODULE_LICENSE("Dual BSD/GPL"); +device_initcall(imr_init); diff --git a/arch/x86/platform/intel-quark/imr_selftest.c b/arch/x86/platform/intel-quark/imr_selftest.c index 278e4da4222f..f5bad40936ac 100644 --- a/arch/x86/platform/intel-quark/imr_selftest.c +++ b/arch/x86/platform/intel-quark/imr_selftest.c @@ -1,5 +1,5 @@ /** - * imr_selftest.c + * imr_selftest.c -- Intel Isolated Memory Region self-test driver * * Copyright(c) 2013 Intel Corporation. * Copyright(c) 2015 Bryan O'Donoghue @@ -15,7 +15,6 @@ #include #include #include -#include #include #define SELFTEST KBUILD_MODNAME ": " @@ -61,30 +60,30 @@ static void __init imr_self_test(void) int ret; /* Test zero zero. */ - ret = imr_add_range(0, 0, 0, 0, false); + ret = imr_add_range(0, 0, 0, 0); imr_self_test_result(ret < 0, "zero sized IMR\n"); /* Test exact overlap. */ - ret = imr_add_range(base, size, IMR_CPU, IMR_CPU, false); + ret = imr_add_range(base, size, IMR_CPU, IMR_CPU); imr_self_test_result(ret < 0, fmt_over, __va(base), __va(base + size)); /* Test overlap with base inside of existing. */ base += size - IMR_ALIGN; - ret = imr_add_range(base, size, IMR_CPU, IMR_CPU, false); + ret = imr_add_range(base, size, IMR_CPU, IMR_CPU); imr_self_test_result(ret < 0, fmt_over, __va(base), __va(base + size)); /* Test overlap with end inside of existing. */ base -= size + IMR_ALIGN * 2; - ret = imr_add_range(base, size, IMR_CPU, IMR_CPU, false); + ret = imr_add_range(base, size, IMR_CPU, IMR_CPU); imr_self_test_result(ret < 0, fmt_over, __va(base), __va(base + size)); /* Test that a 1 KiB IMR @ zero with read/write all will bomb out. */ ret = imr_add_range(0, IMR_ALIGN, IMR_READ_ACCESS_ALL, - IMR_WRITE_ACCESS_ALL, false); + IMR_WRITE_ACCESS_ALL); imr_self_test_result(ret < 0, "1KiB IMR @ 0x00000000 - access-all\n"); /* Test that a 1 KiB IMR @ zero with CPU only will work. */ - ret = imr_add_range(0, IMR_ALIGN, IMR_CPU, IMR_CPU, false); + ret = imr_add_range(0, IMR_ALIGN, IMR_CPU, IMR_CPU); imr_self_test_result(ret >= 0, "1KiB IMR @ 0x00000000 - cpu-access\n"); if (ret >= 0) { ret = imr_remove_range(0, IMR_ALIGN); @@ -93,8 +92,7 @@ static void __init imr_self_test(void) /* Test 2 KiB works. */ size = IMR_ALIGN * 2; - ret = imr_add_range(0, size, IMR_READ_ACCESS_ALL, - IMR_WRITE_ACCESS_ALL, false); + ret = imr_add_range(0, size, IMR_READ_ACCESS_ALL, IMR_WRITE_ACCESS_ALL); imr_self_test_result(ret >= 0, "2KiB IMR @ 0x00000000\n"); if (ret >= 0) { ret = imr_remove_range(0, size); @@ -106,7 +104,6 @@ static const struct x86_cpu_id imr_ids[] __initconst = { { X86_VENDOR_INTEL, 5, 9 }, /* Intel Quark SoC X1000. */ {} }; -MODULE_DEVICE_TABLE(x86cpu, imr_ids); /** * imr_self_test_init - entry point for IMR driver. 
@@ -125,13 +122,4 @@ static int __init imr_self_test_init(void) * * return: */ -static void __exit imr_self_test_exit(void) -{ -} - -module_init(imr_self_test_init); -module_exit(imr_self_test_exit); - -MODULE_AUTHOR("Bryan O'Donoghue "); -MODULE_DESCRIPTION("Intel Isolated Memory Region self-test driver"); -MODULE_LICENSE("Dual BSD/GPL"); +device_initcall(imr_self_test_init); diff --git a/arch/x86/um/asm/barrier.h b/arch/x86/um/asm/barrier.h index 174781a404ff..00c319048d52 100644 --- a/arch/x86/um/asm/barrier.h +++ b/arch/x86/um/asm/barrier.h @@ -3,7 +3,7 @@ #include #include -#include +#include #include #include diff --git a/arch/x86/um/sys_call_table_32.c b/arch/x86/um/sys_call_table_32.c index 439c0994b696..bfce503dffae 100644 --- a/arch/x86/um/sys_call_table_32.c +++ b/arch/x86/um/sys_call_table_32.c @@ -25,11 +25,11 @@ #define old_mmap sys_old_mmap -#define __SYSCALL_I386(nr, sym, compat) extern asmlinkage long sym(unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long) ; +#define __SYSCALL_I386(nr, sym, qual) extern asmlinkage long sym(unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long) ; #include #undef __SYSCALL_I386 -#define __SYSCALL_I386(nr, sym, compat) [ nr ] = sym, +#define __SYSCALL_I386(nr, sym, qual) [ nr ] = sym, extern asmlinkage long sys_ni_syscall(unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long); diff --git a/arch/x86/um/sys_call_table_64.c b/arch/x86/um/sys_call_table_64.c index b74ea6c2c0e7..f306413d3eb6 100644 --- a/arch/x86/um/sys_call_table_64.c +++ b/arch/x86/um/sys_call_table_64.c @@ -35,14 +35,11 @@ #define stub_execveat sys_execveat #define stub_rt_sigreturn sys_rt_sigreturn -#define __SYSCALL_COMMON(nr, sym, compat) __SYSCALL_64(nr, sym, compat) -#define __SYSCALL_X32(nr, sym, compat) /* Not supported */ - -#define __SYSCALL_64(nr, sym, compat) extern asmlinkage long sym(unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long) ; +#define __SYSCALL_64(nr, sym, qual) extern asmlinkage long sym(unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long) ; #include #undef __SYSCALL_64 -#define __SYSCALL_64(nr, sym, compat) [ nr ] = sym, +#define __SYSCALL_64(nr, sym, qual) [ nr ] = sym, extern asmlinkage long sys_ni_syscall(unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long); diff --git a/arch/x86/um/user-offsets.c b/arch/x86/um/user-offsets.c index ce7e3607a870..470564bbd08e 100644 --- a/arch/x86/um/user-offsets.c +++ b/arch/x86/um/user-offsets.c @@ -9,14 +9,12 @@ #include #ifdef __i386__ -#define __SYSCALL_I386(nr, sym, compat) [nr] = 1, +#define __SYSCALL_I386(nr, sym, qual) [nr] = 1, static char syscalls[] = { #include }; #else -#define __SYSCALL_64(nr, sym, compat) [nr] = 1, -#define __SYSCALL_COMMON(nr, sym, compat) [nr] = 1, -#define __SYSCALL_X32(nr, sym, compat) /* Not supported */ +#define __SYSCALL_64(nr, sym, qual) [nr] = 1, static char syscalls[] = { #include }; diff --git a/arch/x86/xen/smp.c b/arch/x86/xen/smp.c index 3f4ebf0261f2..3c6d17fd423a 100644 --- a/arch/x86/xen/smp.c +++ b/arch/x86/xen/smp.c @@ -112,7 +112,7 @@ asmlinkage __visible void cpu_bringup_and_idle(int cpu) xen_pvh_secondary_vcpu_init(cpu); #endif cpu_bringup(); - cpu_startup_entry(CPUHP_ONLINE); + cpu_startup_entry(CPUHP_AP_ONLINE_IDLE); } static void xen_smp_intr_free(unsigned int cpu) diff --git a/arch/xtensa/kernel/smp.c b/arch/xtensa/kernel/smp.c index 
4d02e38514f5..fc4ad21a5ed4 100644 --- a/arch/xtensa/kernel/smp.c +++ b/arch/xtensa/kernel/smp.c @@ -157,7 +157,7 @@ void secondary_start_kernel(void) complete(&cpu_running); - cpu_startup_entry(CPUHP_ONLINE); + cpu_startup_entry(CPUHP_AP_ONLINE_IDLE); } static void mx_cpu_start(void *p) diff --git a/drivers/clocksource/Kconfig b/drivers/clocksource/Kconfig index 33db7406c0e2..c346be650892 100644 --- a/drivers/clocksource/Kconfig +++ b/drivers/clocksource/Kconfig @@ -160,6 +160,7 @@ config CLKSRC_EFM32 config CLKSRC_LPC32XX bool "Clocksource for LPC32XX" if COMPILE_TEST depends on GENERIC_CLOCKEVENTS && HAS_IOMEM + depends on ARM select CLKSRC_MMIO select CLKSRC_OF help diff --git a/drivers/clocksource/arm_arch_timer.c b/drivers/clocksource/arm_arch_timer.c index c64d543d64bf..f0dd9d42bc7b 100644 --- a/drivers/clocksource/arm_arch_timer.c +++ b/drivers/clocksource/arm_arch_timer.c @@ -32,6 +32,14 @@ #define CNTTIDR 0x08 #define CNTTIDR_VIRT(n) (BIT(1) << ((n) * 4)) +#define CNTACR(n) (0x40 + ((n) * 4)) +#define CNTACR_RPCT BIT(0) +#define CNTACR_RVCT BIT(1) +#define CNTACR_RFRQ BIT(2) +#define CNTACR_RVOFF BIT(3) +#define CNTACR_RWVT BIT(4) +#define CNTACR_RWPT BIT(5) + #define CNTVCT_LO 0x08 #define CNTVCT_HI 0x0c #define CNTFRQ 0x10 @@ -266,10 +274,12 @@ static void __arch_timer_setup(unsigned type, if (arch_timer_use_virtual) { clk->irq = arch_timer_ppi[VIRT_PPI]; clk->set_state_shutdown = arch_timer_shutdown_virt; + clk->set_state_oneshot_stopped = arch_timer_shutdown_virt; clk->set_next_event = arch_timer_set_next_event_virt; } else { clk->irq = arch_timer_ppi[PHYS_SECURE_PPI]; clk->set_state_shutdown = arch_timer_shutdown_phys; + clk->set_state_oneshot_stopped = arch_timer_shutdown_phys; clk->set_next_event = arch_timer_set_next_event_phys; } } else { @@ -279,10 +289,12 @@ static void __arch_timer_setup(unsigned type, clk->cpumask = cpu_all_mask; if (arch_timer_mem_use_virtual) { clk->set_state_shutdown = arch_timer_shutdown_virt_mem; + clk->set_state_oneshot_stopped = arch_timer_shutdown_virt_mem; clk->set_next_event = arch_timer_set_next_event_virt_mem; } else { clk->set_state_shutdown = arch_timer_shutdown_phys_mem; + clk->set_state_oneshot_stopped = arch_timer_shutdown_phys_mem; clk->set_next_event = arch_timer_set_next_event_phys_mem; } @@ -757,7 +769,6 @@ static void __init arch_timer_mem_init(struct device_node *np) } cnttidr = readl_relaxed(cntctlbase + CNTTIDR); - iounmap(cntctlbase); /* * Try to find a virtual capable frame. 
Otherwise fall back to a @@ -765,20 +776,31 @@ static void __init arch_timer_mem_init(struct device_node *np) */ for_each_available_child_of_node(np, frame) { int n; + u32 cntacr; if (of_property_read_u32(frame, "frame-number", &n)) { pr_err("arch_timer: Missing frame-number\n"); - of_node_put(best_frame); of_node_put(frame); - return; + goto out; } - if (cnttidr & CNTTIDR_VIRT(n)) { + /* Try enabling everything, and see what sticks */ + cntacr = CNTACR_RFRQ | CNTACR_RWPT | CNTACR_RPCT | + CNTACR_RWVT | CNTACR_RVOFF | CNTACR_RVCT; + writel_relaxed(cntacr, cntctlbase + CNTACR(n)); + cntacr = readl_relaxed(cntctlbase + CNTACR(n)); + + if ((cnttidr & CNTTIDR_VIRT(n)) && + !(~cntacr & (CNTACR_RWVT | CNTACR_RVCT))) { of_node_put(best_frame); best_frame = frame; arch_timer_mem_use_virtual = true; break; } + + if (~cntacr & (CNTACR_RWPT | CNTACR_RPCT)) + continue; + of_node_put(best_frame); best_frame = of_node_get(frame); } @@ -786,24 +808,26 @@ static void __init arch_timer_mem_init(struct device_node *np) base = arch_counter_base = of_iomap(best_frame, 0); if (!base) { pr_err("arch_timer: Can't map frame's registers\n"); - of_node_put(best_frame); - return; + goto out; } if (arch_timer_mem_use_virtual) irq = irq_of_parse_and_map(best_frame, 1); else irq = irq_of_parse_and_map(best_frame, 0); - of_node_put(best_frame); + if (!irq) { pr_err("arch_timer: Frame missing %s irq", arch_timer_mem_use_virtual ? "virt" : "phys"); - return; + goto out; } arch_timer_detect_rate(base, np); arch_timer_mem_register(base, irq); arch_timer_common_init(); +out: + iounmap(cntctlbase); + of_node_put(best_frame); } CLOCKSOURCE_OF_DECLARE(armv7_arch_timer_mem, "arm,armv7-timer-mem", arch_timer_mem_init); diff --git a/drivers/clocksource/arm_global_timer.c b/drivers/clocksource/arm_global_timer.c index d189d8cb69f7..9df0d1699d22 100644 --- a/drivers/clocksource/arm_global_timer.c +++ b/drivers/clocksource/arm_global_timer.c @@ -16,6 +16,7 @@ #include #include #include +#include #include #include #include @@ -174,6 +175,7 @@ static int gt_clockevents_init(struct clock_event_device *clk) clk->set_state_shutdown = gt_clockevent_shutdown; clk->set_state_periodic = gt_clockevent_set_periodic; clk->set_state_oneshot = gt_clockevent_shutdown; + clk->set_state_oneshot_stopped = gt_clockevent_shutdown; clk->set_next_event = gt_clockevent_set_next_event; clk->cpumask = cpumask_of(cpu); clk->rating = 300; @@ -221,6 +223,21 @@ static u64 notrace gt_sched_clock_read(void) } #endif +static unsigned long gt_read_long(void) +{ + return readl_relaxed(gt_base + GT_COUNTER0); +} + +static struct delay_timer gt_delay_timer = { + .read_current_timer = gt_read_long, +}; + +static void __init gt_delay_timer_init(void) +{ + gt_delay_timer.freq = gt_clk_rate; + register_current_timer_delay(>_delay_timer); +} + static void __init gt_clocksource_init(void) { writel(0, gt_base + GT_CONTROL); @@ -317,6 +334,7 @@ static void __init global_timer_of_register(struct device_node *np) /* Immediately configure the timer on the boot CPU */ gt_clocksource_init(); gt_clockevents_init(this_cpu_ptr(gt_evt)); + gt_delay_timer_init(); return; diff --git a/drivers/clocksource/exynos_mct.c b/drivers/clocksource/exynos_mct.c index ff44082a0827..be09bc0b5e26 100644 --- a/drivers/clocksource/exynos_mct.c +++ b/drivers/clocksource/exynos_mct.c @@ -313,6 +313,7 @@ static struct clock_event_device mct_comp_device = { .set_state_periodic = mct_set_state_periodic, .set_state_shutdown = mct_set_state_shutdown, .set_state_oneshot = mct_set_state_shutdown, + 
.set_state_oneshot_stopped = mct_set_state_shutdown, .tick_resume = mct_set_state_shutdown, }; @@ -452,6 +453,7 @@ static int exynos4_local_timer_setup(struct mct_clock_event_device *mevt) evt->set_state_periodic = set_state_periodic; evt->set_state_shutdown = set_state_shutdown; evt->set_state_oneshot = set_state_shutdown; + evt->set_state_oneshot_stopped = set_state_shutdown; evt->tick_resume = set_state_shutdown; evt->features = CLOCK_EVT_FEAT_PERIODIC | CLOCK_EVT_FEAT_ONESHOT; evt->rating = 450; diff --git a/drivers/clocksource/rockchip_timer.c b/drivers/clocksource/rockchip_timer.c index 8c77a529d0d4..b991b288c803 100644 --- a/drivers/clocksource/rockchip_timer.c +++ b/drivers/clocksource/rockchip_timer.c @@ -122,23 +122,23 @@ static void __init rk_timer_init(struct device_node *np) pclk = of_clk_get_by_name(np, "pclk"); if (IS_ERR(pclk)) { pr_err("Failed to get pclk for '%s'\n", TIMER_NAME); - return; + goto out_unmap; } if (clk_prepare_enable(pclk)) { pr_err("Failed to enable pclk for '%s'\n", TIMER_NAME); - return; + goto out_unmap; } timer_clk = of_clk_get_by_name(np, "timer"); if (IS_ERR(timer_clk)) { pr_err("Failed to get timer clock for '%s'\n", TIMER_NAME); - return; + goto out_timer_clk; } if (clk_prepare_enable(timer_clk)) { pr_err("Failed to enable timer clock\n"); - return; + goto out_timer_clk; } bc_timer.freq = clk_get_rate(timer_clk); @@ -146,7 +146,7 @@ static void __init rk_timer_init(struct device_node *np) irq = irq_of_parse_and_map(np, 0); if (!irq) { pr_err("Failed to map interrupts for '%s'\n", TIMER_NAME); - return; + goto out_irq; } ce->name = TIMER_NAME; @@ -164,10 +164,19 @@ static void __init rk_timer_init(struct device_node *np) ret = request_irq(irq, rk_timer_interrupt, IRQF_TIMER, TIMER_NAME, ce); if (ret) { pr_err("Failed to initialize '%s': %d\n", TIMER_NAME, ret); - return; + goto out_irq; } clockevents_config_and_register(ce, bc_timer.freq, 1, UINT_MAX); + + return; + +out_irq: + clk_disable_unprepare(timer_clk); +out_timer_clk: + clk_disable_unprepare(pclk); +out_unmap: + iounmap(bc_timer.base); } CLOCKSOURCE_OF_DECLARE(rk_timer, "rockchip,rk3288-timer", rk_timer_init); diff --git a/drivers/clocksource/time-lpc32xx.c b/drivers/clocksource/time-lpc32xx.c index 1316876b487a..daae61e8c820 100644 --- a/drivers/clocksource/time-lpc32xx.c +++ b/drivers/clocksource/time-lpc32xx.c @@ -18,6 +18,7 @@ #include #include #include +#include #include #include #include @@ -43,6 +44,7 @@ struct lpc32xx_clock_event_ddata { struct clock_event_device evtdev; void __iomem *base; + u32 ticks_per_jiffy; }; /* Needed for the sched clock */ @@ -53,6 +55,15 @@ static u64 notrace lpc32xx_read_sched_clock(void) return readl(clocksource_timer_counter); } +static unsigned long lpc32xx_delay_timer_read(void) +{ + return readl(clocksource_timer_counter); +} + +static struct delay_timer lpc32xx_delay_timer = { + .read_current_timer = lpc32xx_delay_timer_read, +}; + static int lpc32xx_clkevt_next_event(unsigned long delta, struct clock_event_device *evtdev) { @@ -60,14 +71,13 @@ static int lpc32xx_clkevt_next_event(unsigned long delta, container_of(evtdev, struct lpc32xx_clock_event_ddata, evtdev); /* - * Place timer in reset and program the delta in the prescale - * register (PR). When the prescale counter matches the value - * in PR the counter register is incremented and the compare - * match will trigger. After setup the timer is released from - * reset and enabled. + * Place timer in reset and program the delta in the match + * channel 0 (MR0). 
When the timer counter matches the value + * in MR0 register the match will trigger an interrupt. + * After setup the timer is released from reset and enabled. */ writel_relaxed(LPC32XX_TIMER_TCR_CRST, ddata->base + LPC32XX_TIMER_TCR); - writel_relaxed(delta, ddata->base + LPC32XX_TIMER_PR); + writel_relaxed(delta, ddata->base + LPC32XX_TIMER_MR0); writel_relaxed(LPC32XX_TIMER_TCR_CEN, ddata->base + LPC32XX_TIMER_TCR); return 0; @@ -86,11 +96,39 @@ static int lpc32xx_clkevt_shutdown(struct clock_event_device *evtdev) static int lpc32xx_clkevt_oneshot(struct clock_event_device *evtdev) { + struct lpc32xx_clock_event_ddata *ddata = + container_of(evtdev, struct lpc32xx_clock_event_ddata, evtdev); + /* * When using oneshot, we must also disable the timer * to wait for the first call to set_next_event(). */ - return lpc32xx_clkevt_shutdown(evtdev); + writel_relaxed(0, ddata->base + LPC32XX_TIMER_TCR); + + /* Enable interrupt, reset on match and stop on match (MCR). */ + writel_relaxed(LPC32XX_TIMER_MCR_MR0I | LPC32XX_TIMER_MCR_MR0R | + LPC32XX_TIMER_MCR_MR0S, ddata->base + LPC32XX_TIMER_MCR); + return 0; +} + +static int lpc32xx_clkevt_periodic(struct clock_event_device *evtdev) +{ + struct lpc32xx_clock_event_ddata *ddata = + container_of(evtdev, struct lpc32xx_clock_event_ddata, evtdev); + + /* Enable interrupt and reset on match. */ + writel_relaxed(LPC32XX_TIMER_MCR_MR0I | LPC32XX_TIMER_MCR_MR0R, + ddata->base + LPC32XX_TIMER_MCR); + + /* + * Place timer in reset and program the delta in the match + * channel 0 (MR0). + */ + writel_relaxed(LPC32XX_TIMER_TCR_CRST, ddata->base + LPC32XX_TIMER_TCR); + writel_relaxed(ddata->ticks_per_jiffy, ddata->base + LPC32XX_TIMER_MR0); + writel_relaxed(LPC32XX_TIMER_TCR_CEN, ddata->base + LPC32XX_TIMER_TCR); + + return 0; } static irqreturn_t lpc32xx_clock_event_handler(int irq, void *dev_id) @@ -108,11 +146,13 @@ static irqreturn_t lpc32xx_clock_event_handler(int irq, void *dev_id) static struct lpc32xx_clock_event_ddata lpc32xx_clk_event_ddata = { .evtdev = { .name = "lpc3220 clockevent", - .features = CLOCK_EVT_FEAT_ONESHOT, + .features = CLOCK_EVT_FEAT_ONESHOT | + CLOCK_EVT_FEAT_PERIODIC, .rating = 300, .set_next_event = lpc32xx_clkevt_next_event, .set_state_shutdown = lpc32xx_clkevt_shutdown, .set_state_oneshot = lpc32xx_clkevt_oneshot, + .set_state_periodic = lpc32xx_clkevt_periodic, }, }; @@ -162,6 +202,8 @@ static int __init lpc32xx_clocksource_init(struct device_node *np) } clocksource_timer_counter = base + LPC32XX_TIMER_TC; + lpc32xx_delay_timer.freq = rate; + register_current_timer_delay(&lpc32xx_delay_timer); sched_clock_register(lpc32xx_read_sched_clock, 32, rate); return 0; @@ -210,18 +252,16 @@ static int __init lpc32xx_clockevent_init(struct device_node *np) /* * Disable timer and clear any pending interrupt (IR) on match - * channel 0 (MR0). Configure a compare match value of 1 on MR0 - * and enable interrupt, reset on match and stop on match (MCR). + * channel 0 (MR0). Clear the prescaler as it's not used. 
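As a worked example of the periodic programming above (the clock rate and HZ below are assumed values, purely for illustration): with a 13 MHz timer clock and HZ=100, the init path later derives ticks_per_jiffy as the rate rounded to HZ, i.e. 130000 counts, so the MR0 match and its interrupt recur every 10 ms. A trivial standalone check:

#include <stdio.h>

/* Same rounding as the kernel's DIV_ROUND_CLOSEST() for positive values. */
#define DIV_ROUND_CLOSEST(x, d)	(((x) + (d) / 2) / (d))

int main(void)
{
	unsigned long rate = 13000000;	/* assumed timer clock, Hz */
	unsigned long hz   = 100;	/* assumed CONFIG_HZ */
	unsigned long ticks = DIV_ROUND_CLOSEST(rate, hz);

	/* 130000 ticks -> 10000 us between periodic MR0 matches */
	printf("MR0 = %lu ticks -> %lu us per tick\n",
	       ticks, ticks * 1000000UL / rate);
	return 0;
}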
*/ writel_relaxed(0, base + LPC32XX_TIMER_TCR); + writel_relaxed(0, base + LPC32XX_TIMER_PR); writel_relaxed(0, base + LPC32XX_TIMER_CTCR); writel_relaxed(LPC32XX_TIMER_IR_MR0INT, base + LPC32XX_TIMER_IR); - writel_relaxed(1, base + LPC32XX_TIMER_MR0); - writel_relaxed(LPC32XX_TIMER_MCR_MR0I | LPC32XX_TIMER_MCR_MR0R | - LPC32XX_TIMER_MCR_MR0S, base + LPC32XX_TIMER_MCR); rate = clk_get_rate(clk); lpc32xx_clk_event_ddata.base = base; + lpc32xx_clk_event_ddata.ticks_per_jiffy = DIV_ROUND_CLOSEST(rate, HZ); clockevents_config_and_register(&lpc32xx_clk_event_ddata.evtdev, rate, 1, -1); diff --git a/drivers/cpufreq/intel_pstate.c b/drivers/cpufreq/intel_pstate.c index cd83d477e32d..3a4b39afc0ab 100644 --- a/drivers/cpufreq/intel_pstate.c +++ b/drivers/cpufreq/intel_pstate.c @@ -1431,7 +1431,7 @@ static int __init intel_pstate_init(void) if (!all_cpu_data) return -ENOMEM; - if (static_cpu_has_safe(X86_FEATURE_HWP) && !no_hwp) { + if (static_cpu_has(X86_FEATURE_HWP) && !no_hwp) { pr_info("intel_pstate: HWP enabled\n"); hwp_active++; } diff --git a/drivers/irqchip/Kconfig b/drivers/irqchip/Kconfig index fb50911b3940..7e8c441ff2de 100644 --- a/drivers/irqchip/Kconfig +++ b/drivers/irqchip/Kconfig @@ -60,6 +60,17 @@ config ARM_VIC_NR The maximum number of VICs available in the system, for power management. +config ARMADA_370_XP_IRQ + bool + select GENERIC_IRQ_CHIP + select PCI_MSI_IRQ_DOMAIN if PCI_MSI + +config ALPINE_MSI + bool + depends on PCI && PCI_MSI + select GENERIC_IRQ_CHIP + select PCI_MSI_IRQ_DOMAIN + config ATMEL_AIC_IRQ bool select GENERIC_IRQ_CHIP @@ -78,6 +89,11 @@ config I8259 bool select IRQ_DOMAIN +config BCM6345_L1_IRQ + bool + select GENERIC_IRQ_CHIP + select IRQ_DOMAIN + config BCM7038_L1_IRQ bool select GENERIC_IRQ_CHIP @@ -151,6 +167,11 @@ config ST_IRQCHIP help Enables SysCfg Controlled IRQs on STi based platforms. 
+config TANGO_IRQ + bool + select IRQ_DOMAIN + select GENERIC_IRQ_CHIP + config TB10X_IRQC bool select IRQ_DOMAIN @@ -160,6 +181,7 @@ config TS4800_IRQ tristate "TS-4800 IRQ controller" select IRQ_DOMAIN depends on HAS_IOMEM + depends on SOC_IMX51 || COMPILE_TEST help Support for the TS-4800 FPGA IRQ controller @@ -193,6 +215,8 @@ config KEYSTONE_IRQ config MIPS_GIC bool + select GENERIC_IRQ_IPI + select IRQ_DOMAIN_HIERARCHY select MIPS_CM config INGENIC_IRQ @@ -218,3 +242,7 @@ config IRQ_MXS def_bool y if MACH_ASM9260 || ARCH_MXS select IRQ_DOMAIN select STMP_DEVICE + +config MVEBU_ODMI + bool + select GENERIC_MSI_IRQ_DOMAIN diff --git a/drivers/irqchip/Makefile b/drivers/irqchip/Makefile index 18caacb60d58..b03cfcbbac6b 100644 --- a/drivers/irqchip/Makefile +++ b/drivers/irqchip/Makefile @@ -1,11 +1,13 @@ obj-$(CONFIG_IRQCHIP) += irqchip.o +obj-$(CONFIG_ALPINE_MSI) += irq-alpine-msi.o +obj-$(CONFIG_ATH79) += irq-ath79-cpu.o +obj-$(CONFIG_ATH79) += irq-ath79-misc.o obj-$(CONFIG_ARCH_BCM2835) += irq-bcm2835.o obj-$(CONFIG_ARCH_BCM2835) += irq-bcm2836.o obj-$(CONFIG_ARCH_EXYNOS) += exynos-combiner.o obj-$(CONFIG_ARCH_HIP04) += irq-hip04.o obj-$(CONFIG_ARCH_MMP) += irq-mmp.o -obj-$(CONFIG_ARCH_MVEBU) += irq-armada-370-xp.o obj-$(CONFIG_IRQ_MXS) += irq-mxs.o obj-$(CONFIG_ARCH_TEGRA) += irq-tegra.o obj-$(CONFIG_ARCH_S3C24XX) += irq-s3c24xx.o @@ -28,6 +30,7 @@ obj-$(CONFIG_ARM_GIC_V3_ITS) += irq-gic-v3-its.o irq-gic-v3-its-pci-msi.o irq-g obj-$(CONFIG_HISILICON_IRQ_MBIGEN) += irq-mbigen.o obj-$(CONFIG_ARM_NVIC) += irq-nvic.o obj-$(CONFIG_ARM_VIC) += irq-vic.o +obj-$(CONFIG_ARMADA_370_XP_IRQ) += irq-armada-370-xp.o obj-$(CONFIG_ATMEL_AIC_IRQ) += irq-atmel-aic-common.o irq-atmel-aic.o obj-$(CONFIG_ATMEL_AIC5_IRQ) += irq-atmel-aic-common.o irq-atmel-aic5.o obj-$(CONFIG_I8259) += irq-i8259.o @@ -40,12 +43,14 @@ obj-$(CONFIG_VERSATILE_FPGA_IRQ) += irq-versatile-fpga.o obj-$(CONFIG_ARCH_NSPIRE) += irq-zevio.o obj-$(CONFIG_ARCH_VT8500) += irq-vt8500.o obj-$(CONFIG_ST_IRQCHIP) += irq-st.o +obj-$(CONFIG_TANGO_IRQ) += irq-tango.o obj-$(CONFIG_TB10X_IRQC) += irq-tb10x.o obj-$(CONFIG_TS4800_IRQ) += irq-ts4800.o obj-$(CONFIG_XTENSA) += irq-xtensa-pic.o obj-$(CONFIG_XTENSA_MX) += irq-xtensa-mx.o obj-$(CONFIG_IRQ_CROSSBAR) += irq-crossbar.o obj-$(CONFIG_SOC_VF610) += irq-vf610-mscm-ir.o +obj-$(CONFIG_BCM6345_L1_IRQ) += irq-bcm6345-l1.o obj-$(CONFIG_BCM7038_L1_IRQ) += irq-bcm7038-l1.o obj-$(CONFIG_BCM7120_L2_IRQ) += irq-bcm7120-l2.o obj-$(CONFIG_BRCMSTB_L2_IRQ) += irq-brcmstb-l2.o @@ -59,3 +64,4 @@ obj-$(CONFIG_ARCH_SA1100) += irq-sa11x0.o obj-$(CONFIG_INGENIC_IRQ) += irq-ingenic.o obj-$(CONFIG_IMX_GPCV2) += irq-imx-gpcv2.o obj-$(CONFIG_PIC32_EVIC) += irq-pic32-evic.o +obj-$(CONFIG_MVEBU_ODMI) += irq-mvebu-odmi.o diff --git a/drivers/irqchip/irq-alpine-msi.c b/drivers/irqchip/irq-alpine-msi.c new file mode 100644 index 000000000000..25384255b30f --- /dev/null +++ b/drivers/irqchip/irq-alpine-msi.c @@ -0,0 +1,293 @@ +/* + * Annapurna Labs MSIX support services + * + * Copyright (C) 2016, Amazon.com, Inc. or its affiliates. All Rights Reserved. + * + * Antoine Tenart + * + * This file is licensed under the terms of the GNU General Public + * License version 2. This program is licensed "as is" without any + * warranty of any kind, whether express or implied. 
+ */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +/* MSIX message address format: local GIC target */ +#define ALPINE_MSIX_SPI_TARGET_CLUSTER0 BIT(16) + +struct alpine_msix_data { + spinlock_t msi_map_lock; + phys_addr_t addr; + u32 spi_first; /* The SGI number that MSIs start */ + u32 num_spis; /* The number of SGIs for MSIs */ + unsigned long *msi_map; +}; + +static void alpine_msix_mask_msi_irq(struct irq_data *d) +{ + pci_msi_mask_irq(d); + irq_chip_mask_parent(d); +} + +static void alpine_msix_unmask_msi_irq(struct irq_data *d) +{ + pci_msi_unmask_irq(d); + irq_chip_unmask_parent(d); +} + +static struct irq_chip alpine_msix_irq_chip = { + .name = "MSIx", + .irq_mask = alpine_msix_mask_msi_irq, + .irq_unmask = alpine_msix_unmask_msi_irq, + .irq_eoi = irq_chip_eoi_parent, + .irq_set_affinity = irq_chip_set_affinity_parent, +}; + +static int alpine_msix_allocate_sgi(struct alpine_msix_data *priv, int num_req) +{ + int first; + + spin_lock(&priv->msi_map_lock); + + first = bitmap_find_next_zero_area(priv->msi_map, priv->num_spis, 0, + num_req, 0); + if (first >= priv->num_spis) { + spin_unlock(&priv->msi_map_lock); + return -ENOSPC; + } + + bitmap_set(priv->msi_map, first, num_req); + + spin_unlock(&priv->msi_map_lock); + + return priv->spi_first + first; +} + +static void alpine_msix_free_sgi(struct alpine_msix_data *priv, unsigned sgi, + int num_req) +{ + int first = sgi - priv->spi_first; + + spin_lock(&priv->msi_map_lock); + + bitmap_clear(priv->msi_map, first, num_req); + + spin_unlock(&priv->msi_map_lock); +} + +static void alpine_msix_compose_msi_msg(struct irq_data *data, + struct msi_msg *msg) +{ + struct alpine_msix_data *priv = irq_data_get_irq_chip_data(data); + phys_addr_t msg_addr = priv->addr; + + msg_addr |= (data->hwirq << 3); + + msg->address_hi = upper_32_bits(msg_addr); + msg->address_lo = lower_32_bits(msg_addr); + msg->data = 0; +} + +static struct msi_domain_info alpine_msix_domain_info = { + .flags = MSI_FLAG_USE_DEF_DOM_OPS | MSI_FLAG_USE_DEF_CHIP_OPS | + MSI_FLAG_PCI_MSIX, + .chip = &alpine_msix_irq_chip, +}; + +static struct irq_chip middle_irq_chip = { + .name = "alpine_msix_middle", + .irq_mask = irq_chip_mask_parent, + .irq_unmask = irq_chip_unmask_parent, + .irq_eoi = irq_chip_eoi_parent, + .irq_set_affinity = irq_chip_set_affinity_parent, + .irq_compose_msi_msg = alpine_msix_compose_msi_msg, +}; + +static int alpine_msix_gic_domain_alloc(struct irq_domain *domain, + unsigned int virq, int sgi) +{ + struct irq_fwspec fwspec; + struct irq_data *d; + int ret; + + if (!is_of_node(domain->parent->fwnode)) + return -EINVAL; + + fwspec.fwnode = domain->parent->fwnode; + fwspec.param_count = 3; + fwspec.param[0] = 0; + fwspec.param[1] = sgi; + fwspec.param[2] = IRQ_TYPE_EDGE_RISING; + + ret = irq_domain_alloc_irqs_parent(domain, virq, 1, &fwspec); + if (ret) + return ret; + + d = irq_domain_get_irq_data(domain->parent, virq); + d->chip->irq_set_type(d, IRQ_TYPE_EDGE_RISING); + + return 0; +} + +static int alpine_msix_middle_domain_alloc(struct irq_domain *domain, + unsigned int virq, + unsigned int nr_irqs, void *args) +{ + struct alpine_msix_data *priv = domain->host_data; + int sgi, err, i; + + sgi = alpine_msix_allocate_sgi(priv, nr_irqs); + if (sgi < 0) + return sgi; + + for (i = 0; i < nr_irqs; i++) { + err = alpine_msix_gic_domain_alloc(domain, virq + i, sgi + i); + if (err) + goto err_sgi; + + irq_domain_set_hwirq_and_chip(domain, virq + i, 
sgi + i, + &middle_irq_chip, priv); + } + + return 0; + +err_sgi: + while (--i >= 0) + irq_domain_free_irqs_parent(domain, virq, i); + alpine_msix_free_sgi(priv, sgi, nr_irqs); + return err; +} + +static void alpine_msix_middle_domain_free(struct irq_domain *domain, + unsigned int virq, + unsigned int nr_irqs) +{ + struct irq_data *d = irq_domain_get_irq_data(domain, virq); + struct alpine_msix_data *priv = irq_data_get_irq_chip_data(d); + + irq_domain_free_irqs_parent(domain, virq, nr_irqs); + alpine_msix_free_sgi(priv, d->hwirq, nr_irqs); +} + +static const struct irq_domain_ops alpine_msix_middle_domain_ops = { + .alloc = alpine_msix_middle_domain_alloc, + .free = alpine_msix_middle_domain_free, +}; + +static int alpine_msix_init_domains(struct alpine_msix_data *priv, + struct device_node *node) +{ + struct irq_domain *middle_domain, *msi_domain, *gic_domain; + struct device_node *gic_node; + + gic_node = of_irq_find_parent(node); + if (!gic_node) { + pr_err("Failed to find the GIC node\n"); + return -ENODEV; + } + + gic_domain = irq_find_host(gic_node); + if (!gic_domain) { + pr_err("Failed to find the GIC domain\n"); + return -ENXIO; + } + + middle_domain = irq_domain_add_tree(NULL, + &alpine_msix_middle_domain_ops, + priv); + if (!middle_domain) { + pr_err("Failed to create the MSIX middle domain\n"); + return -ENOMEM; + } + + middle_domain->parent = gic_domain; + + msi_domain = pci_msi_create_irq_domain(of_node_to_fwnode(node), + &alpine_msix_domain_info, + middle_domain); + if (!msi_domain) { + pr_err("Failed to create MSI domain\n"); + irq_domain_remove(middle_domain); + return -ENOMEM; + } + + return 0; +} + +static int alpine_msix_init(struct device_node *node, + struct device_node *parent) +{ + struct alpine_msix_data *priv; + struct resource res; + int ret; + + priv = kzalloc(sizeof(*priv), GFP_KERNEL); + if (!priv) + return -ENOMEM; + + spin_lock_init(&priv->msi_map_lock); + + ret = of_address_to_resource(node, 0, &res); + if (ret) { + pr_err("Failed to allocate resource\n"); + goto err_priv; + } + + /* + * The 20 least significant bits of addr provide direct information + * regarding the interrupt destination. + * + * To select the primary GIC as the target GIC, bits [18:17] must be set + * to 0x0. In this case, bit 16 (SPI_TARGET_CLUSTER0) must be set. 
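The address composition spelled out in the comment above can be sketched in isolation. In the snippet below the register base and hwirq are made-up example values, and GENMASK_ULL/BIT are redefined locally so it stands alone outside the kernel:

#include <stdint.h>
#include <stdio.h>

#define GENMASK_ULL(h, l)	(((~0ULL) << (l)) & (~0ULL >> (63 - (h))))
#define BIT(n)			(1ULL << (n))

int main(void)
{
	uint64_t res_start = 0xf0d00000ULL;	/* assumed MSIX frame base */
	uint32_t hwirq = 5;			/* assumed allocated MSI index */

	/* Keep only bits [63:20]; the low 20 bits encode the destination. */
	uint64_t addr = res_start & GENMASK_ULL(63, 20);
	addr |= BIT(16);			/* SPI_TARGET_CLUSTER0: primary GIC */

	/* Each MSI gets its own doorbell address, 8 bytes apart. */
	uint64_t msg_addr = addr | ((uint64_t)hwirq << 3);

	printf("address_hi=0x%08x address_lo=0x%08x\n",
	       (uint32_t)(msg_addr >> 32), (uint32_t)msg_addr);
	return 0;
}

For these example values the result is address_hi=0x00000000 and address_lo=0xf0d10028, i.e. the frame base with bit 16 set and the hwirq folded into bits [3+].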
+ */ + priv->addr = res.start & GENMASK_ULL(63,20); + priv->addr |= ALPINE_MSIX_SPI_TARGET_CLUSTER0; + + if (of_property_read_u32(node, "al,msi-base-spi", &priv->spi_first)) { + pr_err("Unable to parse MSI base\n"); + ret = -EINVAL; + goto err_priv; + } + + if (of_property_read_u32(node, "al,msi-num-spis", &priv->num_spis)) { + pr_err("Unable to parse MSI numbers\n"); + ret = -EINVAL; + goto err_priv; + } + + priv->msi_map = kzalloc(sizeof(*priv->msi_map) * BITS_TO_LONGS(priv->num_spis), + GFP_KERNEL); + if (!priv->msi_map) { + ret = -ENOMEM; + goto err_priv; + } + + pr_debug("Registering %d msixs, starting at %d\n", + priv->num_spis, priv->spi_first); + + ret = alpine_msix_init_domains(priv, node); + if (ret) + goto err_map; + + return 0; + +err_map: + kfree(priv->msi_map); +err_priv: + kfree(priv); + return ret; +} +IRQCHIP_DECLARE(alpine_msix, "al,alpine-msix", alpine_msix_init); diff --git a/drivers/irqchip/irq-armada-370-xp.c b/drivers/irqchip/irq-armada-370-xp.c index 3f3a8c3d2175..e7dc6cbda2a1 100644 --- a/drivers/irqchip/irq-armada-370-xp.c +++ b/drivers/irqchip/irq-armada-370-xp.c @@ -71,6 +71,7 @@ static u32 doorbell_mask_reg; static int parent_irq; #ifdef CONFIG_PCI_MSI static struct irq_domain *armada_370_xp_msi_domain; +static struct irq_domain *armada_370_xp_msi_inner_domain; static DECLARE_BITMAP(msi_used, PCI_MSI_DOORBELL_NR); static DEFINE_MUTEX(msi_used_lock); static phys_addr_t msi_doorbell_addr; @@ -115,127 +116,102 @@ static void armada_370_xp_irq_unmask(struct irq_data *d) #ifdef CONFIG_PCI_MSI -static int armada_370_xp_alloc_msi(void) -{ - int hwirq; - - mutex_lock(&msi_used_lock); - hwirq = find_first_zero_bit(&msi_used, PCI_MSI_DOORBELL_NR); - if (hwirq >= PCI_MSI_DOORBELL_NR) - hwirq = -ENOSPC; - else - set_bit(hwirq, msi_used); - mutex_unlock(&msi_used_lock); - - return hwirq; -} - -static void armada_370_xp_free_msi(int hwirq) -{ - mutex_lock(&msi_used_lock); - if (!test_bit(hwirq, msi_used)) - pr_err("trying to free unused MSI#%d\n", hwirq); - else - clear_bit(hwirq, msi_used); - mutex_unlock(&msi_used_lock); -} - -static int armada_370_xp_setup_msi_irq(struct msi_controller *chip, - struct pci_dev *pdev, - struct msi_desc *desc) -{ - struct msi_msg msg; - int virq, hwirq; - - /* We support MSI, but not MSI-X */ - if (desc->msi_attrib.is_msix) - return -EINVAL; - - hwirq = armada_370_xp_alloc_msi(); - if (hwirq < 0) - return hwirq; - - virq = irq_create_mapping(armada_370_xp_msi_domain, hwirq); - if (!virq) { - armada_370_xp_free_msi(hwirq); - return -EINVAL; - } - - irq_set_msi_desc(virq, desc); - - msg.address_lo = msi_doorbell_addr; - msg.address_hi = 0; - msg.data = 0xf00 | (hwirq + 16); - - pci_write_msi_msg(virq, &msg); - return 0; -} - -static void armada_370_xp_teardown_msi_irq(struct msi_controller *chip, - unsigned int irq) -{ - struct irq_data *d = irq_get_irq_data(irq); - unsigned long hwirq = d->hwirq; - - irq_dispose_mapping(irq); - armada_370_xp_free_msi(hwirq); -} - static struct irq_chip armada_370_xp_msi_irq_chip = { - .name = "armada_370_xp_msi_irq", - .irq_enable = pci_msi_unmask_irq, - .irq_disable = pci_msi_mask_irq, + .name = "MPIC MSI", .irq_mask = pci_msi_mask_irq, .irq_unmask = pci_msi_unmask_irq, }; -static int armada_370_xp_msi_map(struct irq_domain *domain, unsigned int virq, - irq_hw_number_t hw) -{ - irq_set_chip_and_handler(virq, &armada_370_xp_msi_irq_chip, - handle_simple_irq); +static struct msi_domain_info armada_370_xp_msi_domain_info = { + .flags = (MSI_FLAG_USE_DEF_DOM_OPS | MSI_FLAG_USE_DEF_CHIP_OPS | + 
MSI_FLAG_MULTI_PCI_MSI), + .chip = &armada_370_xp_msi_irq_chip, +}; - return 0; +static void armada_370_xp_compose_msi_msg(struct irq_data *data, struct msi_msg *msg) +{ + msg->address_lo = lower_32_bits(msi_doorbell_addr); + msg->address_hi = upper_32_bits(msi_doorbell_addr); + msg->data = 0xf00 | (data->hwirq + PCI_MSI_DOORBELL_START); } -static const struct irq_domain_ops armada_370_xp_msi_irq_ops = { - .map = armada_370_xp_msi_map, +static int armada_370_xp_msi_set_affinity(struct irq_data *irq_data, + const struct cpumask *mask, bool force) +{ + return -EINVAL; +} + +static struct irq_chip armada_370_xp_msi_bottom_irq_chip = { + .name = "MPIC MSI", + .irq_compose_msi_msg = armada_370_xp_compose_msi_msg, + .irq_set_affinity = armada_370_xp_msi_set_affinity, +}; + +static int armada_370_xp_msi_alloc(struct irq_domain *domain, unsigned int virq, + unsigned int nr_irqs, void *args) +{ + int hwirq, i; + + mutex_lock(&msi_used_lock); + + hwirq = bitmap_find_next_zero_area(msi_used, PCI_MSI_DOORBELL_NR, + 0, nr_irqs, 0); + if (hwirq >= PCI_MSI_DOORBELL_NR) { + mutex_unlock(&msi_used_lock); + return -ENOSPC; + } + + bitmap_set(msi_used, hwirq, nr_irqs); + mutex_unlock(&msi_used_lock); + + for (i = 0; i < nr_irqs; i++) { + irq_domain_set_info(domain, virq + i, hwirq + i, + &armada_370_xp_msi_bottom_irq_chip, + domain->host_data, handle_simple_irq, + NULL, NULL); + } + + return hwirq; +} + +static void armada_370_xp_msi_free(struct irq_domain *domain, + unsigned int virq, unsigned int nr_irqs) +{ + struct irq_data *d = irq_domain_get_irq_data(domain, virq); + + mutex_lock(&msi_used_lock); + bitmap_clear(msi_used, d->hwirq, nr_irqs); + mutex_unlock(&msi_used_lock); +} + +static const struct irq_domain_ops armada_370_xp_msi_domain_ops = { + .alloc = armada_370_xp_msi_alloc, + .free = armada_370_xp_msi_free, }; static int armada_370_xp_msi_init(struct device_node *node, phys_addr_t main_int_phys_base) { - struct msi_controller *msi_chip; u32 reg; - int ret; msi_doorbell_addr = main_int_phys_base + ARMADA_370_XP_SW_TRIG_INT_OFFS; - msi_chip = kzalloc(sizeof(*msi_chip), GFP_KERNEL); - if (!msi_chip) + armada_370_xp_msi_inner_domain = + irq_domain_add_linear(NULL, PCI_MSI_DOORBELL_NR, + &armada_370_xp_msi_domain_ops, NULL); + if (!armada_370_xp_msi_inner_domain) return -ENOMEM; - msi_chip->setup_irq = armada_370_xp_setup_msi_irq; - msi_chip->teardown_irq = armada_370_xp_teardown_msi_irq; - msi_chip->of_node = node; - armada_370_xp_msi_domain = - irq_domain_add_linear(NULL, PCI_MSI_DOORBELL_NR, - &armada_370_xp_msi_irq_ops, - NULL); + pci_msi_create_irq_domain(of_node_to_fwnode(node), + &armada_370_xp_msi_domain_info, + armada_370_xp_msi_inner_domain); if (!armada_370_xp_msi_domain) { - kfree(msi_chip); + irq_domain_remove(armada_370_xp_msi_inner_domain); return -ENOMEM; } - ret = of_pci_msi_chip_add(msi_chip); - if (ret < 0) { - irq_domain_remove(armada_370_xp_msi_domain); - kfree(msi_chip); - return ret; - } - reg = readl(per_cpu_int_base + ARMADA_370_XP_IN_DRBEL_MSK_OFFS) | PCI_MSI_DOORBELL_MASK; @@ -280,7 +256,7 @@ static int armada_xp_set_affinity(struct irq_data *d, #endif static struct irq_chip armada_370_xp_irq_chip = { - .name = "armada_370_xp_irq", + .name = "MPIC", .irq_mask = armada_370_xp_irq_mask, .irq_mask_ack = armada_370_xp_irq_mask, .irq_unmask = armada_370_xp_irq_unmask, @@ -427,12 +403,12 @@ static void armada_370_xp_handle_msi_irq(struct pt_regs *regs, bool is_chained) continue; if (is_chained) { - irq = irq_find_mapping(armada_370_xp_msi_domain, - msinr - 16); + irq = 
irq_find_mapping(armada_370_xp_msi_inner_domain, + msinr - PCI_MSI_DOORBELL_START); generic_handle_irq(irq); } else { - irq = msinr - 16; - handle_domain_irq(armada_370_xp_msi_domain, + irq = msinr - PCI_MSI_DOORBELL_START; + handle_domain_irq(armada_370_xp_msi_inner_domain, irq, regs); } } @@ -604,8 +580,8 @@ static int __init armada_370_xp_mpic_of_init(struct device_node *node, armada_370_xp_mpic_domain = irq_domain_add_linear(node, nr_irqs, &armada_370_xp_mpic_irq_ops, NULL); - BUG_ON(!armada_370_xp_mpic_domain); + armada_370_xp_mpic_domain->bus_token = DOMAIN_BUS_WIRED; /* Setup for the boot CPU */ armada_xp_mpic_perf_init(); diff --git a/drivers/irqchip/irq-ath79-cpu.c b/drivers/irqchip/irq-ath79-cpu.c new file mode 100644 index 000000000000..befe93c5a51a --- /dev/null +++ b/drivers/irqchip/irq-ath79-cpu.c @@ -0,0 +1,97 @@ +/* + * Atheros AR71xx/AR724x/AR913x specific interrupt handling + * + * Copyright (C) 2015 Alban Bedel + * Copyright (C) 2010-2011 Jaiganesh Narayanan + * Copyright (C) 2008-2011 Gabor Juhos + * Copyright (C) 2008 Imre Kaloz + * + * Parts of this file are based on Atheros' 2.6.15/2.6.31 BSP + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published + * by the Free Software Foundation. + */ + +#include +#include +#include + +#include +#include + +/* + * The IP2/IP3 lines are tied to a PCI/WMAC/USB device. Drivers for + * these devices typically allocate coherent DMA memory, however the + * DMA controller may still have some unsynchronized data in the FIFO. + * Issue a flush in the handlers to ensure that the driver sees + * the update. + * + * This array map the interrupt lines to the DDR write buffer channels. + */ + +static unsigned irq_wb_chan[8] = { + -1, -1, -1, -1, -1, -1, -1, -1, +}; + +asmlinkage void plat_irq_dispatch(void) +{ + unsigned long pending; + int irq; + + pending = read_c0_status() & read_c0_cause() & ST0_IM; + + if (!pending) { + spurious_interrupt(); + return; + } + + pending >>= CAUSEB_IP; + while (pending) { + irq = fls(pending) - 1; + if (irq < ARRAY_SIZE(irq_wb_chan) && irq_wb_chan[irq] != -1) + ath79_ddr_wb_flush(irq_wb_chan[irq]); + do_IRQ(MIPS_CPU_IRQ_BASE + irq); + pending &= ~BIT(irq); + } +} + +static int __init ar79_cpu_intc_of_init( + struct device_node *node, struct device_node *parent) +{ + int err, i, count; + + /* Fill the irq_wb_chan table */ + count = of_count_phandle_with_args( + node, "qca,ddr-wb-channels", "#qca,ddr-wb-channel-cells"); + + for (i = 0; i < count; i++) { + struct of_phandle_args args; + u32 irq = i; + + of_property_read_u32_index( + node, "qca,ddr-wb-channel-interrupts", i, &irq); + if (irq >= ARRAY_SIZE(irq_wb_chan)) + continue; + + err = of_parse_phandle_with_args( + node, "qca,ddr-wb-channels", + "#qca,ddr-wb-channel-cells", + i, &args); + if (err) + return err; + + irq_wb_chan[irq] = args.args[0]; + } + + return mips_cpu_irq_of_init(node, parent); +} +IRQCHIP_DECLARE(ar79_cpu_intc, "qca,ar7100-cpu-intc", + ar79_cpu_intc_of_init); + +void __init ath79_cpu_irq_init(unsigned irq_wb_chan2, unsigned irq_wb_chan3) +{ + irq_wb_chan[2] = irq_wb_chan2; + irq_wb_chan[3] = irq_wb_chan3; + mips_cpu_irq_init(); +} diff --git a/drivers/irqchip/irq-ath79-misc.c b/drivers/irqchip/irq-ath79-misc.c new file mode 100644 index 000000000000..aa7290784636 --- /dev/null +++ b/drivers/irqchip/irq-ath79-misc.c @@ -0,0 +1,189 @@ +/* + * Atheros AR71xx/AR724x/AR913x MISC interrupt controller + * + * Copyright (C) 2015 Alban Bedel + * 
Copyright (C) 2010-2011 Jaiganesh Narayanan + * Copyright (C) 2008-2011 Gabor Juhos + * Copyright (C) 2008 Imre Kaloz + * + * Parts of this file are based on Atheros' 2.6.15/2.6.31 BSP + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published + * by the Free Software Foundation. + */ + +#include +#include +#include +#include + +#define AR71XX_RESET_REG_MISC_INT_STATUS 0 +#define AR71XX_RESET_REG_MISC_INT_ENABLE 4 + +#define ATH79_MISC_IRQ_COUNT 32 + +static void ath79_misc_irq_handler(struct irq_desc *desc) +{ + struct irq_domain *domain = irq_desc_get_handler_data(desc); + struct irq_chip *chip = irq_desc_get_chip(desc); + void __iomem *base = domain->host_data; + u32 pending; + + chained_irq_enter(chip, desc); + + pending = __raw_readl(base + AR71XX_RESET_REG_MISC_INT_STATUS) & + __raw_readl(base + AR71XX_RESET_REG_MISC_INT_ENABLE); + + if (!pending) { + spurious_interrupt(); + chained_irq_exit(chip, desc); + return; + } + + while (pending) { + int bit = __ffs(pending); + + generic_handle_irq(irq_linear_revmap(domain, bit)); + pending &= ~BIT(bit); + } + + chained_irq_exit(chip, desc); +} + +static void ar71xx_misc_irq_unmask(struct irq_data *d) +{ + void __iomem *base = irq_data_get_irq_chip_data(d); + unsigned int irq = d->hwirq; + u32 t; + + t = __raw_readl(base + AR71XX_RESET_REG_MISC_INT_ENABLE); + __raw_writel(t | BIT(irq), base + AR71XX_RESET_REG_MISC_INT_ENABLE); + + /* flush write */ + __raw_readl(base + AR71XX_RESET_REG_MISC_INT_ENABLE); +} + +static void ar71xx_misc_irq_mask(struct irq_data *d) +{ + void __iomem *base = irq_data_get_irq_chip_data(d); + unsigned int irq = d->hwirq; + u32 t; + + t = __raw_readl(base + AR71XX_RESET_REG_MISC_INT_ENABLE); + __raw_writel(t & ~BIT(irq), base + AR71XX_RESET_REG_MISC_INT_ENABLE); + + /* flush write */ + __raw_readl(base + AR71XX_RESET_REG_MISC_INT_ENABLE); +} + +static void ar724x_misc_irq_ack(struct irq_data *d) +{ + void __iomem *base = irq_data_get_irq_chip_data(d); + unsigned int irq = d->hwirq; + u32 t; + + t = __raw_readl(base + AR71XX_RESET_REG_MISC_INT_STATUS); + __raw_writel(t & ~BIT(irq), base + AR71XX_RESET_REG_MISC_INT_STATUS); + + /* flush write */ + __raw_readl(base + AR71XX_RESET_REG_MISC_INT_STATUS); +} + +static struct irq_chip ath79_misc_irq_chip = { + .name = "MISC", + .irq_unmask = ar71xx_misc_irq_unmask, + .irq_mask = ar71xx_misc_irq_mask, +}; + +static int misc_map(struct irq_domain *d, unsigned int irq, irq_hw_number_t hw) +{ + irq_set_chip_and_handler(irq, &ath79_misc_irq_chip, handle_level_irq); + irq_set_chip_data(irq, d->host_data); + return 0; +} + +static const struct irq_domain_ops misc_irq_domain_ops = { + .xlate = irq_domain_xlate_onecell, + .map = misc_map, +}; + +static void __init ath79_misc_intc_domain_init( + struct irq_domain *domain, int irq) +{ + void __iomem *base = domain->host_data; + + /* Disable and clear all interrupts */ + __raw_writel(0, base + AR71XX_RESET_REG_MISC_INT_ENABLE); + __raw_writel(0, base + AR71XX_RESET_REG_MISC_INT_STATUS); + + irq_set_chained_handler_and_data(irq, ath79_misc_irq_handler, domain); +} + +static int __init ath79_misc_intc_of_init( + struct device_node *node, struct device_node *parent) +{ + struct irq_domain *domain; + void __iomem *base; + int irq; + + irq = irq_of_parse_and_map(node, 0); + if (!irq) { + pr_err("Failed to get MISC IRQ\n"); + return -EINVAL; + } + + base = of_iomap(node, 0); + if (!base) { + pr_err("Failed to get MISC IRQ registers\n"); + return 
-ENOMEM; + } + + domain = irq_domain_add_linear(node, ATH79_MISC_IRQ_COUNT, + &misc_irq_domain_ops, base); + if (!domain) { + pr_err("Failed to add MISC irqdomain\n"); + return -EINVAL; + } + + ath79_misc_intc_domain_init(domain, irq); + return 0; +} + +static int __init ar7100_misc_intc_of_init( + struct device_node *node, struct device_node *parent) +{ + ath79_misc_irq_chip.irq_mask_ack = ar71xx_misc_irq_mask; + return ath79_misc_intc_of_init(node, parent); +} + +IRQCHIP_DECLARE(ar7100_misc_intc, "qca,ar7100-misc-intc", + ar7100_misc_intc_of_init); + +static int __init ar7240_misc_intc_of_init( + struct device_node *node, struct device_node *parent) +{ + ath79_misc_irq_chip.irq_ack = ar724x_misc_irq_ack; + return ath79_misc_intc_of_init(node, parent); +} + +IRQCHIP_DECLARE(ar7240_misc_intc, "qca,ar7240-misc-intc", + ar7240_misc_intc_of_init); + +void __init ath79_misc_irq_init(void __iomem *regs, int irq, + int irq_base, bool is_ar71xx) +{ + struct irq_domain *domain; + + if (is_ar71xx) + ath79_misc_irq_chip.irq_mask_ack = ar71xx_misc_irq_mask; + else + ath79_misc_irq_chip.irq_ack = ar724x_misc_irq_ack; + + domain = irq_domain_add_legacy(NULL, ATH79_MISC_IRQ_COUNT, + irq_base, 0, &misc_irq_domain_ops, regs); + if (!domain) + panic("Failed to create MISC irqdomain"); + + ath79_misc_intc_domain_init(domain, irq); +} diff --git a/drivers/irqchip/irq-atmel-aic-common.c b/drivers/irqchip/irq-atmel-aic-common.c index 37199b9b2cfa..28b26c80f4cf 100644 --- a/drivers/irqchip/irq-atmel-aic-common.c +++ b/drivers/irqchip/irq-atmel-aic-common.c @@ -80,16 +80,10 @@ int aic_common_set_type(struct irq_data *d, unsigned type, unsigned *val) return 0; } -int aic_common_set_priority(int priority, unsigned *val) +void aic_common_set_priority(int priority, unsigned *val) { - if (priority < AT91_AIC_IRQ_MIN_PRIORITY || - priority > AT91_AIC_IRQ_MAX_PRIORITY) - return -EINVAL; - *val &= ~AT91_AIC_PRIOR; *val |= priority; - - return 0; } int aic_common_irq_domain_xlate(struct irq_domain *d, @@ -193,7 +187,7 @@ void __init aic_common_rtt_irq_fixup(struct device_node *root) } } -void __init aic_common_irq_fixup(const struct of_device_id *matches) +static void __init aic_common_irq_fixup(const struct of_device_id *matches) { struct device_node *root = of_find_node_by_path("/"); const struct of_device_id *match; @@ -214,7 +208,8 @@ void __init aic_common_irq_fixup(const struct of_device_id *matches) struct irq_domain *__init aic_common_of_init(struct device_node *node, const struct irq_domain_ops *ops, - const char *name, int nirqs) + const char *name, int nirqs, + const struct of_device_id *matches) { struct irq_chip_generic *gc; struct irq_domain *domain; @@ -264,6 +259,7 @@ struct irq_domain *__init aic_common_of_init(struct device_node *node, } aic_common_ext_irq_of_init(domain); + aic_common_irq_fixup(matches); return domain; diff --git a/drivers/irqchip/irq-atmel-aic-common.h b/drivers/irqchip/irq-atmel-aic-common.h index 603f0a9d5411..af60376d50de 100644 --- a/drivers/irqchip/irq-atmel-aic-common.h +++ b/drivers/irqchip/irq-atmel-aic-common.h @@ -19,7 +19,7 @@ int aic_common_set_type(struct irq_data *d, unsigned type, unsigned *val); -int aic_common_set_priority(int priority, unsigned *val); +void aic_common_set_priority(int priority, unsigned *val); int aic_common_irq_domain_xlate(struct irq_domain *d, struct device_node *ctrlr, @@ -30,12 +30,11 @@ int aic_common_irq_domain_xlate(struct irq_domain *d, struct irq_domain *__init aic_common_of_init(struct device_node *node, const struct irq_domain_ops *ops, - 
const char *name, int nirqs); + const char *name, int nirqs, + const struct of_device_id *matches); void __init aic_common_rtc_irq_fixup(struct device_node *root); void __init aic_common_rtt_irq_fixup(struct device_node *root); -void __init aic_common_irq_fixup(const struct of_device_id *matches); - #endif /* __IRQ_ATMEL_AIC_COMMON_H */ diff --git a/drivers/irqchip/irq-atmel-aic.c b/drivers/irqchip/irq-atmel-aic.c index 8a0c7f288198..112e17c2768b 100644 --- a/drivers/irqchip/irq-atmel-aic.c +++ b/drivers/irqchip/irq-atmel-aic.c @@ -196,9 +196,8 @@ static int aic_irq_domain_xlate(struct irq_domain *d, irq_gc_lock(gc); smr = irq_reg_readl(gc, AT91_AIC_SMR(*out_hwirq)); - ret = aic_common_set_priority(intspec[2], &smr); - if (!ret) - irq_reg_writel(gc, smr, AT91_AIC_SMR(*out_hwirq)); + aic_common_set_priority(intspec[2], &smr); + irq_reg_writel(gc, smr, AT91_AIC_SMR(*out_hwirq)); irq_gc_unlock(gc); return ret; @@ -248,12 +247,10 @@ static int __init aic_of_init(struct device_node *node, return -EEXIST; domain = aic_common_of_init(node, &aic_irq_ops, "atmel-aic", - NR_AIC_IRQS); + NR_AIC_IRQS, aic_irq_fixups); if (IS_ERR(domain)) return PTR_ERR(domain); - aic_common_irq_fixup(aic_irq_fixups); - aic_domain = domain; gc = irq_get_domain_generic_chip(domain, 0); diff --git a/drivers/irqchip/irq-atmel-aic5.c b/drivers/irqchip/irq-atmel-aic5.c index 62bb840c613f..4f0d068e1abe 100644 --- a/drivers/irqchip/irq-atmel-aic5.c +++ b/drivers/irqchip/irq-atmel-aic5.c @@ -272,9 +272,8 @@ static int aic5_irq_domain_xlate(struct irq_domain *d, irq_gc_lock(bgc); irq_reg_writel(bgc, *out_hwirq, AT91_AIC5_SSR); smr = irq_reg_readl(bgc, AT91_AIC5_SMR); - ret = aic_common_set_priority(intspec[2], &smr); - if (!ret) - irq_reg_writel(bgc, intspec[2] | smr, AT91_AIC5_SMR); + aic_common_set_priority(intspec[2], &smr); + irq_reg_writel(bgc, smr, AT91_AIC5_SMR); irq_gc_unlock(bgc); return ret; @@ -312,12 +311,10 @@ static int __init aic5_of_init(struct device_node *node, return -EEXIST; domain = aic_common_of_init(node, &aic5_irq_ops, "atmel-aic5", - nirqs); + nirqs, aic5_irq_fixups); if (IS_ERR(domain)) return PTR_ERR(domain); - aic_common_irq_fixup(aic5_irq_fixups); - aic5_domain = domain; nchips = aic5_domain->revmap_size / 32; for (i = 0; i < nchips; i++) { diff --git a/drivers/irqchip/irq-bcm2836.c b/drivers/irqchip/irq-bcm2836.c index 963065a0d774..b6e950d4782a 100644 --- a/drivers/irqchip/irq-bcm2836.c +++ b/drivers/irqchip/irq-bcm2836.c @@ -229,7 +229,6 @@ int __init bcm2836_smp_boot_secondary(unsigned int cpu, unsigned long secondary_startup_phys = (unsigned long)virt_to_phys((void *)secondary_startup); - dsb(); writel(secondary_startup_phys, intc.base + LOCAL_MAILBOX3_SET0 + 16 * cpu); diff --git a/drivers/irqchip/irq-bcm6345-l1.c b/drivers/irqchip/irq-bcm6345-l1.c new file mode 100644 index 000000000000..b844c89a9506 --- /dev/null +++ b/drivers/irqchip/irq-bcm6345-l1.c @@ -0,0 +1,364 @@ +/* + * Broadcom BCM6345 style Level 1 interrupt controller driver + * + * Copyright (C) 2014 Broadcom Corporation + * Copyright 2015 Simon Arlott + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * This is based on the BCM7038 (which supports SMP) but with a single + * enable register instead of separate mask/set/clear registers. + * + * The BCM3380 has a similar mask/status register layout, but each pair + * of words is at separate locations (and SMP is not supported). 
+ * + * ENABLE/STATUS words are packed next to each other for each CPU: + * + * BCM6368: + * 0x1000_0020: CPU0_W0_ENABLE + * 0x1000_0024: CPU0_W1_ENABLE + * 0x1000_0028: CPU0_W0_STATUS IRQs 31-63 + * 0x1000_002c: CPU0_W1_STATUS IRQs 0-31 + * 0x1000_0030: CPU1_W0_ENABLE + * 0x1000_0034: CPU1_W1_ENABLE + * 0x1000_0038: CPU1_W0_STATUS IRQs 31-63 + * 0x1000_003c: CPU1_W1_STATUS IRQs 0-31 + * + * BCM63168: + * 0x1000_0020: CPU0_W0_ENABLE + * 0x1000_0024: CPU0_W1_ENABLE + * 0x1000_0028: CPU0_W2_ENABLE + * 0x1000_002c: CPU0_W3_ENABLE + * 0x1000_0030: CPU0_W0_STATUS IRQs 96-127 + * 0x1000_0034: CPU0_W1_STATUS IRQs 64-95 + * 0x1000_0038: CPU0_W2_STATUS IRQs 32-63 + * 0x1000_003c: CPU0_W3_STATUS IRQs 0-31 + * 0x1000_0040: CPU1_W0_ENABLE + * 0x1000_0044: CPU1_W1_ENABLE + * 0x1000_0048: CPU1_W2_ENABLE + * 0x1000_004c: CPU1_W3_ENABLE + * 0x1000_0050: CPU1_W0_STATUS IRQs 96-127 + * 0x1000_0054: CPU1_W1_STATUS IRQs 64-95 + * 0x1000_0058: CPU1_W2_STATUS IRQs 32-63 + * 0x1000_005c: CPU1_W3_STATUS IRQs 0-31 + * + * IRQs are numbered in CPU native endian order + * (which is big-endian in these examples) + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define IRQS_PER_WORD 32 +#define REG_BYTES_PER_IRQ_WORD (sizeof(u32) * 2) + +struct bcm6345_l1_cpu; + +struct bcm6345_l1_chip { + raw_spinlock_t lock; + unsigned int n_words; + struct irq_domain *domain; + struct cpumask cpumask; + struct bcm6345_l1_cpu *cpus[NR_CPUS]; +}; + +struct bcm6345_l1_cpu { + void __iomem *map_base; + unsigned int parent_irq; + u32 enable_cache[]; +}; + +static inline unsigned int reg_enable(struct bcm6345_l1_chip *intc, + unsigned int word) +{ +#ifdef __BIG_ENDIAN + return (1 * intc->n_words - word - 1) * sizeof(u32); +#else + return (0 * intc->n_words + word) * sizeof(u32); +#endif +} + +static inline unsigned int reg_status(struct bcm6345_l1_chip *intc, + unsigned int word) +{ +#ifdef __BIG_ENDIAN + return (2 * intc->n_words - word - 1) * sizeof(u32); +#else + return (1 * intc->n_words + word) * sizeof(u32); +#endif +} + +static inline unsigned int cpu_for_irq(struct bcm6345_l1_chip *intc, + struct irq_data *d) +{ + return cpumask_first_and(&intc->cpumask, irq_data_get_affinity_mask(d)); +} + +static void bcm6345_l1_irq_handle(struct irq_desc *desc) +{ + struct bcm6345_l1_chip *intc = irq_desc_get_handler_data(desc); + struct bcm6345_l1_cpu *cpu; + struct irq_chip *chip = irq_desc_get_chip(desc); + unsigned int idx; + +#ifdef CONFIG_SMP + cpu = intc->cpus[cpu_logical_map(smp_processor_id())]; +#else + cpu = intc->cpus[0]; +#endif + + chained_irq_enter(chip, desc); + + for (idx = 0; idx < intc->n_words; idx++) { + int base = idx * IRQS_PER_WORD; + unsigned long pending; + irq_hw_number_t hwirq; + unsigned int irq; + + pending = __raw_readl(cpu->map_base + reg_status(intc, idx)); + pending &= __raw_readl(cpu->map_base + reg_enable(intc, idx)); + + for_each_set_bit(hwirq, &pending, IRQS_PER_WORD) { + irq = irq_linear_revmap(intc->domain, base + hwirq); + if (irq) + do_IRQ(irq); + else + spurious_interrupt(); + } + } + + chained_irq_exit(chip, desc); +} + +static inline void __bcm6345_l1_unmask(struct irq_data *d) +{ + struct bcm6345_l1_chip *intc = irq_data_get_irq_chip_data(d); + u32 word = d->hwirq / IRQS_PER_WORD; + u32 mask = BIT(d->hwirq % IRQS_PER_WORD); + unsigned int cpu_idx = cpu_for_irq(intc, d); + + 
intc->cpus[cpu_idx]->enable_cache[word] |= mask; + __raw_writel(intc->cpus[cpu_idx]->enable_cache[word], + intc->cpus[cpu_idx]->map_base + reg_enable(intc, word)); +} + +static inline void __bcm6345_l1_mask(struct irq_data *d) +{ + struct bcm6345_l1_chip *intc = irq_data_get_irq_chip_data(d); + u32 word = d->hwirq / IRQS_PER_WORD; + u32 mask = BIT(d->hwirq % IRQS_PER_WORD); + unsigned int cpu_idx = cpu_for_irq(intc, d); + + intc->cpus[cpu_idx]->enable_cache[word] &= ~mask; + __raw_writel(intc->cpus[cpu_idx]->enable_cache[word], + intc->cpus[cpu_idx]->map_base + reg_enable(intc, word)); +} + +static void bcm6345_l1_unmask(struct irq_data *d) +{ + struct bcm6345_l1_chip *intc = irq_data_get_irq_chip_data(d); + unsigned long flags; + + raw_spin_lock_irqsave(&intc->lock, flags); + __bcm6345_l1_unmask(d); + raw_spin_unlock_irqrestore(&intc->lock, flags); +} + +static void bcm6345_l1_mask(struct irq_data *d) +{ + struct bcm6345_l1_chip *intc = irq_data_get_irq_chip_data(d); + unsigned long flags; + + raw_spin_lock_irqsave(&intc->lock, flags); + __bcm6345_l1_mask(d); + raw_spin_unlock_irqrestore(&intc->lock, flags); +} + +static int bcm6345_l1_set_affinity(struct irq_data *d, + const struct cpumask *dest, + bool force) +{ + struct bcm6345_l1_chip *intc = irq_data_get_irq_chip_data(d); + u32 word = d->hwirq / IRQS_PER_WORD; + u32 mask = BIT(d->hwirq % IRQS_PER_WORD); + unsigned int old_cpu = cpu_for_irq(intc, d); + unsigned int new_cpu; + struct cpumask valid; + unsigned long flags; + bool enabled; + + if (!cpumask_and(&valid, &intc->cpumask, dest)) + return -EINVAL; + + new_cpu = cpumask_any_and(&valid, cpu_online_mask); + if (new_cpu >= nr_cpu_ids) + return -EINVAL; + + dest = cpumask_of(new_cpu); + + raw_spin_lock_irqsave(&intc->lock, flags); + if (old_cpu != new_cpu) { + enabled = intc->cpus[old_cpu]->enable_cache[word] & mask; + if (enabled) + __bcm6345_l1_mask(d); + cpumask_copy(irq_data_get_affinity_mask(d), dest); + if (enabled) + __bcm6345_l1_unmask(d); + } else { + cpumask_copy(irq_data_get_affinity_mask(d), dest); + } + raw_spin_unlock_irqrestore(&intc->lock, flags); + + return IRQ_SET_MASK_OK_NOCOPY; +} + +static int __init bcm6345_l1_init_one(struct device_node *dn, + unsigned int idx, + struct bcm6345_l1_chip *intc) +{ + struct resource res; + resource_size_t sz; + struct bcm6345_l1_cpu *cpu; + unsigned int i, n_words; + + if (of_address_to_resource(dn, idx, &res)) + return -EINVAL; + sz = resource_size(&res); + n_words = sz / REG_BYTES_PER_IRQ_WORD; + + if (!intc->n_words) + intc->n_words = n_words; + else if (intc->n_words != n_words) + return -EINVAL; + + cpu = intc->cpus[idx] = kzalloc(sizeof(*cpu) + n_words * sizeof(u32), + GFP_KERNEL); + if (!cpu) + return -ENOMEM; + + cpu->map_base = ioremap(res.start, sz); + if (!cpu->map_base) + return -ENOMEM; + + for (i = 0; i < n_words; i++) { + cpu->enable_cache[i] = 0; + __raw_writel(0, cpu->map_base + reg_enable(intc, i)); + } + + cpu->parent_irq = irq_of_parse_and_map(dn, idx); + if (!cpu->parent_irq) { + pr_err("failed to map parent interrupt %d\n", cpu->parent_irq); + return -EINVAL; + } + irq_set_chained_handler_and_data(cpu->parent_irq, + bcm6345_l1_irq_handle, intc); + + return 0; +} + +static struct irq_chip bcm6345_l1_irq_chip = { + .name = "bcm6345-l1", + .irq_mask = bcm6345_l1_mask, + .irq_unmask = bcm6345_l1_unmask, + .irq_set_affinity = bcm6345_l1_set_affinity, +}; + +static int bcm6345_l1_map(struct irq_domain *d, unsigned int virq, + irq_hw_number_t hw_irq) +{ + irq_set_chip_and_handler(virq, + &bcm6345_l1_irq_chip, 
handle_percpu_irq); + irq_set_chip_data(virq, d->host_data); + return 0; +} + +static const struct irq_domain_ops bcm6345_l1_domain_ops = { + .xlate = irq_domain_xlate_onecell, + .map = bcm6345_l1_map, +}; + +static int __init bcm6345_l1_of_init(struct device_node *dn, + struct device_node *parent) +{ + struct bcm6345_l1_chip *intc; + unsigned int idx; + int ret; + + intc = kzalloc(sizeof(*intc), GFP_KERNEL); + if (!intc) + return -ENOMEM; + + for_each_possible_cpu(idx) { + ret = bcm6345_l1_init_one(dn, idx, intc); + if (ret) + pr_err("failed to init intc L1 for cpu %d: %d\n", + idx, ret); + else + cpumask_set_cpu(idx, &intc->cpumask); + } + + if (!cpumask_weight(&intc->cpumask)) { + ret = -ENODEV; + goto out_free; + } + + raw_spin_lock_init(&intc->lock); + + intc->domain = irq_domain_add_linear(dn, IRQS_PER_WORD * intc->n_words, + &bcm6345_l1_domain_ops, + intc); + if (!intc->domain) { + ret = -ENOMEM; + goto out_unmap; + } + + pr_info("registered BCM6345 L1 intc (IRQs: %d)\n", + IRQS_PER_WORD * intc->n_words); + for_each_cpu(idx, &intc->cpumask) { + struct bcm6345_l1_cpu *cpu = intc->cpus[idx]; + + pr_info(" CPU%u at MMIO 0x%p (irq = %d)\n", idx, + cpu->map_base, cpu->parent_irq); + } + + return 0; + +out_unmap: + for_each_possible_cpu(idx) { + struct bcm6345_l1_cpu *cpu = intc->cpus[idx]; + + if (cpu) { + if (cpu->map_base) + iounmap(cpu->map_base); + kfree(cpu); + } + } +out_free: + kfree(intc); + return ret; +} + +IRQCHIP_DECLARE(bcm6345_l1, "brcm,bcm6345-l1-intc", bcm6345_l1_of_init); diff --git a/drivers/irqchip/irq-gic-realview.c b/drivers/irqchip/irq-gic-realview.c index aa46eb280a7f..54c296401525 100644 --- a/drivers/irqchip/irq-gic-realview.c +++ b/drivers/irqchip/irq-gic-realview.c @@ -10,7 +10,8 @@ #include #define REALVIEW_SYS_LOCK_OFFSET 0x20 -#define REALVIEW_PB11MP_SYS_PLD_CTRL1 0x74 +#define REALVIEW_SYS_PLD_CTRL1 0x74 +#define REALVIEW_EB_REVB_SYS_PLD_CTRL1 0xD8 #define VERSATILE_LOCK_VAL 0xA05F #define PLD_INTMODE_MASK BIT(22)|BIT(23)|BIT(24) #define PLD_INTMODE_LEGACY 0x0 @@ -18,26 +19,57 @@ #define PLD_INTMODE_NEW_NO_DCC BIT(23) #define PLD_INTMODE_FIQ_ENABLE BIT(24) +/* For some reason RealView EB Rev B moved this register */ +static const struct of_device_id syscon_pldset_of_match[] = { + { + .compatible = "arm,realview-eb11mp-revb-syscon", + .data = (void *)REALVIEW_EB_REVB_SYS_PLD_CTRL1, + }, + { + .compatible = "arm,realview-eb11mp-revc-syscon", + .data = (void *)REALVIEW_SYS_PLD_CTRL1, + }, + { + .compatible = "arm,realview-eb-syscon", + .data = (void *)REALVIEW_SYS_PLD_CTRL1, + }, + { + .compatible = "arm,realview-pb11mp-syscon", + .data = (void *)REALVIEW_SYS_PLD_CTRL1, + }, + {}, +}; + static int __init realview_gic_of_init(struct device_node *node, struct device_node *parent) { static struct regmap *map; + struct device_node *np; + const struct of_device_id *gic_id; + u32 pld1_ctrl; + + np = of_find_matching_node_and_match(NULL, syscon_pldset_of_match, + &gic_id); + if (!np) + return -ENODEV; + pld1_ctrl = (u32)gic_id->data; /* The PB11MPCore GIC needs to be configured in the syscon */ - map = syscon_regmap_lookup_by_compatible("arm,realview-pb11mp-syscon"); + map = syscon_node_to_regmap(np); if (!IS_ERR(map)) { /* new irq mode with no DCC */ regmap_write(map, REALVIEW_SYS_LOCK_OFFSET, VERSATILE_LOCK_VAL); - regmap_update_bits(map, REALVIEW_PB11MP_SYS_PLD_CTRL1, + regmap_update_bits(map, pld1_ctrl, PLD_INTMODE_NEW_NO_DCC, PLD_INTMODE_MASK); regmap_write(map, REALVIEW_SYS_LOCK_OFFSET, 0x0000); - pr_info("TC11MP GIC: set up interrupt controller to NEW mode, no 
DCC\n"); + pr_info("RealView GIC: set up interrupt controller to NEW mode, no DCC\n"); } else { - pr_err("TC11MP GIC setup: could not find syscon\n"); - return -ENXIO; + pr_err("RealView GIC setup: could not find syscon\n"); + return -ENODEV; } return gic_of_init(node, parent); } IRQCHIP_DECLARE(armtc11mp_gic, "arm,tc11mp-gic", realview_gic_of_init); +IRQCHIP_DECLARE(armeb11mp_gic, "arm,eb11mp-gic", realview_gic_of_init); diff --git a/drivers/irqchip/irq-gic-v2m.c b/drivers/irqchip/irq-gic-v2m.c index c779f83e511d..28f047c61baa 100644 --- a/drivers/irqchip/irq-gic-v2m.c +++ b/drivers/irqchip/irq-gic-v2m.c @@ -92,18 +92,6 @@ static struct msi_domain_info gicv2m_msi_domain_info = { .chip = &gicv2m_msi_irq_chip, }; -static int gicv2m_set_affinity(struct irq_data *irq_data, - const struct cpumask *mask, bool force) -{ - int ret; - - ret = irq_chip_set_affinity_parent(irq_data, mask, force); - if (ret == IRQ_SET_MASK_OK) - ret = IRQ_SET_MASK_OK_DONE; - - return ret; -} - static void gicv2m_compose_msi_msg(struct irq_data *data, struct msi_msg *msg) { struct v2m_data *v2m = irq_data_get_irq_chip_data(data); @@ -122,7 +110,7 @@ static struct irq_chip gicv2m_irq_chip = { .irq_mask = irq_chip_mask_parent, .irq_unmask = irq_chip_unmask_parent, .irq_eoi = irq_chip_eoi_parent, - .irq_set_affinity = gicv2m_set_affinity, + .irq_set_affinity = irq_chip_set_affinity_parent, .irq_compose_msi_msg = gicv2m_compose_msi_msg, }; diff --git a/drivers/irqchip/irq-gic-v3-its.c b/drivers/irqchip/irq-gic-v3-its.c index 43dfd15c1dd2..39261798c59f 100644 --- a/drivers/irqchip/irq-gic-v3-its.c +++ b/drivers/irqchip/irq-gic-v3-its.c @@ -103,7 +103,6 @@ struct its_device { static LIST_HEAD(its_nodes); static DEFINE_SPINLOCK(its_lock); -static struct device_node *gic_root_node; static struct rdists *gic_rdists; #define gic_data_rdist() (raw_cpu_ptr(gic_rdists->rdist)) @@ -671,7 +670,7 @@ static int its_chunk_to_lpi(int chunk) return (chunk << IRQS_PER_CHUNK_SHIFT) + 8192; } -static int its_lpi_init(u32 id_bits) +static int __init its_lpi_init(u32 id_bits) { lpi_chunks = its_lpi_to_chunk(1UL << id_bits); @@ -1430,7 +1429,8 @@ static void its_enable_quirks(struct its_node *its) gic_enable_quirks(iidr, its_quirks, its); } -static int its_probe(struct device_node *node, struct irq_domain *parent) +static int __init its_probe(struct device_node *node, + struct irq_domain *parent) { struct resource res; struct its_node *its; @@ -1591,7 +1591,7 @@ static struct of_device_id its_device_id[] = { {}, }; -int its_init(struct device_node *node, struct rdists *rdists, +int __init its_init(struct device_node *node, struct rdists *rdists, struct irq_domain *parent_domain) { struct device_node *np; @@ -1607,8 +1607,6 @@ int its_init(struct device_node *node, struct rdists *rdists, } gic_rdists = rdists; - gic_root_node = node; - its_alloc_lpi_tables(); its_lpi_init(rdists->id_bits); diff --git a/drivers/irqchip/irq-gic-v3.c b/drivers/irqchip/irq-gic-v3.c index d7be6ddc34f6..5b7d3c2129d8 100644 --- a/drivers/irqchip/irq-gic-v3.c +++ b/drivers/irqchip/irq-gic-v3.c @@ -15,10 +15,12 @@ * along with this program. If not, see . 
*/ +#include #include #include #include #include +#include #include #include #include @@ -38,6 +40,7 @@ struct redist_region { void __iomem *redist_base; phys_addr_t phys_base; + bool single_redist; }; struct gic_chip_data { @@ -434,6 +437,9 @@ static int gic_populate_rdist(void) return 0; } + if (gic_data.redist_regions[i].single_redist) + break; + if (gic_data.redist_stride) { ptr += gic_data.redist_stride; } else { @@ -634,7 +640,7 @@ static int gic_set_affinity(struct irq_data *d, const struct cpumask *mask_val, else gic_dist_wait_for_rwp(); - return IRQ_SET_MASK_OK; + return IRQ_SET_MASK_OK_DONE; } #else #define gic_set_affinity NULL @@ -764,6 +770,15 @@ static int gic_irq_domain_translate(struct irq_domain *d, return 0; } + if (is_fwnode_irqchip(fwspec->fwnode)) { + if(fwspec->param_count != 2) + return -EINVAL; + + *hwirq = fwspec->param[0]; + *type = fwspec->param[1]; + return 0; + } + return -EINVAL; } @@ -811,17 +826,88 @@ static void gicv3_enable_quirks(void) #endif } +static int __init gic_init_bases(void __iomem *dist_base, + struct redist_region *rdist_regs, + u32 nr_redist_regions, + u64 redist_stride, + struct fwnode_handle *handle) +{ + struct device_node *node; + u32 typer; + int gic_irqs; + int err; + + if (!is_hyp_mode_available()) + static_key_slow_dec(&supports_deactivate); + + if (static_key_true(&supports_deactivate)) + pr_info("GIC: Using split EOI/Deactivate mode\n"); + + gic_data.dist_base = dist_base; + gic_data.redist_regions = rdist_regs; + gic_data.nr_redist_regions = nr_redist_regions; + gic_data.redist_stride = redist_stride; + + gicv3_enable_quirks(); + + /* + * Find out how many interrupts are supported. + * The GIC only supports up to 1020 interrupt sources (SGI+PPI+SPI) + */ + typer = readl_relaxed(gic_data.dist_base + GICD_TYPER); + gic_data.rdists.id_bits = GICD_TYPER_ID_BITS(typer); + gic_irqs = GICD_TYPER_IRQS(typer); + if (gic_irqs > 1020) + gic_irqs = 1020; + gic_data.irq_nr = gic_irqs; + + gic_data.domain = irq_domain_create_tree(handle, &gic_irq_domain_ops, + &gic_data); + gic_data.rdists.rdist = alloc_percpu(typeof(*gic_data.rdists.rdist)); + + if (WARN_ON(!gic_data.domain) || WARN_ON(!gic_data.rdists.rdist)) { + err = -ENOMEM; + goto out_free; + } + + set_handle_irq(gic_handle_irq); + + node = to_of_node(handle); + if (IS_ENABLED(CONFIG_ARM_GIC_V3_ITS) && gic_dist_supports_lpis() && + node) /* Temp hack to prevent ITS init for ACPI */ + its_init(node, &gic_data.rdists, gic_data.domain); + + gic_smp_init(); + gic_dist_init(); + gic_cpu_init(); + gic_cpu_pm_init(); + + return 0; + +out_free: + if (gic_data.domain) + irq_domain_remove(gic_data.domain); + free_percpu(gic_data.rdists.rdist); + return err; +} + +static int __init gic_validate_dist_version(void __iomem *dist_base) +{ + u32 reg = readl_relaxed(dist_base + GICD_PIDR2) & GIC_PIDR2_ARCH_MASK; + + if (reg != GIC_PIDR2_ARCH_GICv3 && reg != GIC_PIDR2_ARCH_GICv4) + return -ENODEV; + + return 0; +} + static int __init gic_of_init(struct device_node *node, struct device_node *parent) { void __iomem *dist_base; struct redist_region *rdist_regs; u64 redist_stride; u32 nr_redist_regions; - u32 typer; - u32 reg; - int gic_irqs; - int err; - int i; + int err, i; dist_base = of_iomap(node, 0); if (!dist_base) { @@ -830,11 +916,10 @@ static int __init gic_of_init(struct device_node *node, struct device_node *pare return -ENXIO; } - reg = readl_relaxed(dist_base + GICD_PIDR2) & GIC_PIDR2_ARCH_MASK; - if (reg != GIC_PIDR2_ARCH_GICv3 && reg != GIC_PIDR2_ARCH_GICv4) { + err = 
gic_validate_dist_version(dist_base); + if (err) { pr_err("%s: no distributor detected, giving up\n", node->full_name); - err = -ENODEV; goto out_unmap_dist; } @@ -865,55 +950,11 @@ static int __init gic_of_init(struct device_node *node, struct device_node *pare if (of_property_read_u64(node, "redistributor-stride", &redist_stride)) redist_stride = 0; - if (!is_hyp_mode_available()) - static_key_slow_dec(&supports_deactivate); + err = gic_init_bases(dist_base, rdist_regs, nr_redist_regions, + redist_stride, &node->fwnode); + if (!err) + return 0; - if (static_key_true(&supports_deactivate)) - pr_info("GIC: Using split EOI/Deactivate mode\n"); - - gic_data.dist_base = dist_base; - gic_data.redist_regions = rdist_regs; - gic_data.nr_redist_regions = nr_redist_regions; - gic_data.redist_stride = redist_stride; - - gicv3_enable_quirks(); - - /* - * Find out how many interrupts are supported. - * The GIC only supports up to 1020 interrupt sources (SGI+PPI+SPI) - */ - typer = readl_relaxed(gic_data.dist_base + GICD_TYPER); - gic_data.rdists.id_bits = GICD_TYPER_ID_BITS(typer); - gic_irqs = GICD_TYPER_IRQS(typer); - if (gic_irqs > 1020) - gic_irqs = 1020; - gic_data.irq_nr = gic_irqs; - - gic_data.domain = irq_domain_add_tree(node, &gic_irq_domain_ops, - &gic_data); - gic_data.rdists.rdist = alloc_percpu(typeof(*gic_data.rdists.rdist)); - - if (WARN_ON(!gic_data.domain) || WARN_ON(!gic_data.rdists.rdist)) { - err = -ENOMEM; - goto out_free; - } - - set_handle_irq(gic_handle_irq); - - if (IS_ENABLED(CONFIG_ARM_GIC_V3_ITS) && gic_dist_supports_lpis()) - its_init(node, &gic_data.rdists, gic_data.domain); - - gic_smp_init(); - gic_dist_init(); - gic_cpu_init(); - gic_cpu_pm_init(); - - return 0; - -out_free: - if (gic_data.domain) - irq_domain_remove(gic_data.domain); - free_percpu(gic_data.rdists.rdist); out_unmap_rdist: for (i = 0; i < nr_redist_regions; i++) if (rdist_regs[i].redist_base) @@ -925,3 +966,213 @@ out_unmap_dist: } IRQCHIP_DECLARE(gic_v3, "arm,gic-v3", gic_of_init); + +#ifdef CONFIG_ACPI +static void __iomem *dist_base; +static struct redist_region *redist_regs __initdata; +static u32 nr_redist_regions __initdata; +static bool single_redist; + +static void __init +gic_acpi_register_redist(phys_addr_t phys_base, void __iomem *redist_base) +{ + static int count = 0; + + redist_regs[count].phys_base = phys_base; + redist_regs[count].redist_base = redist_base; + redist_regs[count].single_redist = single_redist; + count++; +} + +static int __init +gic_acpi_parse_madt_redist(struct acpi_subtable_header *header, + const unsigned long end) +{ + struct acpi_madt_generic_redistributor *redist = + (struct acpi_madt_generic_redistributor *)header; + void __iomem *redist_base; + + redist_base = ioremap(redist->base_address, redist->length); + if (!redist_base) { + pr_err("Couldn't map GICR region @%llx\n", redist->base_address); + return -ENOMEM; + } + + gic_acpi_register_redist(redist->base_address, redist_base); + return 0; +} + +static int __init +gic_acpi_parse_madt_gicc(struct acpi_subtable_header *header, + const unsigned long end) +{ + struct acpi_madt_generic_interrupt *gicc = + (struct acpi_madt_generic_interrupt *)header; + u32 reg = readl_relaxed(dist_base + GICD_PIDR2) & GIC_PIDR2_ARCH_MASK; + u32 size = reg == GIC_PIDR2_ARCH_GICv4 ? 
SZ_64K * 4 : SZ_64K * 2; + void __iomem *redist_base; + + redist_base = ioremap(gicc->gicr_base_address, size); + if (!redist_base) + return -ENOMEM; + + gic_acpi_register_redist(gicc->gicr_base_address, redist_base); + return 0; +} + +static int __init gic_acpi_collect_gicr_base(void) +{ + acpi_tbl_entry_handler redist_parser; + enum acpi_madt_type type; + + if (single_redist) { + type = ACPI_MADT_TYPE_GENERIC_INTERRUPT; + redist_parser = gic_acpi_parse_madt_gicc; + } else { + type = ACPI_MADT_TYPE_GENERIC_REDISTRIBUTOR; + redist_parser = gic_acpi_parse_madt_redist; + } + + /* Collect redistributor base addresses in GICR entries */ + if (acpi_table_parse_madt(type, redist_parser, 0) > 0) + return 0; + + pr_info("No valid GICR entries exist\n"); + return -ENODEV; +} + +static int __init gic_acpi_match_gicr(struct acpi_subtable_header *header, + const unsigned long end) +{ + /* Subtable presence means that redist exists, that's it */ + return 0; +} + +static int __init gic_acpi_match_gicc(struct acpi_subtable_header *header, + const unsigned long end) +{ + struct acpi_madt_generic_interrupt *gicc = + (struct acpi_madt_generic_interrupt *)header; + + /* + * If GICC is enabled and has valid gicr base address, then it means + * GICR base is presented via GICC + */ + if ((gicc->flags & ACPI_MADT_ENABLED) && gicc->gicr_base_address) + return 0; + + return -ENODEV; +} + +static int __init gic_acpi_count_gicr_regions(void) +{ + int count; + + /* + * Count how many redistributor regions we have. It is not allowed + * to mix redistributor description, GICR and GICC subtables have to be + * mutually exclusive. + */ + count = acpi_table_parse_madt(ACPI_MADT_TYPE_GENERIC_REDISTRIBUTOR, + gic_acpi_match_gicr, 0); + if (count > 0) { + single_redist = false; + return count; + } + + count = acpi_table_parse_madt(ACPI_MADT_TYPE_GENERIC_INTERRUPT, + gic_acpi_match_gicc, 0); + if (count > 0) + single_redist = true; + + return count; +} + +static bool __init acpi_validate_gic_table(struct acpi_subtable_header *header, + struct acpi_probe_entry *ape) +{ + struct acpi_madt_generic_distributor *dist; + int count; + + dist = (struct acpi_madt_generic_distributor *)header; + if (dist->version != ape->driver_data) + return false; + + /* We need to do that exercise anyway, the sooner the better */ + count = gic_acpi_count_gicr_regions(); + if (count <= 0) + return false; + + nr_redist_regions = count; + return true; +} + +#define ACPI_GICV3_DIST_MEM_SIZE (SZ_64K) + +static int __init +gic_acpi_init(struct acpi_subtable_header *header, const unsigned long end) +{ + struct acpi_madt_generic_distributor *dist; + struct fwnode_handle *domain_handle; + int i, err; + + /* Get distributor base address */ + dist = (struct acpi_madt_generic_distributor *)header; + dist_base = ioremap(dist->base_address, ACPI_GICV3_DIST_MEM_SIZE); + if (!dist_base) { + pr_err("Unable to map GICD registers\n"); + return -ENOMEM; + } + + err = gic_validate_dist_version(dist_base); + if (err) { + pr_err("No distributor detected at @%p, giving up", dist_base); + goto out_dist_unmap; + } + + redist_regs = kzalloc(sizeof(*redist_regs) * nr_redist_regions, + GFP_KERNEL); + if (!redist_regs) { + err = -ENOMEM; + goto out_dist_unmap; + } + + err = gic_acpi_collect_gicr_base(); + if (err) + goto out_redist_unmap; + + domain_handle = irq_domain_alloc_fwnode(dist_base); + if (!domain_handle) { + err = -ENOMEM; + goto out_redist_unmap; + } + + err = gic_init_bases(dist_base, redist_regs, nr_redist_regions, 0, + domain_handle); + if (err) + goto 
out_fwhandle_free; + + acpi_set_irq_model(ACPI_IRQ_MODEL_GIC, domain_handle); + return 0; + +out_fwhandle_free: + irq_domain_free_fwnode(domain_handle); +out_redist_unmap: + for (i = 0; i < nr_redist_regions; i++) + if (redist_regs[i].redist_base) + iounmap(redist_regs[i].redist_base); + kfree(redist_regs); +out_dist_unmap: + iounmap(dist_base); + return err; +} +IRQCHIP_ACPI_DECLARE(gic_v3, ACPI_MADT_TYPE_GENERIC_DISTRIBUTOR, + acpi_validate_gic_table, ACPI_MADT_GIC_VERSION_V3, + gic_acpi_init); +IRQCHIP_ACPI_DECLARE(gic_v4, ACPI_MADT_TYPE_GENERIC_DISTRIBUTOR, + acpi_validate_gic_table, ACPI_MADT_GIC_VERSION_V4, + gic_acpi_init); +IRQCHIP_ACPI_DECLARE(gic_v3_or_v4, ACPI_MADT_TYPE_GENERIC_DISTRIBUTOR, + acpi_validate_gic_table, ACPI_MADT_GIC_VERSION_NONE, + gic_acpi_init); +#endif diff --git a/drivers/irqchip/irq-gic.c b/drivers/irqchip/irq-gic.c index 8f9ebf714e2b..282344b95ec2 100644 --- a/drivers/irqchip/irq-gic.c +++ b/drivers/irqchip/irq-gic.c @@ -319,7 +319,7 @@ static int gic_set_affinity(struct irq_data *d, const struct cpumask *mask_val, writel_relaxed(val | bit, reg); raw_spin_unlock_irqrestore(&irq_controller_lock, flags); - return IRQ_SET_MASK_OK; + return IRQ_SET_MASK_OK_DONE; } #endif diff --git a/drivers/irqchip/irq-mips-gic.c b/drivers/irqchip/irq-mips-gic.c index 9e17ef27a183..94a30da0cfac 100644 --- a/drivers/irqchip/irq-mips-gic.c +++ b/drivers/irqchip/irq-mips-gic.c @@ -29,16 +29,32 @@ struct gic_pcpu_mask { DECLARE_BITMAP(pcpu_mask, GIC_MAX_INTRS); }; +struct gic_irq_spec { + enum { + GIC_DEVICE, + GIC_IPI + } type; + + union { + struct cpumask *ipimask; + unsigned int hwirq; + }; +}; + static unsigned long __gic_base_addr; + static void __iomem *gic_base; static struct gic_pcpu_mask pcpu_masks[NR_CPUS]; static DEFINE_SPINLOCK(gic_lock); static struct irq_domain *gic_irq_domain; +static struct irq_domain *gic_dev_domain; +static struct irq_domain *gic_ipi_domain; static int gic_shared_intrs; static int gic_vpes; static unsigned int gic_cpu_pin; static unsigned int timer_cpu_pin; static struct irq_chip gic_level_irq_controller, gic_edge_irq_controller; +DECLARE_BITMAP(ipi_resrv, GIC_MAX_INTRS); static void __gic_irq_dispatch(void); @@ -264,9 +280,11 @@ static void gic_bind_eic_interrupt(int irq, int set) GIC_VPE_EIC_SS(irq), set); } -void gic_send_ipi(unsigned int intr) +static void gic_send_ipi(struct irq_data *d, unsigned int cpu) { - gic_write(GIC_REG(SHARED, GIC_SH_WEDGE), GIC_SH_WEDGE_SET(intr)); + irq_hw_number_t hwirq = GIC_HWIRQ_TO_SHARED(irqd_to_hwirq(d)); + + gic_write(GIC_REG(SHARED, GIC_SH_WEDGE), GIC_SH_WEDGE_SET(hwirq)); } int gic_get_c0_compare_int(void) @@ -449,7 +467,7 @@ static int gic_set_affinity(struct irq_data *d, const struct cpumask *cpumask, gic_map_to_vpe(irq, mips_cm_vp_id(cpumask_first(&tmp))); /* Update the pcpu_masks */ - for (i = 0; i < NR_CPUS; i++) + for (i = 0; i < gic_vpes; i++) clear_bit(irq, pcpu_masks[i].pcpu_mask); set_bit(irq, pcpu_masks[cpumask_first(&tmp)].pcpu_mask); @@ -479,6 +497,7 @@ static struct irq_chip gic_edge_irq_controller = { #ifdef CONFIG_SMP .irq_set_affinity = gic_set_affinity, #endif + .ipi_send_single = gic_send_ipi, }; static void gic_handle_local_int(bool chained) @@ -572,83 +591,6 @@ static void gic_irq_dispatch(struct irq_desc *desc) gic_handle_shared_int(true); } -#ifdef CONFIG_MIPS_GIC_IPI -static int gic_resched_int_base; -static int gic_call_int_base; - -unsigned int plat_ipi_resched_int_xlate(unsigned int cpu) -{ - return gic_resched_int_base + cpu; -} - -unsigned int plat_ipi_call_int_xlate(unsigned 
int cpu) -{ - return gic_call_int_base + cpu; -} - -static irqreturn_t ipi_resched_interrupt(int irq, void *dev_id) -{ - scheduler_ipi(); - - return IRQ_HANDLED; -} - -static irqreturn_t ipi_call_interrupt(int irq, void *dev_id) -{ - generic_smp_call_function_interrupt(); - - return IRQ_HANDLED; -} - -static struct irqaction irq_resched = { - .handler = ipi_resched_interrupt, - .flags = IRQF_PERCPU, - .name = "IPI resched" -}; - -static struct irqaction irq_call = { - .handler = ipi_call_interrupt, - .flags = IRQF_PERCPU, - .name = "IPI call" -}; - -static __init void gic_ipi_init_one(unsigned int intr, int cpu, - struct irqaction *action) -{ - int virq = irq_create_mapping(gic_irq_domain, - GIC_SHARED_TO_HWIRQ(intr)); - int i; - - gic_map_to_vpe(intr, mips_cm_vp_id(cpu)); - for (i = 0; i < NR_CPUS; i++) - clear_bit(intr, pcpu_masks[i].pcpu_mask); - set_bit(intr, pcpu_masks[cpu].pcpu_mask); - - irq_set_irq_type(virq, IRQ_TYPE_EDGE_RISING); - - irq_set_handler(virq, handle_percpu_irq); - setup_irq(virq, action); -} - -static __init void gic_ipi_init(void) -{ - int i; - - /* Use last 2 * NR_CPUS interrupts as IPIs */ - gic_resched_int_base = gic_shared_intrs - nr_cpu_ids; - gic_call_int_base = gic_resched_int_base - nr_cpu_ids; - - for (i = 0; i < nr_cpu_ids; i++) { - gic_ipi_init_one(gic_call_int_base + i, i, &irq_call); - gic_ipi_init_one(gic_resched_int_base + i, i, &irq_resched); - } -} -#else -static inline void gic_ipi_init(void) -{ -} -#endif - static void __init gic_basic_init(void) { unsigned int i; @@ -753,19 +695,21 @@ static int gic_local_irq_domain_map(struct irq_domain *d, unsigned int virq, } static int gic_shared_irq_domain_map(struct irq_domain *d, unsigned int virq, - irq_hw_number_t hw) + irq_hw_number_t hw, unsigned int vpe) { int intr = GIC_HWIRQ_TO_SHARED(hw); unsigned long flags; + int i; irq_set_chip_and_handler(virq, &gic_level_irq_controller, handle_level_irq); spin_lock_irqsave(&gic_lock, flags); gic_map_to_pin(intr, gic_cpu_pin); - /* Map to VPE 0 by default */ - gic_map_to_vpe(intr, 0); - set_bit(intr, pcpu_masks[0].pcpu_mask); + gic_map_to_vpe(intr, vpe); + for (i = 0; i < gic_vpes; i++) + clear_bit(intr, pcpu_masks[i].pcpu_mask); + set_bit(intr, pcpu_masks[vpe].pcpu_mask); spin_unlock_irqrestore(&gic_lock, flags); return 0; @@ -776,10 +720,93 @@ static int gic_irq_domain_map(struct irq_domain *d, unsigned int virq, { if (GIC_HWIRQ_TO_LOCAL(hw) < GIC_NUM_LOCAL_INTRS) return gic_local_irq_domain_map(d, virq, hw); - return gic_shared_irq_domain_map(d, virq, hw); + return gic_shared_irq_domain_map(d, virq, hw, 0); } -static int gic_irq_domain_xlate(struct irq_domain *d, struct device_node *ctrlr, +static int gic_irq_domain_alloc(struct irq_domain *d, unsigned int virq, + unsigned int nr_irqs, void *arg) +{ + struct gic_irq_spec *spec = arg; + irq_hw_number_t hwirq, base_hwirq; + int cpu, ret, i; + + if (spec->type == GIC_DEVICE) { + /* verify that it doesn't conflict with an IPI irq */ + if (test_bit(spec->hwirq, ipi_resrv)) + return -EBUSY; + } else { + base_hwirq = find_first_bit(ipi_resrv, gic_shared_intrs); + if (base_hwirq == gic_shared_intrs) { + return -ENOMEM; + } + + /* check that we have enough space */ + for (i = base_hwirq; i < nr_irqs; i++) { + if (!test_bit(i, ipi_resrv)) + return -EBUSY; + } + bitmap_clear(ipi_resrv, base_hwirq, nr_irqs); + + /* map the hwirq for each cpu consecutively */ + i = 0; + for_each_cpu(cpu, spec->ipimask) { + hwirq = GIC_SHARED_TO_HWIRQ(base_hwirq + i); + + ret = irq_domain_set_hwirq_and_chip(d, virq + i, hwirq, + 
&gic_edge_irq_controller, + NULL); + if (ret) + goto error; + + ret = gic_shared_irq_domain_map(d, virq + i, hwirq, cpu); + if (ret) + goto error; + + i++; + } + + /* + * tell the parent about the base hwirq we allocated so it can + * set its own domain data + */ + spec->hwirq = base_hwirq; + } + + return 0; +error: + bitmap_set(ipi_resrv, base_hwirq, nr_irqs); + return ret; +} + +void gic_irq_domain_free(struct irq_domain *d, unsigned int virq, + unsigned int nr_irqs) +{ + irq_hw_number_t base_hwirq; + struct irq_data *data; + + data = irq_get_irq_data(virq); + if (!data) + return; + + base_hwirq = GIC_HWIRQ_TO_SHARED(irqd_to_hwirq(data)); + bitmap_set(ipi_resrv, base_hwirq, nr_irqs); +} + +int gic_irq_domain_match(struct irq_domain *d, struct device_node *node, + enum irq_domain_bus_token bus_token) +{ + /* this domain should'nt be accessed directly */ + return 0; +} + +static const struct irq_domain_ops gic_irq_domain_ops = { + .map = gic_irq_domain_map, + .alloc = gic_irq_domain_alloc, + .free = gic_irq_domain_free, + .match = gic_irq_domain_match, +}; + +static int gic_dev_domain_xlate(struct irq_domain *d, struct device_node *ctrlr, const u32 *intspec, unsigned int intsize, irq_hw_number_t *out_hwirq, unsigned int *out_type) @@ -798,9 +825,130 @@ static int gic_irq_domain_xlate(struct irq_domain *d, struct device_node *ctrlr, return 0; } -static const struct irq_domain_ops gic_irq_domain_ops = { - .map = gic_irq_domain_map, - .xlate = gic_irq_domain_xlate, +static int gic_dev_domain_alloc(struct irq_domain *d, unsigned int virq, + unsigned int nr_irqs, void *arg) +{ + struct irq_fwspec *fwspec = arg; + struct gic_irq_spec spec = { + .type = GIC_DEVICE, + .hwirq = fwspec->param[1], + }; + int i, ret; + bool is_shared = fwspec->param[0] == GIC_SHARED; + + if (is_shared) { + ret = irq_domain_alloc_irqs_parent(d, virq, nr_irqs, &spec); + if (ret) + return ret; + } + + for (i = 0; i < nr_irqs; i++) { + irq_hw_number_t hwirq; + + if (is_shared) + hwirq = GIC_SHARED_TO_HWIRQ(spec.hwirq + i); + else + hwirq = GIC_LOCAL_TO_HWIRQ(spec.hwirq + i); + + ret = irq_domain_set_hwirq_and_chip(d, virq + i, + hwirq, + &gic_level_irq_controller, + NULL); + if (ret) + return ret; + } + + return 0; +} + +void gic_dev_domain_free(struct irq_domain *d, unsigned int virq, + unsigned int nr_irqs) +{ + /* no real allocation is done for dev irqs, so no need to free anything */ + return; +} + +static struct irq_domain_ops gic_dev_domain_ops = { + .xlate = gic_dev_domain_xlate, + .alloc = gic_dev_domain_alloc, + .free = gic_dev_domain_free, +}; + +static int gic_ipi_domain_xlate(struct irq_domain *d, struct device_node *ctrlr, + const u32 *intspec, unsigned int intsize, + irq_hw_number_t *out_hwirq, + unsigned int *out_type) +{ + /* + * There's nothing to translate here. hwirq is dynamically allocated and + * the irq type is always edge triggered. 
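The GIC_IPI path above reserves a block of shared interrupts and maps one hwirq per CPU in the supplied ipimask; consumers are expected to reach it through the generic IPI layer rather than through this domain directly (gic_irq_domain_match() hides the root domain, and gic_ipi_domain_match() only answers DOMAIN_BUS_IPI lookups). A rough sketch of that usage, assuming the irq_reserve_ipi()/ipi_send_single() API introduced alongside this series; the function and variable names below are illustrative, not part of the patch:

	#include <linux/cpumask.h>
	#include <linux/irq.h>
	#include <linux/irqdomain.h>
	#include <linux/of.h>

	static unsigned int example_ipi_virq;

	static void example_ipi_setup(struct device_node *gic_node)
	{
		struct irq_domain *ipidomain;

		/* gic_ipi_domain_match() answers this DOMAIN_BUS_IPI lookup */
		ipidomain = irq_find_matching_host(gic_node, DOMAIN_BUS_IPI);
		if (!ipidomain)
			return;

		/* one IPI per possible CPU; returns a virq usable with ipi_send_*() */
		example_ipi_virq = irq_reserve_ipi(ipidomain, cpu_possible_mask);
	}

	static void example_kick(unsigned int cpu)
	{
		ipi_send_single(example_ipi_virq, cpu);
	}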
+ * */ + *out_hwirq = 0; + *out_type = IRQ_TYPE_EDGE_RISING; + + return 0; +} + +static int gic_ipi_domain_alloc(struct irq_domain *d, unsigned int virq, + unsigned int nr_irqs, void *arg) +{ + struct cpumask *ipimask = arg; + struct gic_irq_spec spec = { + .type = GIC_IPI, + .ipimask = ipimask + }; + int ret, i; + + ret = irq_domain_alloc_irqs_parent(d, virq, nr_irqs, &spec); + if (ret) + return ret; + + /* the parent should have set spec.hwirq to the base_hwirq it allocated */ + for (i = 0; i < nr_irqs; i++) { + ret = irq_domain_set_hwirq_and_chip(d, virq + i, + GIC_SHARED_TO_HWIRQ(spec.hwirq + i), + &gic_edge_irq_controller, + NULL); + if (ret) + goto error; + + ret = irq_set_irq_type(virq + i, IRQ_TYPE_EDGE_RISING); + if (ret) + goto error; + } + + return 0; +error: + irq_domain_free_irqs_parent(d, virq, nr_irqs); + return ret; +} + +void gic_ipi_domain_free(struct irq_domain *d, unsigned int virq, + unsigned int nr_irqs) +{ + irq_domain_free_irqs_parent(d, virq, nr_irqs); +} + +int gic_ipi_domain_match(struct irq_domain *d, struct device_node *node, + enum irq_domain_bus_token bus_token) +{ + bool is_ipi; + + switch (bus_token) { + case DOMAIN_BUS_IPI: + is_ipi = d->bus_token == bus_token; + return to_of_node(d->fwnode) == node && is_ipi; + break; + default: + return 0; + } +} + +static struct irq_domain_ops gic_ipi_domain_ops = { + .xlate = gic_ipi_domain_xlate, + .alloc = gic_ipi_domain_alloc, + .free = gic_ipi_domain_free, + .match = gic_ipi_domain_match, }; static void __init __gic_init(unsigned long gic_base_addr, @@ -809,6 +957,7 @@ static void __init __gic_init(unsigned long gic_base_addr, struct device_node *node) { unsigned int gicconfig; + unsigned int v[2]; __gic_base_addr = gic_base_addr; @@ -864,9 +1013,32 @@ static void __init __gic_init(unsigned long gic_base_addr, if (!gic_irq_domain) panic("Failed to add GIC IRQ domain"); - gic_basic_init(); + gic_dev_domain = irq_domain_add_hierarchy(gic_irq_domain, 0, + GIC_NUM_LOCAL_INTRS + gic_shared_intrs, + node, &gic_dev_domain_ops, NULL); + if (!gic_dev_domain) + panic("Failed to add GIC DEV domain"); - gic_ipi_init(); + gic_ipi_domain = irq_domain_add_hierarchy(gic_irq_domain, + IRQ_DOMAIN_FLAG_IPI_PER_CPU, + GIC_NUM_LOCAL_INTRS + gic_shared_intrs, + node, &gic_ipi_domain_ops, NULL); + if (!gic_ipi_domain) + panic("Failed to add GIC IPI domain"); + + gic_ipi_domain->bus_token = DOMAIN_BUS_IPI; + + if (node && + !of_property_read_u32_array(node, "mti,reserved-ipi-vectors", v, 2)) { + bitmap_set(ipi_resrv, v[0], v[1]); + } else { + /* Make the last 2 * gic_vpes available for IPIs */ + bitmap_set(ipi_resrv, + gic_shared_intrs - 2 * gic_vpes, + 2 * gic_vpes); + } + + gic_basic_init(); } void __init gic_init(unsigned long gic_base_addr, diff --git a/drivers/irqchip/irq-mvebu-odmi.c b/drivers/irqchip/irq-mvebu-odmi.c new file mode 100644 index 000000000000..b4d367868dbb --- /dev/null +++ b/drivers/irqchip/irq-mvebu-odmi.c @@ -0,0 +1,236 @@ +/* + * Copyright (C) 2016 Marvell + * + * Thomas Petazzoni + * + * This file is licensed under the terms of the GNU General Public + * License version 2. This program is licensed "as is" without any + * warranty of any kind, whether express or implied. 
+ */ + +#define pr_fmt(fmt) "GIC-ODMI: " fmt + +#include +#include +#include +#include +#include +#include +#include +#include + +#define GICP_ODMIN_SET 0x40 +#define GICP_ODMI_INT_NUM_SHIFT 12 +#define GICP_ODMIN_GM_EP_R0 0x110 +#define GICP_ODMIN_GM_EP_R1 0x114 +#define GICP_ODMIN_GM_EA_R0 0x108 +#define GICP_ODMIN_GM_EA_R1 0x118 + +/* + * We don't support the group events, so we simply have 8 interrupts + * per frame. + */ +#define NODMIS_SHIFT 3 +#define NODMIS_PER_FRAME (1 << NODMIS_SHIFT) +#define NODMIS_MASK (NODMIS_PER_FRAME - 1) + +struct odmi_data { + struct resource res; + void __iomem *base; + unsigned int spi_base; +}; + +static struct odmi_data *odmis; +static unsigned long *odmis_bm; +static unsigned int odmis_count; + +/* Protects odmis_bm */ +static DEFINE_SPINLOCK(odmis_bm_lock); + +static void odmi_compose_msi_msg(struct irq_data *d, struct msi_msg *msg) +{ + struct odmi_data *odmi; + phys_addr_t addr; + unsigned int odmin; + + if (WARN_ON(d->hwirq >= odmis_count * NODMIS_PER_FRAME)) + return; + + odmi = &odmis[d->hwirq >> NODMIS_SHIFT]; + odmin = d->hwirq & NODMIS_MASK; + + addr = odmi->res.start + GICP_ODMIN_SET; + + msg->address_hi = upper_32_bits(addr); + msg->address_lo = lower_32_bits(addr); + msg->data = odmin << GICP_ODMI_INT_NUM_SHIFT; +} + +static struct irq_chip odmi_irq_chip = { + .name = "ODMI", + .irq_mask = irq_chip_mask_parent, + .irq_unmask = irq_chip_unmask_parent, + .irq_eoi = irq_chip_eoi_parent, + .irq_set_affinity = irq_chip_set_affinity_parent, + .irq_compose_msi_msg = odmi_compose_msi_msg, +}; + +static int odmi_irq_domain_alloc(struct irq_domain *domain, unsigned int virq, + unsigned int nr_irqs, void *args) +{ + struct odmi_data *odmi = NULL; + struct irq_fwspec fwspec; + struct irq_data *d; + unsigned int hwirq, odmin; + int ret; + + spin_lock(&odmis_bm_lock); + hwirq = find_first_zero_bit(odmis_bm, NODMIS_PER_FRAME * odmis_count); + if (hwirq >= NODMIS_PER_FRAME * odmis_count) { + spin_unlock(&odmis_bm_lock); + return -ENOSPC; + } + + __set_bit(hwirq, odmis_bm); + spin_unlock(&odmis_bm_lock); + + odmi = &odmis[hwirq >> NODMIS_SHIFT]; + odmin = hwirq & NODMIS_MASK; + + fwspec.fwnode = domain->parent->fwnode; + fwspec.param_count = 3; + fwspec.param[0] = GIC_SPI; + fwspec.param[1] = odmi->spi_base - 32 + odmin; + fwspec.param[2] = IRQ_TYPE_EDGE_RISING; + + ret = irq_domain_alloc_irqs_parent(domain, virq, 1, &fwspec); + if (ret) { + pr_err("Cannot allocate parent IRQ\n"); + spin_lock(&odmis_bm_lock); + __clear_bit(odmin, odmis_bm); + spin_unlock(&odmis_bm_lock); + return ret; + } + + /* Configure the interrupt line to be edge */ + d = irq_domain_get_irq_data(domain->parent, virq); + d->chip->irq_set_type(d, IRQ_TYPE_EDGE_RISING); + + irq_domain_set_hwirq_and_chip(domain, virq, hwirq, + &odmi_irq_chip, NULL); + + return 0; +} + +static void odmi_irq_domain_free(struct irq_domain *domain, + unsigned int virq, unsigned int nr_irqs) +{ + struct irq_data *d = irq_domain_get_irq_data(domain, virq); + + if (d->hwirq >= odmis_count * NODMIS_PER_FRAME) { + pr_err("Failed to teardown msi. 
Invalid hwirq %lu\n", d->hwirq); + return; + } + + irq_domain_free_irqs_parent(domain, virq, nr_irqs); + + /* Actually free the MSI */ + spin_lock(&odmis_bm_lock); + __clear_bit(d->hwirq, odmis_bm); + spin_unlock(&odmis_bm_lock); +} + +static const struct irq_domain_ops odmi_domain_ops = { + .alloc = odmi_irq_domain_alloc, + .free = odmi_irq_domain_free, +}; + +static struct irq_chip odmi_msi_irq_chip = { + .name = "ODMI", +}; + +static struct msi_domain_ops odmi_msi_ops = { +}; + +static struct msi_domain_info odmi_msi_domain_info = { + .flags = (MSI_FLAG_USE_DEF_DOM_OPS | MSI_FLAG_USE_DEF_CHIP_OPS), + .ops = &odmi_msi_ops, + .chip = &odmi_msi_irq_chip, +}; + +static int __init mvebu_odmi_init(struct device_node *node, + struct device_node *parent) +{ + struct irq_domain *inner_domain, *plat_domain; + int ret, i; + + if (of_property_read_u32(node, "marvell,odmi-frames", &odmis_count)) + return -EINVAL; + + odmis = kcalloc(odmis_count, sizeof(struct odmi_data), GFP_KERNEL); + if (!odmis) + return -ENOMEM; + + odmis_bm = kcalloc(BITS_TO_LONGS(odmis_count * NODMIS_PER_FRAME), + sizeof(long), GFP_KERNEL); + if (!odmis_bm) { + ret = -ENOMEM; + goto err_alloc; + } + + for (i = 0; i < odmis_count; i++) { + struct odmi_data *odmi = &odmis[i]; + + ret = of_address_to_resource(node, i, &odmi->res); + if (ret) + goto err_unmap; + + odmi->base = of_io_request_and_map(node, i, "odmi"); + if (IS_ERR(odmi->base)) { + ret = PTR_ERR(odmi->base); + goto err_unmap; + } + + if (of_property_read_u32_index(node, "marvell,spi-base", + i, &odmi->spi_base)) { + ret = -EINVAL; + goto err_unmap; + } + } + + inner_domain = irq_domain_create_linear(of_node_to_fwnode(node), + odmis_count * NODMIS_PER_FRAME, + &odmi_domain_ops, NULL); + if (!inner_domain) { + ret = -ENOMEM; + goto err_unmap; + } + + inner_domain->parent = irq_find_host(parent); + + plat_domain = platform_msi_create_irq_domain(of_node_to_fwnode(node), + &odmi_msi_domain_info, + inner_domain); + if (!plat_domain) { + ret = -ENOMEM; + goto err_remove_inner; + } + + return 0; + +err_remove_inner: + irq_domain_remove(inner_domain); +err_unmap: + for (i = 0; i < odmis_count; i++) { + struct odmi_data *odmi = &odmis[i]; + + if (odmi->base && !IS_ERR(odmi->base)) + iounmap(odmis[i].base); + } + kfree(odmis_bm); +err_alloc: + kfree(odmis); + return ret; +} + +IRQCHIP_DECLARE(mvebu_odmi, "marvell,odmi-controller", mvebu_odmi_init); diff --git a/drivers/irqchip/irq-mxs.c b/drivers/irqchip/irq-mxs.c index efe50845939d..17304705f2cf 100644 --- a/drivers/irqchip/irq-mxs.c +++ b/drivers/irqchip/irq-mxs.c @@ -183,7 +183,7 @@ static void __iomem * __init icoll_init_iobase(struct device_node *np) void __iomem *icoll_base; icoll_base = of_io_request_and_map(np, 0, np->name); - if (!icoll_base) + if (IS_ERR(icoll_base)) panic("%s: unable to map resource", np->full_name); return icoll_base; } diff --git a/drivers/irqchip/irq-sunxi-nmi.c b/drivers/irqchip/irq-sunxi-nmi.c index 0820f67cc9a7..668730c5cb66 100644 --- a/drivers/irqchip/irq-sunxi-nmi.c +++ b/drivers/irqchip/irq-sunxi-nmi.c @@ -160,9 +160,9 @@ static int __init sunxi_sc_nmi_irq_init(struct device_node *node, gc = irq_get_domain_generic_chip(domain, 0); gc->reg_base = of_io_request_and_map(node, 0, of_node_full_name(node)); - if (!gc->reg_base) { + if (IS_ERR(gc->reg_base)) { pr_err("unable to map resource\n"); - ret = -ENOMEM; + ret = PTR_ERR(gc->reg_base); goto fail_irqd_remove; } diff --git a/drivers/irqchip/irq-tango.c b/drivers/irqchip/irq-tango.c new file mode 100644 index 000000000000..bdbb5c0ff7fe --- 
/dev/null +++ b/drivers/irqchip/irq-tango.c @@ -0,0 +1,232 @@ +/* + * Copyright (C) 2014 Mans Rullgard + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the + * Free Software Foundation; either version 2 of the License, or (at your + * option) any later version. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define IRQ0_CTL_BASE 0x0000 +#define IRQ1_CTL_BASE 0x0100 +#define EDGE_CTL_BASE 0x0200 +#define IRQ2_CTL_BASE 0x0300 + +#define IRQ_CTL_HI 0x18 +#define EDGE_CTL_HI 0x20 + +#define IRQ_STATUS 0x00 +#define IRQ_RAWSTAT 0x04 +#define IRQ_EN_SET 0x08 +#define IRQ_EN_CLR 0x0c +#define IRQ_SOFT_SET 0x10 +#define IRQ_SOFT_CLR 0x14 + +#define EDGE_STATUS 0x00 +#define EDGE_RAWSTAT 0x04 +#define EDGE_CFG_RISE 0x08 +#define EDGE_CFG_FALL 0x0c +#define EDGE_CFG_RISE_SET 0x10 +#define EDGE_CFG_RISE_CLR 0x14 +#define EDGE_CFG_FALL_SET 0x18 +#define EDGE_CFG_FALL_CLR 0x1c + +struct tangox_irq_chip { + void __iomem *base; + unsigned long ctl; +}; + +static inline u32 intc_readl(struct tangox_irq_chip *chip, int reg) +{ + return readl_relaxed(chip->base + reg); +} + +static inline void intc_writel(struct tangox_irq_chip *chip, int reg, u32 val) +{ + writel_relaxed(val, chip->base + reg); +} + +static void tangox_dispatch_irqs(struct irq_domain *dom, unsigned int status, + int base) +{ + unsigned int hwirq; + unsigned int virq; + + while (status) { + hwirq = __ffs(status); + virq = irq_find_mapping(dom, base + hwirq); + if (virq) + generic_handle_irq(virq); + status &= ~BIT(hwirq); + } +} + +static void tangox_irq_handler(struct irq_desc *desc) +{ + struct irq_domain *dom = irq_desc_get_handler_data(desc); + struct irq_chip *host_chip = irq_desc_get_chip(desc); + struct tangox_irq_chip *chip = dom->host_data; + unsigned int status_lo, status_hi; + + chained_irq_enter(host_chip, desc); + + status_lo = intc_readl(chip, chip->ctl + IRQ_STATUS); + status_hi = intc_readl(chip, chip->ctl + IRQ_CTL_HI + IRQ_STATUS); + + tangox_dispatch_irqs(dom, status_lo, 0); + tangox_dispatch_irqs(dom, status_hi, 32); + + chained_irq_exit(host_chip, desc); +} + +static int tangox_irq_set_type(struct irq_data *d, unsigned int flow_type) +{ + struct irq_chip_generic *gc = irq_data_get_irq_chip_data(d); + struct tangox_irq_chip *chip = gc->domain->host_data; + struct irq_chip_regs *regs = &gc->chip_types[0].regs; + + switch (flow_type & IRQ_TYPE_SENSE_MASK) { + case IRQ_TYPE_EDGE_RISING: + intc_writel(chip, regs->type + EDGE_CFG_RISE_SET, d->mask); + intc_writel(chip, regs->type + EDGE_CFG_FALL_CLR, d->mask); + break; + + case IRQ_TYPE_EDGE_FALLING: + intc_writel(chip, regs->type + EDGE_CFG_RISE_CLR, d->mask); + intc_writel(chip, regs->type + EDGE_CFG_FALL_SET, d->mask); + break; + + case IRQ_TYPE_LEVEL_HIGH: + intc_writel(chip, regs->type + EDGE_CFG_RISE_CLR, d->mask); + intc_writel(chip, regs->type + EDGE_CFG_FALL_CLR, d->mask); + break; + + case IRQ_TYPE_LEVEL_LOW: + intc_writel(chip, regs->type + EDGE_CFG_RISE_SET, d->mask); + intc_writel(chip, regs->type + EDGE_CFG_FALL_SET, d->mask); + break; + + default: + pr_err("Invalid trigger mode %x for IRQ %d\n", + flow_type, d->irq); + return -EINVAL; + } + + return irq_setup_alt_chip(d, flow_type); +} + +static void __init tangox_irq_init_chip(struct irq_chip_generic *gc, + unsigned long ctl_offs, + unsigned long edge_offs) +{ + struct tangox_irq_chip *chip = gc->domain->host_data; + struct irq_chip_type *ct = gc->chip_types; 
+ unsigned long ctl_base = chip->ctl + ctl_offs; + unsigned long edge_base = EDGE_CTL_BASE + edge_offs; + int i; + + gc->reg_base = chip->base; + gc->unused = 0; + + for (i = 0; i < 2; i++) { + ct[i].chip.irq_ack = irq_gc_ack_set_bit; + ct[i].chip.irq_mask = irq_gc_mask_disable_reg; + ct[i].chip.irq_mask_ack = irq_gc_mask_disable_reg_and_ack; + ct[i].chip.irq_unmask = irq_gc_unmask_enable_reg; + ct[i].chip.irq_set_type = tangox_irq_set_type; + ct[i].chip.name = gc->domain->name; + + ct[i].regs.enable = ctl_base + IRQ_EN_SET; + ct[i].regs.disable = ctl_base + IRQ_EN_CLR; + ct[i].regs.ack = edge_base + EDGE_RAWSTAT; + ct[i].regs.type = edge_base; + } + + ct[0].type = IRQ_TYPE_LEVEL_MASK; + ct[0].handler = handle_level_irq; + + ct[1].type = IRQ_TYPE_EDGE_BOTH; + ct[1].handler = handle_edge_irq; + + intc_writel(chip, ct->regs.disable, 0xffffffff); + intc_writel(chip, ct->regs.ack, 0xffffffff); +} + +static void __init tangox_irq_domain_init(struct irq_domain *dom) +{ + struct irq_chip_generic *gc; + int i; + + for (i = 0; i < 2; i++) { + gc = irq_get_domain_generic_chip(dom, i * 32); + tangox_irq_init_chip(gc, i * IRQ_CTL_HI, i * EDGE_CTL_HI); + } +} + +static int __init tangox_irq_init(void __iomem *base, struct resource *baseres, + struct device_node *node) +{ + struct tangox_irq_chip *chip; + struct irq_domain *dom; + struct resource res; + int irq; + int err; + + irq = irq_of_parse_and_map(node, 0); + if (!irq) + panic("%s: failed to get IRQ", node->name); + + err = of_address_to_resource(node, 0, &res); + if (err) + panic("%s: failed to get address", node->name); + + chip = kzalloc(sizeof(*chip), GFP_KERNEL); + chip->ctl = res.start - baseres->start; + chip->base = base; + + dom = irq_domain_add_linear(node, 64, &irq_generic_chip_ops, chip); + if (!dom) + panic("%s: failed to create irqdomain", node->name); + + err = irq_alloc_domain_generic_chips(dom, 32, 2, node->name, + handle_level_irq, 0, 0, 0); + if (err) + panic("%s: failed to allocate irqchip", node->name); + + tangox_irq_domain_init(dom); + + irq_set_chained_handler(irq, tangox_irq_handler); + irq_set_handler_data(irq, dom); + + return 0; +} + +static int __init tangox_of_irq_init(struct device_node *node, + struct device_node *parent) +{ + struct device_node *c; + struct resource res; + void __iomem *base; + + base = of_iomap(node, 0); + if (!base) + panic("%s: of_iomap failed", node->name); + + of_address_to_resource(node, 0, &res); + + for_each_child_of_node(node, c) + tangox_irq_init(base, &res, c); + + return 0; +} +IRQCHIP_DECLARE(tangox_intc, "sigma,smp8642-intc", tangox_of_irq_init); diff --git a/drivers/irqchip/irq-ts4800.c b/drivers/irqchip/irq-ts4800.c index 4192bdcd2734..2325fb3c482b 100644 --- a/drivers/irqchip/irq-ts4800.c +++ b/drivers/irqchip/irq-ts4800.c @@ -59,7 +59,7 @@ static int ts4800_irqdomain_map(struct irq_domain *d, unsigned int irq, return 0; } -struct irq_domain_ops ts4800_ic_ops = { +static const struct irq_domain_ops ts4800_ic_ops = { .map = ts4800_irqdomain_map, .xlate = irq_domain_xlate_onecell, }; diff --git a/drivers/net/ethernet/intel/Kconfig b/drivers/net/ethernet/intel/Kconfig index fa593dd3efe1..3772f3ac956e 100644 --- a/drivers/net/ethernet/intel/Kconfig +++ b/drivers/net/ethernet/intel/Kconfig @@ -83,6 +83,15 @@ config E1000E To compile this driver as a module, choose M here. The module will be called e1000e. 
+config E1000E_HWTS + bool "Support HW cross-timestamp on PCH devices" + default y + depends on E1000E && X86 + ---help--- + Say Y to enable hardware supported cross-timestamping on PCH + devices. The cross-timestamp is available through the PTP clock + driver precise cross-timestamp ioctl (PTP_SYS_OFFSET_PRECISE). + config IGB tristate "Intel(R) 82575/82576 PCI-Express Gigabit Ethernet support" depends on PCI diff --git a/drivers/net/ethernet/intel/e1000e/defines.h b/drivers/net/ethernet/intel/e1000e/defines.h index f7c7804d79e5..0641c0098738 100644 --- a/drivers/net/ethernet/intel/e1000e/defines.h +++ b/drivers/net/ethernet/intel/e1000e/defines.h @@ -528,6 +528,11 @@ #define E1000_RXCW_C 0x20000000 /* Receive config */ #define E1000_RXCW_SYNCH 0x40000000 /* Receive config synch */ +/* HH Time Sync */ +#define E1000_TSYNCTXCTL_MAX_ALLOWED_DLY_MASK 0x0000F000 /* max delay */ +#define E1000_TSYNCTXCTL_SYNC_COMP 0x40000000 /* sync complete */ +#define E1000_TSYNCTXCTL_START_SYNC 0x80000000 /* initiate sync */ + #define E1000_TSYNCTXCTL_VALID 0x00000001 /* Tx timestamp valid */ #define E1000_TSYNCTXCTL_ENABLED 0x00000010 /* enable Tx timestamping */ diff --git a/drivers/net/ethernet/intel/e1000e/ptp.c b/drivers/net/ethernet/intel/e1000e/ptp.c index 25a0ad5102d6..e2ff3ef75d5d 100644 --- a/drivers/net/ethernet/intel/e1000e/ptp.c +++ b/drivers/net/ethernet/intel/e1000e/ptp.c @@ -26,6 +26,12 @@ #include "e1000.h" +#ifdef CONFIG_E1000E_HWTS +#include +#include +#include +#endif + /** * e1000e_phc_adjfreq - adjust the frequency of the hardware clock * @ptp: ptp clock structure @@ -98,6 +104,78 @@ static int e1000e_phc_adjtime(struct ptp_clock_info *ptp, s64 delta) return 0; } +#ifdef CONFIG_E1000E_HWTS +#define MAX_HW_WAIT_COUNT (3) + +/** + * e1000e_phc_get_syncdevicetime - Callback given to timekeeping code reads system/device registers + * @device: current device time + * @system: system counter value read synchronously with device time + * @ctx: context provided by timekeeping code + * + * Read device and system (ART) clock simultaneously and return the corrected + * clock values in ns. + **/ +static int e1000e_phc_get_syncdevicetime(ktime_t *device, + struct system_counterval_t *system, + void *ctx) +{ + struct e1000_adapter *adapter = (struct e1000_adapter *)ctx; + struct e1000_hw *hw = &adapter->hw; + unsigned long flags; + int i; + u32 tsync_ctrl; + cycle_t dev_cycles; + cycle_t sys_cycles; + + tsync_ctrl = er32(TSYNCTXCTL); + tsync_ctrl |= E1000_TSYNCTXCTL_START_SYNC | + E1000_TSYNCTXCTL_MAX_ALLOWED_DLY_MASK; + ew32(TSYNCTXCTL, tsync_ctrl); + for (i = 0; i < MAX_HW_WAIT_COUNT; ++i) { + udelay(1); + tsync_ctrl = er32(TSYNCTXCTL); + if (tsync_ctrl & E1000_TSYNCTXCTL_SYNC_COMP) + break; + } + + if (i == MAX_HW_WAIT_COUNT) + return -ETIMEDOUT; + + dev_cycles = er32(SYSSTMPH); + dev_cycles <<= 32; + dev_cycles |= er32(SYSSTMPL); + spin_lock_irqsave(&adapter->systim_lock, flags); + *device = ns_to_ktime(timecounter_cyc2time(&adapter->tc, dev_cycles)); + spin_unlock_irqrestore(&adapter->systim_lock, flags); + + sys_cycles = er32(PLTSTMPH); + sys_cycles <<= 32; + sys_cycles |= er32(PLTSTMPL); + *system = convert_art_to_tsc(sys_cycles); + + return 0; +} + +/** + * e1000e_phc_getsynctime - Reads the current system/device cross timestamp + * @ptp: ptp clock structure + * @cts: structure containing timestamp + * + * Read device and system (ART) clock simultaneously and return the scaled + * clock values in ns. 
+ **/ +static int e1000e_phc_getcrosststamp(struct ptp_clock_info *ptp, + struct system_device_crosststamp *xtstamp) +{ + struct e1000_adapter *adapter = container_of(ptp, struct e1000_adapter, + ptp_clock_info); + + return get_device_system_crosststamp(e1000e_phc_get_syncdevicetime, + adapter, NULL, xtstamp); +} +#endif/*CONFIG_E1000E_HWTS*/ + /** * e1000e_phc_gettime - Reads the current time from the hardware clock * @ptp: ptp clock structure @@ -236,6 +314,13 @@ void e1000e_ptp_init(struct e1000_adapter *adapter) break; } +#ifdef CONFIG_E1000E_HWTS + /* CPU must have ART and GBe must be from Sunrise Point or greater */ + if (hw->mac.type >= e1000_pch_spt && boot_cpu_has(X86_FEATURE_ART)) + adapter->ptp_clock_info.getcrosststamp = + e1000e_phc_getcrosststamp; +#endif/*CONFIG_E1000E_HWTS*/ + INIT_DELAYED_WORK(&adapter->systim_overflow_work, e1000e_systim_overflow_work); diff --git a/drivers/net/ethernet/intel/e1000e/regs.h b/drivers/net/ethernet/intel/e1000e/regs.h index 1d5e0b77062a..0cb4d365e5ad 100644 --- a/drivers/net/ethernet/intel/e1000e/regs.h +++ b/drivers/net/ethernet/intel/e1000e/regs.h @@ -245,6 +245,10 @@ #define E1000_SYSTIML 0x0B600 /* System time register Low - RO */ #define E1000_SYSTIMH 0x0B604 /* System time register High - RO */ #define E1000_TIMINCA 0x0B608 /* Increment attributes register - RW */ +#define E1000_SYSSTMPL 0x0B648 /* HH Timesync system stamp low register */ +#define E1000_SYSSTMPH 0x0B64C /* HH Timesync system stamp hi register */ +#define E1000_PLTSTMPL 0x0B640 /* HH Timesync platform stamp low register */ +#define E1000_PLTSTMPH 0x0B644 /* HH Timesync platform stamp hi register */ #define E1000_RXMTRL 0x0B634 /* Time sync Rx EtherType and Msg Type - RW */ #define E1000_RXUDP 0x0B638 /* Time Sync Rx UDP Port - RW */ diff --git a/drivers/ptp/ptp_chardev.c b/drivers/ptp/ptp_chardev.c index da7bae991552..579fd65299a0 100644 --- a/drivers/ptp/ptp_chardev.c +++ b/drivers/ptp/ptp_chardev.c @@ -22,6 +22,7 @@ #include #include #include +#include #include "ptp_private.h" @@ -120,11 +121,13 @@ long ptp_ioctl(struct posix_clock *pc, unsigned int cmd, unsigned long arg) struct ptp_clock_caps caps; struct ptp_clock_request req; struct ptp_sys_offset *sysoff = NULL; + struct ptp_sys_offset_precise precise_offset; struct ptp_pin_desc pd; struct ptp_clock *ptp = container_of(pc, struct ptp_clock, clock); struct ptp_clock_info *ops = ptp->info; struct ptp_clock_time *pct; struct timespec64 ts; + struct system_device_crosststamp xtstamp; int enable, err = 0; unsigned int i, pin_index; @@ -138,6 +141,7 @@ long ptp_ioctl(struct posix_clock *pc, unsigned int cmd, unsigned long arg) caps.n_per_out = ptp->info->n_per_out; caps.pps = ptp->info->pps; caps.n_pins = ptp->info->n_pins; + caps.cross_timestamping = ptp->info->getcrosststamp != NULL; if (copy_to_user((void __user *)arg, &caps, sizeof(caps))) err = -EFAULT; break; @@ -180,6 +184,29 @@ long ptp_ioctl(struct posix_clock *pc, unsigned int cmd, unsigned long arg) err = ops->enable(ops, &req, enable); break; + case PTP_SYS_OFFSET_PRECISE: + if (!ptp->info->getcrosststamp) { + err = -EOPNOTSUPP; + break; + } + err = ptp->info->getcrosststamp(ptp->info, &xtstamp); + if (err) + break; + + ts = ktime_to_timespec64(xtstamp.device); + precise_offset.device.sec = ts.tv_sec; + precise_offset.device.nsec = ts.tv_nsec; + ts = ktime_to_timespec64(xtstamp.sys_realtime); + precise_offset.sys_realtime.sec = ts.tv_sec; + precise_offset.sys_realtime.nsec = ts.tv_nsec; + ts = ktime_to_timespec64(xtstamp.sys_monoraw); + 
precise_offset.sys_monoraw.sec = ts.tv_sec; + precise_offset.sys_monoraw.nsec = ts.tv_nsec; + if (copy_to_user((void __user *)arg, &precise_offset, + sizeof(precise_offset))) + err = -EFAULT; + break; + case PTP_SYS_OFFSET: sysoff = kmalloc(sizeof(*sysoff), GFP_KERNEL); if (!sysoff) { diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 4545e2e2ad45..5699bbc23feb 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -931,7 +931,7 @@ static int check_async_write(struct inode *inode, unsigned long bio_flags) if (bio_flags & EXTENT_BIO_TREE_LOG) return 0; #ifdef CONFIG_X86 - if (static_cpu_has_safe(X86_FEATURE_XMM4_2)) + if (static_cpu_has(X86_FEATURE_XMM4_2)) return 0; #endif return 1; diff --git a/include/linux/clockchips.h b/include/linux/clockchips.h index bdcf358dfce2..0d442e34c349 100644 --- a/include/linux/clockchips.h +++ b/include/linux/clockchips.h @@ -190,9 +190,9 @@ extern void clockevents_config_and_register(struct clock_event_device *dev, extern int clockevents_update_freq(struct clock_event_device *ce, u32 freq); static inline void -clockevents_calc_mult_shift(struct clock_event_device *ce, u32 freq, u32 minsec) +clockevents_calc_mult_shift(struct clock_event_device *ce, u32 freq, u32 maxsec) { - return clocks_calc_mult_shift(&ce->mult, &ce->shift, NSEC_PER_SEC, freq, minsec); + return clocks_calc_mult_shift(&ce->mult, &ce->shift, NSEC_PER_SEC, freq, maxsec); } extern void clockevents_suspend(void); diff --git a/include/linux/clocksource.h b/include/linux/clocksource.h index 6013021a3b39..a307bf62974f 100644 --- a/include/linux/clocksource.h +++ b/include/linux/clocksource.h @@ -118,6 +118,23 @@ struct clocksource { /* simplify initialization of mask field */ #define CLOCKSOURCE_MASK(bits) (cycle_t)((bits) < 64 ? ((1ULL<<(bits))-1) : -1) +static inline u32 clocksource_freq2mult(u32 freq, u32 shift_constant, u64 from) +{ + /* freq = cyc/from + * mult/2^shift = ns/cyc + * mult = ns/cyc * 2^shift + * mult = from/freq * 2^shift + * mult = from * 2^shift / freq + * mult = (from<member) __force *) &(p)->member)) +#else /* __CHECKER__ */ # define __user # define __kernel # define __safe @@ -44,7 +46,9 @@ extern void __chk_io_ptr(const volatile void __iomem *); # define __percpu # define __rcu # define __pmem -#endif +# define __private +# define ACCESS_PRIVATE(p, member) ((p)->member) +#endif /* __CHECKER__ */ /* Indirect macros required for expanded argument pasting, eg. __LINE__. */ #define ___PASTE(a,b) a##b diff --git a/include/linux/cpu.h b/include/linux/cpu.h index d2ca8c38f9c4..f9b1fab4388a 100644 --- a/include/linux/cpu.h +++ b/include/linux/cpu.h @@ -16,6 +16,7 @@ #include #include #include +#include struct device; struct device_node; @@ -27,6 +28,9 @@ struct cpu { struct device dev; }; +extern void boot_cpu_init(void); +extern void boot_cpu_state_init(void); + extern int register_cpu(struct cpu *cpu, int num); extern struct device *get_cpu_device(unsigned cpu); extern bool cpu_is_hotpluggable(unsigned cpu); @@ -74,7 +78,7 @@ enum { /* migration should happen before other stuff but after perf */ CPU_PRI_PERF = 20, CPU_PRI_MIGRATION = 10, - CPU_PRI_SMPBOOT = 9, + /* bring up workqueues before normal notifiers and down after */ CPU_PRI_WORKQUEUE_UP = 5, CPU_PRI_WORKQUEUE_DOWN = -5, @@ -97,9 +101,7 @@ enum { * Called on the new cpu, just before * enabling interrupts. Must not sleep, * must not fail */ -#define CPU_DYING_IDLE 0x000B /* CPU (unsigned)v dying, reached - * idle loop. 
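As a worked example of the clocksource_freq2mult() helper added to clocksource.h above (the numbers are illustrative, not taken from the patch): for a 1 MHz counter, from = NSEC_PER_SEC and a shift constant of 20 give

	mult = (1000000000 << 20) / 1000000 = 1048576000

so a reading of cycles ticks converts to nanoseconds as (cycles * 1048576000) >> 20 = cycles * 1000, i.e. 1000 ns per tick, as expected for a 1 MHz clock.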
*/ -#define CPU_BROKEN 0x000C /* CPU (unsigned)v did not die properly, +#define CPU_BROKEN 0x000B /* CPU (unsigned)v did not die properly, * perhaps due to preemption. */ /* Used for CPU hotplug events occurring while tasks are frozen due to a suspend @@ -118,6 +120,7 @@ enum { #ifdef CONFIG_SMP +extern bool cpuhp_tasks_frozen; /* Need to know about CPUs going up/down? */ #if defined(CONFIG_HOTPLUG_CPU) || !defined(MODULE) #define cpu_notifier(fn, pri) { \ @@ -167,7 +170,6 @@ static inline void __unregister_cpu_notifier(struct notifier_block *nb) } #endif -void smpboot_thread_init(void); int cpu_up(unsigned int cpu); void notify_cpu_starting(unsigned int cpu); extern void cpu_maps_update_begin(void); @@ -177,6 +179,7 @@ extern void cpu_maps_update_done(void); #define cpu_notifier_register_done cpu_maps_update_done #else /* CONFIG_SMP */ +#define cpuhp_tasks_frozen 0 #define cpu_notifier(fn, pri) do { (void)(fn); } while (0) #define __cpu_notifier(fn, pri) do { (void)(fn); } while (0) @@ -215,10 +218,6 @@ static inline void cpu_notifier_register_done(void) { } -static inline void smpboot_thread_init(void) -{ -} - #endif /* CONFIG_SMP */ extern struct bus_type cpu_subsys; @@ -265,11 +264,6 @@ static inline int disable_nonboot_cpus(void) { return 0; } static inline void enable_nonboot_cpus(void) {} #endif /* !CONFIG_PM_SLEEP_SMP */ -enum cpuhp_state { - CPUHP_OFFLINE, - CPUHP_ONLINE, -}; - void cpu_startup_entry(enum cpuhp_state state); void cpu_idle_poll_ctrl(bool enable); @@ -280,14 +274,15 @@ void arch_cpu_idle_enter(void); void arch_cpu_idle_exit(void); void arch_cpu_idle_dead(void); -DECLARE_PER_CPU(bool, cpu_dead_idle); - int cpu_report_state(int cpu); int cpu_check_up_prepare(int cpu); void cpu_set_state_online(int cpu); #ifdef CONFIG_HOTPLUG_CPU bool cpu_wait_death(unsigned int cpu, int seconds); bool cpu_report_death(void); +void cpuhp_report_idle_dead(void); +#else +static inline void cpuhp_report_idle_dead(void) { } #endif /* #ifdef CONFIG_HOTPLUG_CPU */ #endif /* _LINUX_CPU_H_ */ diff --git a/include/linux/cpuhotplug.h b/include/linux/cpuhotplug.h new file mode 100644 index 000000000000..5d68e15e46b7 --- /dev/null +++ b/include/linux/cpuhotplug.h @@ -0,0 +1,93 @@ +#ifndef __CPUHOTPLUG_H +#define __CPUHOTPLUG_H + +enum cpuhp_state { + CPUHP_OFFLINE, + CPUHP_CREATE_THREADS, + CPUHP_NOTIFY_PREPARE, + CPUHP_BRINGUP_CPU, + CPUHP_AP_IDLE_DEAD, + CPUHP_AP_OFFLINE, + CPUHP_AP_NOTIFY_STARTING, + CPUHP_AP_ONLINE, + CPUHP_TEARDOWN_CPU, + CPUHP_AP_ONLINE_IDLE, + CPUHP_AP_SMPBOOT_THREADS, + CPUHP_AP_NOTIFY_ONLINE, + CPUHP_AP_ONLINE_DYN, + CPUHP_AP_ONLINE_DYN_END = CPUHP_AP_ONLINE_DYN + 30, + CPUHP_ONLINE, +}; + +int __cpuhp_setup_state(enum cpuhp_state state, const char *name, bool invoke, + int (*startup)(unsigned int cpu), + int (*teardown)(unsigned int cpu)); + +/** + * cpuhp_setup_state - Setup hotplug state callbacks with calling the callbacks + * @state: The state for which the calls are installed + * @name: Name of the callback (will be used in debug output) + * @startup: startup callback function + * @teardown: teardown callback function + * + * Installs the callback functions and invokes the startup callback on + * the present cpus which have already reached the @state. 
+ */ +static inline int cpuhp_setup_state(enum cpuhp_state state, + const char *name, + int (*startup)(unsigned int cpu), + int (*teardown)(unsigned int cpu)) +{ + return __cpuhp_setup_state(state, name, true, startup, teardown); +} + +/** + * cpuhp_setup_state_nocalls - Setup hotplug state callbacks without calling the + * callbacks + * @state: The state for which the calls are installed + * @name: Name of the callback. + * @startup: startup callback function + * @teardown: teardown callback function + * + * Same as @cpuhp_setup_state except that no calls are executed are invoked + * during installation of this callback. NOP if SMP=n or HOTPLUG_CPU=n. + */ +static inline int cpuhp_setup_state_nocalls(enum cpuhp_state state, + const char *name, + int (*startup)(unsigned int cpu), + int (*teardown)(unsigned int cpu)) +{ + return __cpuhp_setup_state(state, name, false, startup, teardown); +} + +void __cpuhp_remove_state(enum cpuhp_state state, bool invoke); + +/** + * cpuhp_remove_state - Remove hotplug state callbacks and invoke the teardown + * @state: The state for which the calls are removed + * + * Removes the callback functions and invokes the teardown callback on + * the present cpus which have already reached the @state. + */ +static inline void cpuhp_remove_state(enum cpuhp_state state) +{ + __cpuhp_remove_state(state, true); +} + +/** + * cpuhp_remove_state_nocalls - Remove hotplug state callbacks without invoking + * teardown + * @state: The state for which the calls are removed + */ +static inline void cpuhp_remove_state_nocalls(enum cpuhp_state state) +{ + __cpuhp_remove_state(state, false); +} + +#ifdef CONFIG_SMP +void cpuhp_online_idle(enum cpuhp_state state); +#else +static inline void cpuhp_online_idle(enum cpuhp_state state) { } +#endif + +#endif diff --git a/include/linux/irq.h b/include/linux/irq.h index 3c1c96786248..c4de62348ff2 100644 --- a/include/linux/irq.h +++ b/include/linux/irq.h @@ -133,17 +133,23 @@ struct irq_domain; * Use accessor functions to deal with it * @node: node index useful for balancing * @handler_data: per-IRQ data for the irq_chip methods - * @affinity: IRQ affinity on SMP + * @affinity: IRQ affinity on SMP. If this is an IPI + * related irq, then this is the mask of the + * CPUs to which an IPI can be sent. * @msi_desc: MSI descriptor + * @ipi_offset: Offset of first IPI target cpu in @affinity. Optional. 
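Returning to the cpuhp_setup_state() helpers introduced in cpuhotplug.h above, a minimal sketch of how a subsystem could hook into the new state machine; the foo names and the choice of CPUHP_AP_ONLINE_DYN are assumptions for illustration, not part of this series:

static int foo_cpu_online(unsigned int cpu)
{
	/* Set up per-cpu resources for @cpu; called on an online CPU. */
	return 0;
}

static int foo_cpu_offline(unsigned int cpu)
{
	/* Quiesce and release per-cpu resources before @cpu goes away. */
	return 0;
}

static int __init foo_init(void)
{
	int ret;

	/*
	 * Install the callbacks; the startup callback is also invoked
	 * for every CPU that is already online.
	 */
	ret = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "foo:online",
				foo_cpu_online, foo_cpu_offline);
	/* Treat any negative value as an error. */
	return ret < 0 ? ret : 0;
}

A matching cpuhp_remove_state() in the exit path removes the callbacks and runs the teardown callback on the CPUs that have reached the state.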
*/ struct irq_common_data { - unsigned int state_use_accessors; + unsigned int __private state_use_accessors; #ifdef CONFIG_NUMA unsigned int node; #endif void *handler_data; struct msi_desc *msi_desc; cpumask_var_t affinity; +#ifdef CONFIG_GENERIC_IRQ_IPI + unsigned int ipi_offset; +#endif }; /** @@ -208,7 +214,7 @@ enum { IRQD_FORWARDED_TO_VCPU = (1 << 20), }; -#define __irqd_to_state(d) ((d)->common->state_use_accessors) +#define __irqd_to_state(d) ACCESS_PRIVATE((d)->common, state_use_accessors) static inline bool irqd_is_setaffinity_pending(struct irq_data *d) { @@ -299,6 +305,8 @@ static inline void irqd_clr_forwarded_to_vcpu(struct irq_data *d) __irqd_to_state(d) &= ~IRQD_FORWARDED_TO_VCPU; } +#undef __irqd_to_state + static inline irq_hw_number_t irqd_to_hwirq(struct irq_data *d) { return d->hwirq; @@ -341,6 +349,8 @@ static inline irq_hw_number_t irqd_to_hwirq(struct irq_data *d) * @irq_get_irqchip_state: return the internal state of an interrupt * @irq_set_irqchip_state: set the internal state of a interrupt * @irq_set_vcpu_affinity: optional to target a vCPU in a virtual machine + * @ipi_send_single: send a single IPI to destination cpus + * @ipi_send_mask: send an IPI to destination cpus in cpumask * @flags: chip specific flags */ struct irq_chip { @@ -385,6 +395,9 @@ struct irq_chip { int (*irq_set_vcpu_affinity)(struct irq_data *data, void *vcpu_info); + void (*ipi_send_single)(struct irq_data *data, unsigned int cpu); + void (*ipi_send_mask)(struct irq_data *data, const struct cpumask *dest); + unsigned long flags; }; @@ -934,4 +947,12 @@ static inline u32 irq_reg_readl(struct irq_chip_generic *gc, return readl(gc->reg_base + reg_offset); } +/* Contrary to Linux irqs, for hardware irqs the irq number 0 is valid */ +#define INVALID_HWIRQ (~0UL) +irq_hw_number_t ipi_get_hwirq(unsigned int irq, unsigned int cpu); +int __ipi_send_single(struct irq_desc *desc, unsigned int cpu); +int __ipi_send_mask(struct irq_desc *desc, const struct cpumask *dest); +int ipi_send_single(unsigned int virq, unsigned int cpu); +int ipi_send_mask(unsigned int virq, const struct cpumask *dest); + #endif /* _LINUX_IRQ_H */ diff --git a/include/linux/irqchip/mips-gic.h b/include/linux/irqchip/mips-gic.h index ce824db48d64..80f89e4a29ac 100644 --- a/include/linux/irqchip/mips-gic.h +++ b/include/linux/irqchip/mips-gic.h @@ -261,9 +261,6 @@ extern void gic_write_compare(cycle_t cnt); extern void gic_write_cpu_compare(cycle_t cnt, int cpu); extern void gic_start_count(void); extern void gic_stop_count(void); -extern void gic_send_ipi(unsigned int intr); -extern unsigned int plat_ipi_call_int_xlate(unsigned int); -extern unsigned int plat_ipi_resched_int_xlate(unsigned int); extern int gic_get_c0_compare_int(void); extern int gic_get_c0_perfcount_int(void); extern int gic_get_c0_fdc_int(void); diff --git a/include/linux/irqdomain.h b/include/linux/irqdomain.h index 04579d9fbce4..ed48594e96d2 100644 --- a/include/linux/irqdomain.h +++ b/include/linux/irqdomain.h @@ -74,6 +74,7 @@ enum irq_domain_bus_token { DOMAIN_BUS_PCI_MSI, DOMAIN_BUS_PLATFORM_MSI, DOMAIN_BUS_NEXUS, + DOMAIN_BUS_IPI, }; /** @@ -172,6 +173,12 @@ enum { /* Core calls alloc/free recursive through the domain hierarchy. 
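The ipi_send_single()/ipi_send_mask() declarations added to irq.h above pair with irq_reserve_ipi(), declared a little further down in irqdomain.h; a hedged sketch of how architecture code might use them (the ipi_domain lookup and the foo names are assumptions, nothing here is taken from the series beyond the declared signatures):

static unsigned int foo_ipi_virq;

static int __init foo_ipi_setup(struct irq_domain *ipi_domain)
{
	/* Reserve a virtual interrupt that can target every possible CPU. */
	foo_ipi_virq = irq_reserve_ipi(ipi_domain, cpu_possible_mask);
	return foo_ipi_virq ? 0 : -ENODEV;
}

static void foo_kick_cpu(unsigned int cpu)
{
	/* Raise the IPI on a single destination CPU. */
	ipi_send_single(foo_ipi_virq, cpu);
}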
*/ IRQ_DOMAIN_FLAG_AUTO_RECURSIVE = (1 << 1), + /* Irq domain is an IPI domain with virq per cpu */ + IRQ_DOMAIN_FLAG_IPI_PER_CPU = (1 << 2), + + /* Irq domain is an IPI domain with single virq */ + IRQ_DOMAIN_FLAG_IPI_SINGLE = (1 << 3), + /* * Flags starting from IRQ_DOMAIN_FLAG_NONCORE are reserved * for implementation specific purposes and ignored by the @@ -206,6 +213,8 @@ struct irq_domain *irq_domain_add_legacy(struct device_node *of_node, extern struct irq_domain *irq_find_matching_fwnode(struct fwnode_handle *fwnode, enum irq_domain_bus_token bus_token); extern void irq_set_default_host(struct irq_domain *host); +extern int irq_domain_alloc_descs(int virq, unsigned int nr_irqs, + irq_hw_number_t hwirq, int node); static inline struct fwnode_handle *of_node_to_fwnode(struct device_node *node) { @@ -335,6 +344,11 @@ int irq_domain_xlate_onetwocell(struct irq_domain *d, struct device_node *ctrlr, const u32 *intspec, unsigned int intsize, irq_hw_number_t *out_hwirq, unsigned int *out_type); +/* IPI functions */ +unsigned int irq_reserve_ipi(struct irq_domain *domain, + const struct cpumask *dest); +void irq_destroy_ipi(unsigned int irq); + /* V2 interfaces to support hierarchy IRQ domains. */ extern struct irq_data *irq_domain_get_irq_data(struct irq_domain *domain, unsigned int virq); @@ -400,6 +414,22 @@ static inline bool irq_domain_is_hierarchy(struct irq_domain *domain) { return domain->flags & IRQ_DOMAIN_FLAG_HIERARCHY; } + +static inline bool irq_domain_is_ipi(struct irq_domain *domain) +{ + return domain->flags & + (IRQ_DOMAIN_FLAG_IPI_PER_CPU | IRQ_DOMAIN_FLAG_IPI_SINGLE); +} + +static inline bool irq_domain_is_ipi_per_cpu(struct irq_domain *domain) +{ + return domain->flags & IRQ_DOMAIN_FLAG_IPI_PER_CPU; +} + +static inline bool irq_domain_is_ipi_single(struct irq_domain *domain) +{ + return domain->flags & IRQ_DOMAIN_FLAG_IPI_SINGLE; +} #else /* CONFIG_IRQ_DOMAIN_HIERARCHY */ static inline void irq_domain_activate_irq(struct irq_data *data) { } static inline void irq_domain_deactivate_irq(struct irq_data *data) { } @@ -413,6 +443,21 @@ static inline bool irq_domain_is_hierarchy(struct irq_domain *domain) { return false; } + +static inline bool irq_domain_is_ipi(struct irq_domain *domain) +{ + return false; +} + +static inline bool irq_domain_is_ipi_per_cpu(struct irq_domain *domain) +{ + return false; +} + +static inline bool irq_domain_is_ipi_single(struct irq_domain *domain) +{ + return false; +} #endif /* CONFIG_IRQ_DOMAIN_HIERARCHY */ #else /* CONFIG_IRQ_DOMAIN */ diff --git a/include/linux/mm.h b/include/linux/mm.h index 2b6e22782699..3579d1e2fe3a 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2139,6 +2139,8 @@ int remap_pfn_range(struct vm_area_struct *, unsigned long addr, int vm_insert_page(struct vm_area_struct *, unsigned long addr, struct page *); int vm_insert_pfn(struct vm_area_struct *vma, unsigned long addr, unsigned long pfn); +int vm_insert_pfn_prot(struct vm_area_struct *vma, unsigned long addr, + unsigned long pfn, pgprot_t pgprot); int vm_insert_mixed(struct vm_area_struct *vma, unsigned long addr, pfn_t pfn); int vm_iomap_memory(struct vm_area_struct *vma, phys_addr_t start, unsigned long len); diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 624b78b848b8..944b2b37313b 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -566,10 +566,26 @@ static inline void clear_tlb_flush_pending(struct mm_struct *mm) } #endif -struct vm_special_mapping -{ - const char *name; +struct vm_fault; + +struct 
vm_special_mapping { + const char *name; /* The name, e.g. "[vdso]". */ + + /* + * If .fault is not provided, this points to a + * NULL-terminated array of pages that back the special mapping. + * + * This must not be NULL unless .fault is provided. + */ struct page **pages; + + /* + * If non-NULL, then this is called to resolve page faults + * on the special mapping. If used, .pages is not checked. + */ + int (*fault)(const struct vm_special_mapping *sm, + struct vm_area_struct *vma, + struct vm_fault *vmf); }; enum tlb_flush_reason { diff --git a/include/linux/notifier.h b/include/linux/notifier.h index d14a4c362465..4149868de4e6 100644 --- a/include/linux/notifier.h +++ b/include/linux/notifier.h @@ -47,6 +47,8 @@ * runtime initialization. */ +struct notifier_block; + typedef int (*notifier_fn_t)(struct notifier_block *nb, unsigned long action, void *data); diff --git a/include/linux/pps_kernel.h b/include/linux/pps_kernel.h index 54bf1484d41f..35ac903956c7 100644 --- a/include/linux/pps_kernel.h +++ b/include/linux/pps_kernel.h @@ -111,22 +111,17 @@ static inline void timespec_to_pps_ktime(struct pps_ktime *kt, kt->nsec = ts.tv_nsec; } +static inline void pps_get_ts(struct pps_event_time *ts) +{ + struct system_time_snapshot snap; + + ktime_get_snapshot(&snap); + ts->ts_real = ktime_to_timespec64(snap.real); #ifdef CONFIG_NTP_PPS - -static inline void pps_get_ts(struct pps_event_time *ts) -{ - ktime_get_raw_and_real_ts64(&ts->ts_raw, &ts->ts_real); + ts->ts_raw = ktime_to_timespec64(snap.raw); +#endif } -#else /* CONFIG_NTP_PPS */ - -static inline void pps_get_ts(struct pps_event_time *ts) -{ - ktime_get_real_ts64(&ts->ts_real); -} - -#endif /* CONFIG_NTP_PPS */ - /* Subtract known time delay from PPS event time(s) */ static inline void pps_sub_ts(struct pps_event_time *ts, struct timespec64 delta) { diff --git a/include/linux/ptp_clock_kernel.h b/include/linux/ptp_clock_kernel.h index b8b73066d137..6b15e168148a 100644 --- a/include/linux/ptp_clock_kernel.h +++ b/include/linux/ptp_clock_kernel.h @@ -38,6 +38,7 @@ struct ptp_clock_request { }; }; +struct system_device_crosststamp; /** * struct ptp_clock_info - decribes a PTP hardware clock * @@ -67,6 +68,11 @@ struct ptp_clock_request { * @gettime64: Reads the current time from the hardware clock. * parameter ts: Holds the result. * + * @getcrosststamp: Reads the current time from the hardware clock and + * system clock simultaneously. + * parameter cts: Contains timestamp (device,system) pair, + * where system time is realtime and monotonic. + * * @settime64: Set the current time on the hardware clock. * parameter ts: Time value to set. 
* @@ -105,6 +111,8 @@ struct ptp_clock_info { int (*adjfreq)(struct ptp_clock_info *ptp, s32 delta); int (*adjtime)(struct ptp_clock_info *ptp, s64 delta); int (*gettime64)(struct ptp_clock_info *ptp, struct timespec64 *ts); + int (*getcrosststamp)(struct ptp_clock_info *ptp, + struct system_device_crosststamp *cts); int (*settime64)(struct ptp_clock_info *p, const struct timespec64 *ts); int (*enable)(struct ptp_clock_info *ptp, struct ptp_clock_request *request, int on); diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index 14e6f47ee16f..2657aff2725b 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -332,9 +332,7 @@ void rcu_init(void); void rcu_sched_qs(void); void rcu_bh_qs(void); void rcu_check_callbacks(int user); -struct notifier_block; -int rcu_cpu_notify(struct notifier_block *self, - unsigned long action, void *hcpu); +void rcu_report_dead(unsigned int cpu); #ifndef CONFIG_TINY_RCU void rcu_end_inkernel_boot(void); @@ -360,8 +358,6 @@ void rcu_user_exit(void); #else static inline void rcu_user_enter(void) { } static inline void rcu_user_exit(void) { } -static inline void rcu_user_hooks_switch(struct task_struct *prev, - struct task_struct *next) { } #endif /* CONFIG_NO_HZ_FULL */ #ifdef CONFIG_RCU_NOCB_CPU diff --git a/include/linux/srcu.h b/include/linux/srcu.h index f5f80c5643ac..dc8eb63c6568 100644 --- a/include/linux/srcu.h +++ b/include/linux/srcu.h @@ -99,8 +99,23 @@ void process_srcu(struct work_struct *work); } /* - * define and init a srcu struct at build time. - * dont't call init_srcu_struct() nor cleanup_srcu_struct() on it. + * Define and initialize a srcu struct at build time. + * Do -not- call init_srcu_struct() nor cleanup_srcu_struct() on it. + * + * Note that although DEFINE_STATIC_SRCU() hides the name from other + * files, the per-CPU variable rules nevertheless require that the + * chosen name be globally unique. These rules also prohibit use of + * DEFINE_STATIC_SRCU() within a function. If these rules are too + * restrictive, declare the srcu_struct manually. For example, in + * each file: + * + * static struct srcu_struct my_srcu; + * + * Then, before the first use of each my_srcu, manually initialize it: + * + * init_srcu_struct(&my_srcu); + * + * See include/linux/percpu-defs.h for the rules on per-CPU variables. 
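To make the build-time variant concrete, a small sketch of the pattern the comment above describes; the my_srcu name follows the comment's own example and the reader/updater bodies are placeholders:

DEFINE_STATIC_SRCU(my_srcu);

static void my_reader(void)
{
	int idx = srcu_read_lock(&my_srcu);

	/* ... dereference data published under my_srcu ... */
	srcu_read_unlock(&my_srcu, idx);
}

static void my_updater(void)
{
	/* ... unpublish the old data ... */
	synchronize_srcu(&my_srcu);
	/* ... all readers are done, it is now safe to free the old data ... */
}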
*/ #define __DEFINE_SRCU(name, is_static) \ static DEFINE_PER_CPU(struct srcu_struct_array, name##_srcu_array);\ diff --git a/include/linux/timekeeper_internal.h b/include/linux/timekeeper_internal.h index 25247220b4b7..e88005459035 100644 --- a/include/linux/timekeeper_internal.h +++ b/include/linux/timekeeper_internal.h @@ -50,6 +50,7 @@ struct tk_read_base { * @offs_tai: Offset clock monotonic -> clock tai * @tai_offset: The current UTC to TAI offset in seconds * @clock_was_set_seq: The sequence number of clock was set events + * @cs_was_changed_seq: The sequence number of clocksource change events * @next_leap_ktime: CLOCK_MONOTONIC time value of a pending leap-second * @raw_time: Monotonic raw base time in timespec64 format * @cycle_interval: Number of clock cycles in one NTP interval @@ -91,6 +92,7 @@ struct timekeeper { ktime_t offs_tai; s32 tai_offset; unsigned int clock_was_set_seq; + u8 cs_was_changed_seq; ktime_t next_leap_ktime; struct timespec64 raw_time; diff --git a/include/linux/timekeeping.h b/include/linux/timekeeping.h index ec89d846324c..96f37bee3bc1 100644 --- a/include/linux/timekeeping.h +++ b/include/linux/timekeeping.h @@ -266,6 +266,64 @@ extern void timekeeping_inject_sleeptime64(struct timespec64 *delta); extern void ktime_get_raw_and_real_ts64(struct timespec64 *ts_raw, struct timespec64 *ts_real); +/* + * struct system_time_snapshot - simultaneous raw/real time capture with + * counter value + * @cycles: Clocksource counter value to produce the system times + * @real: Realtime system time + * @raw: Monotonic raw system time + * @clock_was_set_seq: The sequence number of clock was set events + * @cs_was_changed_seq: The sequence number of clocksource change events + */ +struct system_time_snapshot { + cycle_t cycles; + ktime_t real; + ktime_t raw; + unsigned int clock_was_set_seq; + u8 cs_was_changed_seq; +}; + +/* + * struct system_device_crosststamp - system/device cross-timestamp + * (syncronized capture) + * @device: Device time + * @sys_realtime: Realtime simultaneous with device time + * @sys_monoraw: Monotonic raw simultaneous with device time + */ +struct system_device_crosststamp { + ktime_t device; + ktime_t sys_realtime; + ktime_t sys_monoraw; +}; + +/* + * struct system_counterval_t - system counter value with the pointer to the + * corresponding clocksource + * @cycles: System counter value + * @cs: Clocksource corresponding to system counter value. 
Used by + * timekeeping code to verify comparibility of two cycle values + */ +struct system_counterval_t { + cycle_t cycles; + struct clocksource *cs; +}; + +/* + * Get cross timestamp between system clock and device clock + */ +extern int get_device_system_crosststamp( + int (*get_time_fn)(ktime_t *device_time, + struct system_counterval_t *system_counterval, + void *ctx), + void *ctx, + struct system_time_snapshot *history, + struct system_device_crosststamp *xtstamp); + +/* + * Simultaneously snapshot realtime and monotonic raw clocks + */ +extern void ktime_get_snapshot(struct system_time_snapshot *systime_snapshot); + /* * Persistent clock related interfaces */ diff --git a/include/trace/events/cpuhp.h b/include/trace/events/cpuhp.h new file mode 100644 index 000000000000..a72bd93ec7e5 --- /dev/null +++ b/include/trace/events/cpuhp.h @@ -0,0 +1,66 @@ +#undef TRACE_SYSTEM +#define TRACE_SYSTEM cpuhp + +#if !defined(_TRACE_CPUHP_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_CPUHP_H + +#include + +TRACE_EVENT(cpuhp_enter, + + TP_PROTO(unsigned int cpu, + int target, + int idx, + int (*fun)(unsigned int)), + + TP_ARGS(cpu, target, idx, fun), + + TP_STRUCT__entry( + __field( unsigned int, cpu ) + __field( int, target ) + __field( int, idx ) + __field( void *, fun ) + ), + + TP_fast_assign( + __entry->cpu = cpu; + __entry->target = target; + __entry->idx = idx; + __entry->fun = fun; + ), + + TP_printk("cpu: %04u target: %3d step: %3d (%pf)", + __entry->cpu, __entry->target, __entry->idx, __entry->fun) +); + +TRACE_EVENT(cpuhp_exit, + + TP_PROTO(unsigned int cpu, + int state, + int idx, + int ret), + + TP_ARGS(cpu, state, idx, ret), + + TP_STRUCT__entry( + __field( unsigned int, cpu ) + __field( int, state ) + __field( int, idx ) + __field( int, ret ) + ), + + TP_fast_assign( + __entry->cpu = cpu; + __entry->state = state; + __entry->idx = idx; + __entry->ret = ret; + ), + + TP_printk(" cpu: %04u state: %3d step: %3d ret: %d", + __entry->cpu, __entry->state, __entry->idx, __entry->ret) +); + +#endif + +/* This part must be outside protection */ +#include diff --git a/include/uapi/linux/ptp_clock.h b/include/uapi/linux/ptp_clock.h index f0b7bfe5da92..ac6dded80ffa 100644 --- a/include/uapi/linux/ptp_clock.h +++ b/include/uapi/linux/ptp_clock.h @@ -51,7 +51,9 @@ struct ptp_clock_caps { int n_per_out; /* Number of programmable periodic signals. */ int pps; /* Whether the clock supports a PPS callback. */ int n_pins; /* Number of input/output pins. */ - int rsv[14]; /* Reserved for future use. */ + /* Whether the clock supports precise system-device cross timestamps */ + int cross_timestamping; + int rsv[13]; /* Reserved for future use. */ }; struct ptp_extts_request { @@ -81,6 +83,13 @@ struct ptp_sys_offset { struct ptp_clock_time ts[2 * PTP_MAX_SAMPLES + 1]; }; +struct ptp_sys_offset_precise { + struct ptp_clock_time device; + struct ptp_clock_time sys_realtime; + struct ptp_clock_time sys_monoraw; + unsigned int rsv[4]; /* Reserved for future use. */ +}; + enum ptp_pin_function { PTP_PF_NONE, PTP_PF_EXTTS, @@ -124,6 +133,8 @@ struct ptp_pin_desc { #define PTP_SYS_OFFSET _IOW(PTP_CLK_MAGIC, 5, struct ptp_sys_offset) #define PTP_PIN_GETFUNC _IOWR(PTP_CLK_MAGIC, 6, struct ptp_pin_desc) #define PTP_PIN_SETFUNC _IOW(PTP_CLK_MAGIC, 7, struct ptp_pin_desc) +#define PTP_SYS_OFFSET_PRECISE \ + _IOWR(PTP_CLK_MAGIC, 8, struct ptp_sys_offset_precise) struct ptp_extts_event { struct ptp_clock_time t; /* Time event occured. 
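A minimal user-space sketch of the PTP_SYS_OFFSET_PRECISE ioctl defined above; the /dev/ptp0 path is an assumption, and the call only succeeds on clocks whose driver implements getcrosststamp (otherwise it fails with EOPNOTSUPP, as the ptp_chardev.c hunk above shows):

#include <fcntl.h>
#include <stdio.h>
#include <sys/ioctl.h>
#include <linux/ptp_clock.h>

int main(void)
{
	struct ptp_sys_offset_precise off = { 0 };
	int fd = open("/dev/ptp0", O_RDWR);

	if (fd < 0)
		return 1;
	if (ioctl(fd, PTP_SYS_OFFSET_PRECISE, &off))
		return 1;	/* EOPNOTSUPP if getcrosststamp is absent */
	printf("device %lld.%09u realtime %lld.%09u\n",
	       (long long)off.device.sec, off.device.nsec,
	       (long long)off.sys_realtime.sec, off.sys_realtime.nsec);
	return 0;
}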
*/ diff --git a/init/main.c b/init/main.c index 7c27de4577ed..8dc93df20f7f 100644 --- a/init/main.c +++ b/init/main.c @@ -385,7 +385,6 @@ static noinline void __init_refok rest_init(void) int pid; rcu_scheduler_starting(); - smpboot_thread_init(); /* * We need to spawn init first so that it obtains pid 1, however * the init task will end up wanting to create kthreads, which, if @@ -449,20 +448,6 @@ void __init parse_early_param(void) done = 1; } -/* - * Activate the first processor. - */ - -static void __init boot_cpu_init(void) -{ - int cpu = smp_processor_id(); - /* Mark the boot cpu "present", "online" etc for SMP and UP case */ - set_cpu_online(cpu, true); - set_cpu_active(cpu, true); - set_cpu_present(cpu, true); - set_cpu_possible(cpu, true); -} - void __init __weak smp_setup_processor_id(void) { } @@ -522,6 +507,7 @@ asmlinkage __visible void __init start_kernel(void) setup_command_line(command_line); setup_nr_cpu_ids(); setup_per_cpu_areas(); + boot_cpu_state_init(); smp_prepare_boot_cpu(); /* arch-specific boot-cpu hooks */ build_all_zonelists(NULL, NULL); diff --git a/kernel/cpu.c b/kernel/cpu.c index 5b9d39633ce9..6ea42e8da861 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -22,13 +22,88 @@ #include #include #include +#include + #include +#define CREATE_TRACE_POINTS +#include #include "smpboot.h" +/** + * cpuhp_cpu_state - Per cpu hotplug state storage + * @state: The current cpu state + * @target: The target state + * @thread: Pointer to the hotplug thread + * @should_run: Thread should execute + * @cb_stat: The state for a single callback (install/uninstall) + * @cb: Single callback function (install/uninstall) + * @result: Result of the operation + * @done: Signal completion to the issuer of the task + */ +struct cpuhp_cpu_state { + enum cpuhp_state state; + enum cpuhp_state target; +#ifdef CONFIG_SMP + struct task_struct *thread; + bool should_run; + enum cpuhp_state cb_state; + int (*cb)(unsigned int cpu); + int result; + struct completion done; +#endif +}; + +static DEFINE_PER_CPU(struct cpuhp_cpu_state, cpuhp_state); + +/** + * cpuhp_step - Hotplug state machine step + * @name: Name of the step + * @startup: Startup function of the step + * @teardown: Teardown function of the step + * @skip_onerr: Do not invoke the functions on error rollback + * Will go away once the notifiers are gone + * @cant_stop: Bringup/teardown can't be stopped at this step + */ +struct cpuhp_step { + const char *name; + int (*startup)(unsigned int cpu); + int (*teardown)(unsigned int cpu); + bool skip_onerr; + bool cant_stop; +}; + +static DEFINE_MUTEX(cpuhp_state_mutex); +static struct cpuhp_step cpuhp_bp_states[]; +static struct cpuhp_step cpuhp_ap_states[]; + +/** + * cpuhp_invoke_callback _ Invoke the callbacks for a given state + * @cpu: The cpu for which the callback should be invoked + * @step: The step in the state machine + * @cb: The callback function to invoke + * + * Called from cpu hotplug and from the state register machinery + */ +static int cpuhp_invoke_callback(unsigned int cpu, enum cpuhp_state step, + int (*cb)(unsigned int)) +{ + struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu); + int ret = 0; + + if (cb) { + trace_cpuhp_enter(cpu, st->target, step, cb); + ret = cb(cpu); + trace_cpuhp_exit(cpu, st->state, step, ret); + } + return ret; +} + #ifdef CONFIG_SMP /* Serializes the updates to cpu_online_mask, cpu_present_mask */ static DEFINE_MUTEX(cpu_add_remove_lock); +bool cpuhp_tasks_frozen; +EXPORT_SYMBOL_GPL(cpuhp_tasks_frozen); /* * The following two APIs 
(cpu_maps_update_begin/done) must be used when @@ -207,31 +282,281 @@ int __register_cpu_notifier(struct notifier_block *nb) return raw_notifier_chain_register(&cpu_chain, nb); } -static int __cpu_notify(unsigned long val, void *v, int nr_to_call, +static int __cpu_notify(unsigned long val, unsigned int cpu, int nr_to_call, int *nr_calls) { + unsigned long mod = cpuhp_tasks_frozen ? CPU_TASKS_FROZEN : 0; + void *hcpu = (void *)(long)cpu; + int ret; - ret = __raw_notifier_call_chain(&cpu_chain, val, v, nr_to_call, + ret = __raw_notifier_call_chain(&cpu_chain, val | mod, hcpu, nr_to_call, nr_calls); return notifier_to_errno(ret); } -static int cpu_notify(unsigned long val, void *v) +static int cpu_notify(unsigned long val, unsigned int cpu) { - return __cpu_notify(val, v, -1, NULL); + return __cpu_notify(val, cpu, -1, NULL); +} + +/* Notifier wrappers for transitioning to state machine */ +static int notify_prepare(unsigned int cpu) +{ + int nr_calls = 0; + int ret; + + ret = __cpu_notify(CPU_UP_PREPARE, cpu, -1, &nr_calls); + if (ret) { + nr_calls--; + printk(KERN_WARNING "%s: attempt to bring up CPU %u failed\n", + __func__, cpu); + __cpu_notify(CPU_UP_CANCELED, cpu, nr_calls, NULL); + } + return ret; +} + +static int notify_online(unsigned int cpu) +{ + cpu_notify(CPU_ONLINE, cpu); + return 0; +} + +static int notify_starting(unsigned int cpu) +{ + cpu_notify(CPU_STARTING, cpu); + return 0; +} + +static int bringup_wait_for_ap(unsigned int cpu) +{ + struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu); + + wait_for_completion(&st->done); + return st->result; +} + +static int bringup_cpu(unsigned int cpu) +{ + struct task_struct *idle = idle_thread_get(cpu); + int ret; + + /* Arch-specific enabling code. */ + ret = __cpu_up(cpu, idle); + if (ret) { + cpu_notify(CPU_UP_CANCELED, cpu); + return ret; + } + ret = bringup_wait_for_ap(cpu); + BUG_ON(!cpu_online(cpu)); + return ret; +} + +/* + * Hotplug state machine related functions + */ +static void undo_cpu_down(unsigned int cpu, struct cpuhp_cpu_state *st, + struct cpuhp_step *steps) +{ + for (st->state++; st->state < st->target; st->state++) { + struct cpuhp_step *step = steps + st->state; + + if (!step->skip_onerr) + cpuhp_invoke_callback(cpu, st->state, step->startup); + } +} + +static int cpuhp_down_callbacks(unsigned int cpu, struct cpuhp_cpu_state *st, + struct cpuhp_step *steps, enum cpuhp_state target) +{ + enum cpuhp_state prev_state = st->state; + int ret = 0; + + for (; st->state > target; st->state--) { + struct cpuhp_step *step = steps + st->state; + + ret = cpuhp_invoke_callback(cpu, st->state, step->teardown); + if (ret) { + st->target = prev_state; + undo_cpu_down(cpu, st, steps); + break; + } + } + return ret; +} + +static void undo_cpu_up(unsigned int cpu, struct cpuhp_cpu_state *st, + struct cpuhp_step *steps) +{ + for (st->state--; st->state > st->target; st->state--) { + struct cpuhp_step *step = steps + st->state; + + if (!step->skip_onerr) + cpuhp_invoke_callback(cpu, st->state, step->teardown); + } +} + +static int cpuhp_up_callbacks(unsigned int cpu, struct cpuhp_cpu_state *st, + struct cpuhp_step *steps, enum cpuhp_state target) +{ + enum cpuhp_state prev_state = st->state; + int ret = 0; + + while (st->state < target) { + struct cpuhp_step *step; + + st->state++; + step = steps + st->state; + ret = cpuhp_invoke_callback(cpu, st->state, step->startup); + if (ret) { + st->target = prev_state; + undo_cpu_up(cpu, st, steps); + break; + } + } + return ret; +} + +/* + * The cpu hotplug threads manage the bringup 
and teardown of the cpus + */ +static void cpuhp_create(unsigned int cpu) +{ + struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu); + + init_completion(&st->done); +} + +static int cpuhp_should_run(unsigned int cpu) +{ + struct cpuhp_cpu_state *st = this_cpu_ptr(&cpuhp_state); + + return st->should_run; +} + +/* Execute the teardown callbacks. Used to be CPU_DOWN_PREPARE */ +static int cpuhp_ap_offline(unsigned int cpu, struct cpuhp_cpu_state *st) +{ + enum cpuhp_state target = max((int)st->target, CPUHP_TEARDOWN_CPU); + + return cpuhp_down_callbacks(cpu, st, cpuhp_ap_states, target); +} + +/* Execute the online startup callbacks. Used to be CPU_ONLINE */ +static int cpuhp_ap_online(unsigned int cpu, struct cpuhp_cpu_state *st) +{ + return cpuhp_up_callbacks(cpu, st, cpuhp_ap_states, st->target); +} + +/* + * Execute teardown/startup callbacks on the plugged cpu. Also used to invoke + * callbacks when a state gets [un]installed at runtime. + */ +static void cpuhp_thread_fun(unsigned int cpu) +{ + struct cpuhp_cpu_state *st = this_cpu_ptr(&cpuhp_state); + int ret = 0; + + /* + * Paired with the mb() in cpuhp_kick_ap_work and + * cpuhp_invoke_ap_callback, so the work set is consistent visible. + */ + smp_mb(); + if (!st->should_run) + return; + + st->should_run = false; + + /* Single callback invocation for [un]install ? */ + if (st->cb) { + if (st->cb_state < CPUHP_AP_ONLINE) { + local_irq_disable(); + ret = cpuhp_invoke_callback(cpu, st->cb_state, st->cb); + local_irq_enable(); + } else { + ret = cpuhp_invoke_callback(cpu, st->cb_state, st->cb); + } + } else { + /* Cannot happen .... */ + BUG_ON(st->state < CPUHP_AP_ONLINE_IDLE); + + /* Regular hotplug work */ + if (st->state < st->target) + ret = cpuhp_ap_online(cpu, st); + else if (st->state > st->target) + ret = cpuhp_ap_offline(cpu, st); + } + st->result = ret; + complete(&st->done); +} + +/* Invoke a single callback on a remote cpu */ +static int cpuhp_invoke_ap_callback(int cpu, enum cpuhp_state state, + int (*cb)(unsigned int)) +{ + struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu); + + if (!cpu_online(cpu)) + return 0; + + st->cb_state = state; + st->cb = cb; + /* + * Make sure the above stores are visible before should_run becomes + * true. Paired with the mb() above in cpuhp_thread_fun() + */ + smp_mb(); + st->should_run = true; + wake_up_process(st->thread); + wait_for_completion(&st->done); + return st->result; +} + +/* Regular hotplug invocation of the AP hotplug thread */ +static void __cpuhp_kick_ap_work(struct cpuhp_cpu_state *st) +{ + st->result = 0; + st->cb = NULL; + /* + * Make sure the above stores are visible before should_run becomes + * true. 
Paired with the mb() above in cpuhp_thread_fun() + */ + smp_mb(); + st->should_run = true; + wake_up_process(st->thread); +} + +static int cpuhp_kick_ap_work(unsigned int cpu) +{ + struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu); + enum cpuhp_state state = st->state; + + trace_cpuhp_enter(cpu, st->target, state, cpuhp_kick_ap_work); + __cpuhp_kick_ap_work(st); + wait_for_completion(&st->done); + trace_cpuhp_exit(cpu, st->state, state, st->result); + return st->result; +} + +static struct smp_hotplug_thread cpuhp_threads = { + .store = &cpuhp_state.thread, + .create = &cpuhp_create, + .thread_should_run = cpuhp_should_run, + .thread_fn = cpuhp_thread_fun, + .thread_comm = "cpuhp/%u", + .selfparking = true, +}; + +void __init cpuhp_threads_init(void) +{ + BUG_ON(smpboot_register_percpu_thread(&cpuhp_threads)); + kthread_unpark(this_cpu_read(cpuhp_state.thread)); } #ifdef CONFIG_HOTPLUG_CPU - -static void cpu_notify_nofail(unsigned long val, void *v) -{ - BUG_ON(cpu_notify(val, v)); -} EXPORT_SYMBOL(register_cpu_notifier); EXPORT_SYMBOL(__register_cpu_notifier); - void unregister_cpu_notifier(struct notifier_block *nb) { cpu_maps_update_begin(); @@ -311,57 +636,60 @@ static inline void check_for_tasks(int dead_cpu) read_unlock(&tasklist_lock); } -struct take_cpu_down_param { - unsigned long mod; - void *hcpu; -}; +static void cpu_notify_nofail(unsigned long val, unsigned int cpu) +{ + BUG_ON(cpu_notify(val, cpu)); +} + +static int notify_down_prepare(unsigned int cpu) +{ + int err, nr_calls = 0; + + err = __cpu_notify(CPU_DOWN_PREPARE, cpu, -1, &nr_calls); + if (err) { + nr_calls--; + __cpu_notify(CPU_DOWN_FAILED, cpu, nr_calls, NULL); + pr_warn("%s: attempt to take down CPU %u failed\n", + __func__, cpu); + } + return err; +} + +static int notify_dying(unsigned int cpu) +{ + cpu_notify(CPU_DYING, cpu); + return 0; +} /* Take this CPU down. */ static int take_cpu_down(void *_param) { - struct take_cpu_down_param *param = _param; - int err; + struct cpuhp_cpu_state *st = this_cpu_ptr(&cpuhp_state); + enum cpuhp_state target = max((int)st->target, CPUHP_AP_OFFLINE); + int err, cpu = smp_processor_id(); /* Ensure this CPU doesn't handle any more interrupts. */ err = __cpu_disable(); if (err < 0) return err; - cpu_notify(CPU_DYING | param->mod, param->hcpu); + /* Invoke the former CPU_DYING callbacks */ + for (; st->state > target; st->state--) { + struct cpuhp_step *step = cpuhp_ap_states + st->state; + + cpuhp_invoke_callback(cpu, st->state, step->teardown); + } /* Give up timekeeping duties */ tick_handover_do_timer(); /* Park the stopper thread */ - stop_machine_park((long)param->hcpu); + stop_machine_park(cpu); return 0; } -/* Requires cpu_add_remove_lock to be held */ -static int _cpu_down(unsigned int cpu, int tasks_frozen) +static int takedown_cpu(unsigned int cpu) { - int err, nr_calls = 0; - void *hcpu = (void *)(long)cpu; - unsigned long mod = tasks_frozen ? 
CPU_TASKS_FROZEN : 0; - struct take_cpu_down_param tcd_param = { - .mod = mod, - .hcpu = hcpu, - }; - - if (num_online_cpus() == 1) - return -EBUSY; - - if (!cpu_online(cpu)) - return -EINVAL; - - cpu_hotplug_begin(); - - err = __cpu_notify(CPU_DOWN_PREPARE | mod, hcpu, -1, &nr_calls); - if (err) { - nr_calls--; - __cpu_notify(CPU_DOWN_FAILED | mod, hcpu, nr_calls, NULL); - pr_warn("%s: attempt to take down CPU %u failed\n", - __func__, cpu); - goto out_release; - } + struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu); + int err; /* * By now we've cleared cpu_active_mask, wait for all preempt-disabled @@ -378,6 +706,8 @@ static int _cpu_down(unsigned int cpu, int tasks_frozen) else synchronize_rcu(); + /* Park the smpboot threads */ + kthread_park(per_cpu_ptr(&cpuhp_state, cpu)->thread); smpboot_park_threads(cpu); /* @@ -389,12 +719,12 @@ static int _cpu_down(unsigned int cpu, int tasks_frozen) /* * So now all preempt/rcu users must observe !cpu_active(). */ - err = stop_machine(take_cpu_down, &tcd_param, cpumask_of(cpu)); + err = stop_machine(take_cpu_down, NULL, cpumask_of(cpu)); if (err) { /* CPU didn't die: tell everyone. Can't complain. */ - cpu_notify_nofail(CPU_DOWN_FAILED | mod, hcpu); + cpu_notify_nofail(CPU_DOWN_FAILED, cpu); irq_unlock_sparse(); - goto out_release; + return err; } BUG_ON(cpu_online(cpu)); @@ -405,10 +735,8 @@ static int _cpu_down(unsigned int cpu, int tasks_frozen) * * Wait for the stop thread to go away. */ - while (!per_cpu(cpu_dead_idle, cpu)) - cpu_relax(); - smp_mb(); /* Read from cpu_dead_idle before __cpu_die(). */ - per_cpu(cpu_dead_idle, cpu) = false; + wait_for_completion(&st->done); + BUG_ON(st->state != CPUHP_AP_IDLE_DEAD); /* Interrupts are moved away from the dying cpu, reenable alloc/free */ irq_unlock_sparse(); @@ -417,20 +745,104 @@ static int _cpu_down(unsigned int cpu, int tasks_frozen) /* This actually kills the CPU. */ __cpu_die(cpu); - /* CPU is completely dead: tell everyone. Too late to complain. */ tick_cleanup_dead_cpu(cpu); - cpu_notify_nofail(CPU_DEAD | mod, hcpu); - - check_for_tasks(cpu); - -out_release: - cpu_hotplug_done(); - if (!err) - cpu_notify_nofail(CPU_POST_DEAD | mod, hcpu); - return err; + return 0; } -int cpu_down(unsigned int cpu) +static int notify_dead(unsigned int cpu) +{ + cpu_notify_nofail(CPU_DEAD, cpu); + check_for_tasks(cpu); + return 0; +} + +static void cpuhp_complete_idle_dead(void *arg) +{ + struct cpuhp_cpu_state *st = arg; + + complete(&st->done); +} + +void cpuhp_report_idle_dead(void) +{ + struct cpuhp_cpu_state *st = this_cpu_ptr(&cpuhp_state); + + BUG_ON(st->state != CPUHP_AP_OFFLINE); + rcu_report_dead(smp_processor_id()); + st->state = CPUHP_AP_IDLE_DEAD; + /* + * We cannot call complete after rcu_report_dead() so we delegate it + * to an online cpu. 
+ */ + smp_call_function_single(cpumask_first(cpu_online_mask), + cpuhp_complete_idle_dead, st, 0); +} + +#else +#define notify_down_prepare NULL +#define takedown_cpu NULL +#define notify_dead NULL +#define notify_dying NULL +#endif + +#ifdef CONFIG_HOTPLUG_CPU + +/* Requires cpu_add_remove_lock to be held */ +static int __ref _cpu_down(unsigned int cpu, int tasks_frozen, + enum cpuhp_state target) +{ + struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu); + int prev_state, ret = 0; + bool hasdied = false; + + if (num_online_cpus() == 1) + return -EBUSY; + + if (!cpu_present(cpu)) + return -EINVAL; + + cpu_hotplug_begin(); + + cpuhp_tasks_frozen = tasks_frozen; + + prev_state = st->state; + st->target = target; + /* + * If the current CPU state is in the range of the AP hotplug thread, + * then we need to kick the thread. + */ + if (st->state > CPUHP_TEARDOWN_CPU) { + ret = cpuhp_kick_ap_work(cpu); + /* + * The AP side has done the error rollback already. Just + * return the error code.. + */ + if (ret) + goto out; + + /* + * We might have stopped still in the range of the AP hotplug + * thread. Nothing to do anymore. + */ + if (st->state > CPUHP_TEARDOWN_CPU) + goto out; + } + /* + * The AP brought itself down to CPUHP_TEARDOWN_CPU. So we need + * to do the further cleanups. + */ + ret = cpuhp_down_callbacks(cpu, st, cpuhp_bp_states, target); + + hasdied = prev_state != st->state && st->state == CPUHP_OFFLINE; +out: + cpu_hotplug_done(); + /* This post dead nonsense must die */ + if (!ret && hasdied) + cpu_notify_nofail(CPU_POST_DEAD, cpu); + return ret; +} + +static int do_cpu_down(unsigned int cpu, enum cpuhp_state target) { int err; @@ -441,100 +853,131 @@ int cpu_down(unsigned int cpu) goto out; } - err = _cpu_down(cpu, 0); + err = _cpu_down(cpu, 0, target); out: cpu_maps_update_done(); return err; } +int cpu_down(unsigned int cpu) +{ + return do_cpu_down(cpu, CPUHP_OFFLINE); +} EXPORT_SYMBOL(cpu_down); #endif /*CONFIG_HOTPLUG_CPU*/ -/* - * Unpark per-CPU smpboot kthreads at CPU-online time. +/** + * notify_cpu_starting(cpu) - call the CPU_STARTING notifiers + * @cpu: cpu that just started + * + * This function calls the cpu_chain notifiers with CPU_STARTING. + * It must be called by the arch code on the new cpu, before the new cpu + * enables interrupts and before the "boot" cpu returns from __cpu_up(). */ -static int smpboot_thread_call(struct notifier_block *nfb, - unsigned long action, void *hcpu) +void notify_cpu_starting(unsigned int cpu) { - int cpu = (long)hcpu; + struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu); + enum cpuhp_state target = min((int)st->target, CPUHP_AP_ONLINE); - switch (action & ~CPU_TASKS_FROZEN) { + while (st->state < target) { + struct cpuhp_step *step; - case CPU_DOWN_FAILED: - case CPU_ONLINE: - smpboot_unpark_threads(cpu); - break; - - default: - break; + st->state++; + step = cpuhp_ap_states + st->state; + cpuhp_invoke_callback(cpu, st->state, step->startup); } - - return NOTIFY_OK; } -static struct notifier_block smpboot_thread_notifier = { - .notifier_call = smpboot_thread_call, - .priority = CPU_PRI_SMPBOOT, -}; - -void smpboot_thread_init(void) +/* + * Called from the idle task. We need to set active here, so we can kick off + * the stopper thread and unpark the smpboot threads. If the target state is + * beyond CPUHP_AP_ONLINE_IDLE we kick cpuhp thread and let it bring up the + * cpu further. 
+ */ +void cpuhp_online_idle(enum cpuhp_state state) { - register_cpu_notifier(&smpboot_thread_notifier); + struct cpuhp_cpu_state *st = this_cpu_ptr(&cpuhp_state); + unsigned int cpu = smp_processor_id(); + + /* Happens for the boot cpu */ + if (state != CPUHP_AP_ONLINE_IDLE) + return; + + st->state = CPUHP_AP_ONLINE_IDLE; + + /* The cpu is marked online, set it active now */ + set_cpu_active(cpu, true); + /* Unpark the stopper thread and the hotplug thread of this cpu */ + stop_machine_unpark(cpu); + kthread_unpark(st->thread); + + /* Should we go further up ? */ + if (st->target > CPUHP_AP_ONLINE_IDLE) + __cpuhp_kick_ap_work(st); + else + complete(&st->done); } /* Requires cpu_add_remove_lock to be held */ -static int _cpu_up(unsigned int cpu, int tasks_frozen) +static int _cpu_up(unsigned int cpu, int tasks_frozen, enum cpuhp_state target) { - int ret, nr_calls = 0; - void *hcpu = (void *)(long)cpu; - unsigned long mod = tasks_frozen ? CPU_TASKS_FROZEN : 0; + struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu); struct task_struct *idle; + int ret = 0; cpu_hotplug_begin(); - if (cpu_online(cpu) || !cpu_present(cpu)) { + if (!cpu_present(cpu)) { ret = -EINVAL; goto out; } - idle = idle_thread_get(cpu); - if (IS_ERR(idle)) { - ret = PTR_ERR(idle); - goto out; - } - - ret = smpboot_create_threads(cpu); - if (ret) + /* + * The caller of do_cpu_up might have raced with another + * caller. Ignore it for now. + */ + if (st->state >= target) goto out; - ret = __cpu_notify(CPU_UP_PREPARE | mod, hcpu, -1, &nr_calls); - if (ret) { - nr_calls--; - pr_warn("%s: attempt to bring up CPU %u failed\n", - __func__, cpu); - goto out_notify; + if (st->state == CPUHP_OFFLINE) { + /* Let it fail before we try to bring the cpu up */ + idle = idle_thread_get(cpu); + if (IS_ERR(idle)) { + ret = PTR_ERR(idle); + goto out; + } } - /* Arch-specific enabling code. */ - ret = __cpu_up(cpu, idle); + cpuhp_tasks_frozen = tasks_frozen; - if (ret != 0) - goto out_notify; - BUG_ON(!cpu_online(cpu)); + st->target = target; + /* + * If the current CPU state is in the range of the AP hotplug thread, + * then we need to kick the thread once more. + */ + if (st->state > CPUHP_BRINGUP_CPU) { + ret = cpuhp_kick_ap_work(cpu); + /* + * The AP side has done the error rollback already. Just + * return the error code.. + */ + if (ret) + goto out; + } - /* Now call notifier in preparation. */ - cpu_notify(CPU_ONLINE | mod, hcpu); - -out_notify: - if (ret != 0) - __cpu_notify(CPU_UP_CANCELED | mod, hcpu, nr_calls, NULL); + /* + * Try to reach the target state. We max out on the BP at + * CPUHP_BRINGUP_CPU. After that the AP hotplug thread is + * responsible for bringing it up to the target state. 
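For illustration, a minimal userspace model of the walk described above: the boot processor (or the AP hotplug thread) advances the per-cpu state one step at a time toward the requested target, invoking each step's startup callback and rolling completed steps back on failure. The step names and the rollback policy below are simplified assumptions, not the kernel's actual state tables.

#include <stdio.h>

enum hp_state { HP_OFFLINE, HP_PREPARE, HP_BRINGUP, HP_ONLINE, HP_NR };

struct hp_step {
	const char *name;
	int (*startup)(unsigned int cpu);
	int (*teardown)(unsigned int cpu);
};

static int step_ok(unsigned int cpu) { (void)cpu; return 0; }

static struct hp_step steps[HP_NR] = {
	[HP_OFFLINE] = { "offline", NULL,    NULL },
	[HP_PREPARE] = { "prepare", step_ok, step_ok },
	[HP_BRINGUP] = { "bringup", step_ok, NULL },
	[HP_ONLINE]  = { "online",  NULL,    NULL },
};

/* Walk @cpu from *state toward @target; undo completed steps on error. */
static int model_up_callbacks(unsigned int cpu, int *state, int target)
{
	while (*state < target) {
		struct hp_step *next = &steps[*state + 1];

		if (next->startup && next->startup(cpu)) {
			for (; *state > HP_OFFLINE; (*state)--)
				if (steps[*state].teardown)
					steps[*state].teardown(cpu);
			return -1;
		}
		(*state)++;
		printf("cpu%u reached %s\n", cpu, steps[*state].name);
	}
	return 0;
}

int main(void)
{
	int state = HP_OFFLINE;

	return model_up_callbacks(0, &state, HP_ONLINE) ? 1 : 0;
}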
+ */ + target = min((int)target, CPUHP_BRINGUP_CPU); + ret = cpuhp_up_callbacks(cpu, st, cpuhp_bp_states, target); out: cpu_hotplug_done(); - return ret; } -int cpu_up(unsigned int cpu) +static int do_cpu_up(unsigned int cpu, enum cpuhp_state target) { int err = 0; @@ -558,12 +1001,16 @@ int cpu_up(unsigned int cpu) goto out; } - err = _cpu_up(cpu, 0); - + err = _cpu_up(cpu, 0, target); out: cpu_maps_update_done(); return err; } + +int cpu_up(unsigned int cpu) +{ + return do_cpu_up(cpu, CPUHP_ONLINE); +} EXPORT_SYMBOL_GPL(cpu_up); #ifdef CONFIG_PM_SLEEP_SMP @@ -586,7 +1033,7 @@ int disable_nonboot_cpus(void) if (cpu == first_cpu) continue; trace_suspend_resume(TPS("CPU_OFF"), cpu, true); - error = _cpu_down(cpu, 1); + error = _cpu_down(cpu, 1, CPUHP_OFFLINE); trace_suspend_resume(TPS("CPU_OFF"), cpu, false); if (!error) cpumask_set_cpu(cpu, frozen_cpus); @@ -636,7 +1083,7 @@ void enable_nonboot_cpus(void) for_each_cpu(cpu, frozen_cpus) { trace_suspend_resume(TPS("CPU_ON"), cpu, true); - error = _cpu_up(cpu, 1); + error = _cpu_up(cpu, 1, CPUHP_ONLINE); trace_suspend_resume(TPS("CPU_ON"), cpu, false); if (!error) { pr_info("CPU%d is up\n", cpu); @@ -709,26 +1156,463 @@ core_initcall(cpu_hotplug_pm_sync_init); #endif /* CONFIG_PM_SLEEP_SMP */ -/** - * notify_cpu_starting(cpu) - call the CPU_STARTING notifiers - * @cpu: cpu that just started - * - * This function calls the cpu_chain notifiers with CPU_STARTING. - * It must be called by the arch code on the new cpu, before the new cpu - * enables interrupts and before the "boot" cpu returns from __cpu_up(). - */ -void notify_cpu_starting(unsigned int cpu) -{ - unsigned long val = CPU_STARTING; +#endif /* CONFIG_SMP */ -#ifdef CONFIG_PM_SLEEP_SMP - if (frozen_cpus != NULL && cpumask_test_cpu(cpu, frozen_cpus)) - val = CPU_STARTING_FROZEN; -#endif /* CONFIG_PM_SLEEP_SMP */ - cpu_notify(val, (void *)(long)cpu); +/* Boot processor state steps */ +static struct cpuhp_step cpuhp_bp_states[] = { + [CPUHP_OFFLINE] = { + .name = "offline", + .startup = NULL, + .teardown = NULL, + }, +#ifdef CONFIG_SMP + [CPUHP_CREATE_THREADS]= { + .name = "threads:create", + .startup = smpboot_create_threads, + .teardown = NULL, + .cant_stop = true, + }, + /* + * Preparatory and dead notifiers. Will be replaced once the notifiers + * are converted to states. + */ + [CPUHP_NOTIFY_PREPARE] = { + .name = "notify:prepare", + .startup = notify_prepare, + .teardown = notify_dead, + .skip_onerr = true, + .cant_stop = true, + }, + /* Kicks the plugged cpu into life */ + [CPUHP_BRINGUP_CPU] = { + .name = "cpu:bringup", + .startup = bringup_cpu, + .teardown = NULL, + .cant_stop = true, + }, + /* + * Handled on controll processor until the plugged processor manages + * this itself. + */ + [CPUHP_TEARDOWN_CPU] = { + .name = "cpu:teardown", + .startup = NULL, + .teardown = takedown_cpu, + .cant_stop = true, + }, +#endif +}; + +/* Application processor state steps */ +static struct cpuhp_step cpuhp_ap_states[] = { +#ifdef CONFIG_SMP + /* Final state before CPU kills itself */ + [CPUHP_AP_IDLE_DEAD] = { + .name = "idle:dead", + }, + /* + * Last state before CPU enters the idle loop to die. Transient state + * for synchronization. + */ + [CPUHP_AP_OFFLINE] = { + .name = "ap:offline", + .cant_stop = true, + }, + /* + * Low level startup/teardown notifiers. Run with interrupts + * disabled. Will be removed once the notifiers are converted to + * states. 
+ */ + [CPUHP_AP_NOTIFY_STARTING] = { + .name = "notify:starting", + .startup = notify_starting, + .teardown = notify_dying, + .skip_onerr = true, + .cant_stop = true, + }, + /* Entry state on starting. Interrupts enabled from here on. Transient + * state for synchronsization */ + [CPUHP_AP_ONLINE] = { + .name = "ap:online", + }, + /* Handle smpboot threads park/unpark */ + [CPUHP_AP_SMPBOOT_THREADS] = { + .name = "smpboot:threads", + .startup = smpboot_unpark_threads, + .teardown = NULL, + }, + /* + * Online/down_prepare notifiers. Will be removed once the notifiers + * are converted to states. + */ + [CPUHP_AP_NOTIFY_ONLINE] = { + .name = "notify:online", + .startup = notify_online, + .teardown = notify_down_prepare, + }, +#endif + /* + * The dynamically registered state space is here + */ + + /* CPU is fully up and running. */ + [CPUHP_ONLINE] = { + .name = "online", + .startup = NULL, + .teardown = NULL, + }, +}; + +/* Sanity check for callbacks */ +static int cpuhp_cb_check(enum cpuhp_state state) +{ + if (state <= CPUHP_OFFLINE || state >= CPUHP_ONLINE) + return -EINVAL; + return 0; } -#endif /* CONFIG_SMP */ +static bool cpuhp_is_ap_state(enum cpuhp_state state) +{ + /* + * The extra check for CPUHP_TEARDOWN_CPU is only for documentation + * purposes as that state is handled explicitely in cpu_down. + */ + return state > CPUHP_BRINGUP_CPU && state != CPUHP_TEARDOWN_CPU; +} + +static struct cpuhp_step *cpuhp_get_step(enum cpuhp_state state) +{ + struct cpuhp_step *sp; + + sp = cpuhp_is_ap_state(state) ? cpuhp_ap_states : cpuhp_bp_states; + return sp + state; +} + +static void cpuhp_store_callbacks(enum cpuhp_state state, + const char *name, + int (*startup)(unsigned int cpu), + int (*teardown)(unsigned int cpu)) +{ + /* (Un)Install the callbacks for further cpu hotplug operations */ + struct cpuhp_step *sp; + + mutex_lock(&cpuhp_state_mutex); + sp = cpuhp_get_step(state); + sp->startup = startup; + sp->teardown = teardown; + sp->name = name; + mutex_unlock(&cpuhp_state_mutex); +} + +static void *cpuhp_get_teardown_cb(enum cpuhp_state state) +{ + return cpuhp_get_step(state)->teardown; +} + +/* + * Call the startup/teardown function for a step either on the AP or + * on the current CPU. + */ +static int cpuhp_issue_call(int cpu, enum cpuhp_state state, + int (*cb)(unsigned int), bool bringup) +{ + int ret; + + if (!cb) + return 0; + /* + * The non AP bound callbacks can fail on bringup. On teardown + * e.g. module removal we crash for now. + */ +#ifdef CONFIG_SMP + if (cpuhp_is_ap_state(state)) + ret = cpuhp_invoke_ap_callback(cpu, state, cb); + else + ret = cpuhp_invoke_callback(cpu, state, cb); +#else + ret = cpuhp_invoke_callback(cpu, state, cb); +#endif + BUG_ON(ret && !bringup); + return ret; +} + +/* + * Called from __cpuhp_setup_state on a recoverable failure. + * + * Note: The teardown callbacks for rollback are not allowed to fail! + */ +static void cpuhp_rollback_install(int failedcpu, enum cpuhp_state state, + int (*teardown)(unsigned int cpu)) +{ + int cpu; + + if (!teardown) + return; + + /* Roll back the already executed steps on the other cpus */ + for_each_present_cpu(cpu) { + struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu); + int cpustate = st->state; + + if (cpu >= failedcpu) + break; + + /* Did we invoke the startup call on that cpu ? */ + if (cpustate >= state) + cpuhp_issue_call(cpu, state, teardown, false); + } +} + +/* + * Returns a free for dynamic slot assignment of the Online state. 
The states + * are protected by the cpuhp_slot_states mutex and an empty slot is identified + * by having no name assigned. + */ +static int cpuhp_reserve_state(enum cpuhp_state state) +{ + enum cpuhp_state i; + + mutex_lock(&cpuhp_state_mutex); + for (i = CPUHP_AP_ONLINE_DYN; i <= CPUHP_AP_ONLINE_DYN_END; i++) { + if (cpuhp_ap_states[i].name) + continue; + + cpuhp_ap_states[i].name = "Reserved"; + mutex_unlock(&cpuhp_state_mutex); + return i; + } + mutex_unlock(&cpuhp_state_mutex); + WARN(1, "No more dynamic states available for CPU hotplug\n"); + return -ENOSPC; +} + +/** + * __cpuhp_setup_state - Setup the callbacks for an hotplug machine state + * @state: The state to setup + * @invoke: If true, the startup function is invoked for cpus where + * cpu state >= @state + * @startup: startup callback function + * @teardown: teardown callback function + * + * Returns 0 if successful, otherwise a proper error code + */ +int __cpuhp_setup_state(enum cpuhp_state state, + const char *name, bool invoke, + int (*startup)(unsigned int cpu), + int (*teardown)(unsigned int cpu)) +{ + int cpu, ret = 0; + int dyn_state = 0; + + if (cpuhp_cb_check(state) || !name) + return -EINVAL; + + get_online_cpus(); + + /* currently assignments for the ONLINE state are possible */ + if (state == CPUHP_AP_ONLINE_DYN) { + dyn_state = 1; + ret = cpuhp_reserve_state(state); + if (ret < 0) + goto out; + state = ret; + } + + cpuhp_store_callbacks(state, name, startup, teardown); + + if (!invoke || !startup) + goto out; + + /* + * Try to call the startup callback for each present cpu + * depending on the hotplug state of the cpu. + */ + for_each_present_cpu(cpu) { + struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu); + int cpustate = st->state; + + if (cpustate < state) + continue; + + ret = cpuhp_issue_call(cpu, state, startup, true); + if (ret) { + cpuhp_rollback_install(cpu, state, teardown); + cpuhp_store_callbacks(state, NULL, NULL, NULL); + goto out; + } + } +out: + put_online_cpus(); + if (!ret && dyn_state) + return state; + return ret; +} +EXPORT_SYMBOL(__cpuhp_setup_state); + +/** + * __cpuhp_remove_state - Remove the callbacks for an hotplug machine state + * @state: The state to remove + * @invoke: If true, the teardown function is invoked for cpus where + * cpu state >= @state + * + * The teardown callback is currently not allowed to fail. Think + * about module removal! + */ +void __cpuhp_remove_state(enum cpuhp_state state, bool invoke) +{ + int (*teardown)(unsigned int cpu) = cpuhp_get_teardown_cb(state); + int cpu; + + BUG_ON(cpuhp_cb_check(state)); + + get_online_cpus(); + + if (!invoke || !teardown) + goto remove; + + /* + * Call the teardown callback for each present cpu depending + * on the hotplug state of the cpu. This function is not + * allowed to fail currently! 
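A minimal sketch of how a module might use the dynamic state range behind this interface. It assumes the CPUHP_AP_ONLINE_DYN constant and the <linux/cpuhotplug.h> header introduced elsewhere in this series; the foo_* callbacks and names are placeholders.

#include <linux/cpu.h>
#include <linux/cpuhotplug.h>
#include <linux/kernel.h>
#include <linux/module.h>

static int foo_hp_state;	/* dynamically assigned state number */

static int foo_cpu_online(unsigned int cpu)
{
	pr_info("foo: cpu%u came online\n", cpu);
	return 0;		/* a non-zero return rolls the install back */
}

static int foo_cpu_offline(unsigned int cpu)
{
	pr_info("foo: cpu%u going down\n", cpu);
	return 0;		/* teardown callbacks must not fail */
}

static int __init foo_init(void)
{
	int ret;

	/* invoke=true runs the startup callback on every CPU already online */
	ret = __cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "foo:online", true,
				  foo_cpu_online, foo_cpu_offline);
	if (ret < 0)
		return ret;
	foo_hp_state = ret;	/* remember the reserved slot */
	return 0;
}

static void __exit foo_exit(void)
{
	__cpuhp_remove_state(foo_hp_state, true);
}

module_init(foo_init);
module_exit(foo_exit);
MODULE_LICENSE("GPL");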
+ */ + for_each_present_cpu(cpu) { + struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu); + int cpustate = st->state; + + if (cpustate >= state) + cpuhp_issue_call(cpu, state, teardown, false); + } +remove: + cpuhp_store_callbacks(state, NULL, NULL, NULL); + put_online_cpus(); +} +EXPORT_SYMBOL(__cpuhp_remove_state); + +#if defined(CONFIG_SYSFS) && defined(CONFIG_HOTPLUG_CPU) +static ssize_t show_cpuhp_state(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, dev->id); + + return sprintf(buf, "%d\n", st->state); +} +static DEVICE_ATTR(state, 0444, show_cpuhp_state, NULL); + +static ssize_t write_cpuhp_target(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t count) +{ + struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, dev->id); + struct cpuhp_step *sp; + int target, ret; + + ret = kstrtoint(buf, 10, &target); + if (ret) + return ret; + +#ifdef CONFIG_CPU_HOTPLUG_STATE_CONTROL + if (target < CPUHP_OFFLINE || target > CPUHP_ONLINE) + return -EINVAL; +#else + if (target != CPUHP_OFFLINE && target != CPUHP_ONLINE) + return -EINVAL; +#endif + + ret = lock_device_hotplug_sysfs(); + if (ret) + return ret; + + mutex_lock(&cpuhp_state_mutex); + sp = cpuhp_get_step(target); + ret = !sp->name || sp->cant_stop ? -EINVAL : 0; + mutex_unlock(&cpuhp_state_mutex); + if (ret) + return ret; + + if (st->state < target) + ret = do_cpu_up(dev->id, target); + else + ret = do_cpu_down(dev->id, target); + + unlock_device_hotplug(); + return ret ? ret : count; +} + +static ssize_t show_cpuhp_target(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, dev->id); + + return sprintf(buf, "%d\n", st->target); +} +static DEVICE_ATTR(target, 0644, show_cpuhp_target, write_cpuhp_target); + +static struct attribute *cpuhp_cpu_attrs[] = { + &dev_attr_state.attr, + &dev_attr_target.attr, + NULL +}; + +static struct attribute_group cpuhp_cpu_attr_group = { + .attrs = cpuhp_cpu_attrs, + .name = "hotplug", + NULL +}; + +static ssize_t show_cpuhp_states(struct device *dev, + struct device_attribute *attr, char *buf) +{ + ssize_t cur, res = 0; + int i; + + mutex_lock(&cpuhp_state_mutex); + for (i = CPUHP_OFFLINE; i <= CPUHP_ONLINE; i++) { + struct cpuhp_step *sp = cpuhp_get_step(i); + + if (sp->name) { + cur = sprintf(buf, "%3d: %s\n", i, sp->name); + buf += cur; + res += cur; + } + } + mutex_unlock(&cpuhp_state_mutex); + return res; +} +static DEVICE_ATTR(states, 0444, show_cpuhp_states, NULL); + +static struct attribute *cpuhp_cpu_root_attrs[] = { + &dev_attr_states.attr, + NULL +}; + +static struct attribute_group cpuhp_cpu_root_attr_group = { + .attrs = cpuhp_cpu_root_attrs, + .name = "hotplug", + NULL +}; + +static int __init cpuhp_sysfs_init(void) +{ + int cpu, ret; + + ret = sysfs_create_group(&cpu_subsys.dev_root->kobj, + &cpuhp_cpu_root_attr_group); + if (ret) + return ret; + + for_each_possible_cpu(cpu) { + struct device *dev = get_cpu_device(cpu); + + if (!dev) + continue; + ret = sysfs_create_group(&dev->kobj, &cpuhp_cpu_attr_group); + if (ret) + return ret; + } + return 0; +} +device_initcall(cpuhp_sysfs_init); +#endif /* * cpu_bit_bitmap[] is a special, "compressed" data structure that @@ -789,3 +1673,25 @@ void init_cpu_online(const struct cpumask *src) { cpumask_copy(&__cpu_online_mask, src); } + +/* + * Activate the first processor. 
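The attribute groups above surface as /sys/devices/system/cpu/cpuN/hotplug/{state,target} plus a global /sys/devices/system/cpu/hotplug/states listing. Below is a small userspace sketch that drives a CPU through the 'target' file; the path and the assumption that CPUHP_OFFLINE is state 0 follow from these hunks but are not spelled out here, and without CONFIG_CPU_HOTPLUG_STATE_CONTROL only the offline and online targets are accepted.

#include <stdio.h>

int main(void)
{
	/* Path and value are assumptions; needs root and this patch applied. */
	FILE *f = fopen("/sys/devices/system/cpu/cpu1/hotplug/target", "w");

	if (!f) {
		perror("hotplug/target");
		return 1;
	}
	fprintf(f, "0\n");	/* 0 == CPUHP_OFFLINE: take cpu1 fully down */
	return fclose(f) ? 1 : 0;
}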
+ */ +void __init boot_cpu_init(void) +{ + int cpu = smp_processor_id(); + + /* Mark the boot cpu "present", "online" etc for SMP and UP case */ + set_cpu_online(cpu, true); + set_cpu_active(cpu, true); + set_cpu_present(cpu, true); + set_cpu_possible(cpu, true); +} + +/* + * Must be called _AFTER_ setting up the per_cpu areas + */ +void __init boot_cpu_state_init(void) +{ + per_cpu_ptr(&cpuhp_state, smp_processor_id())->state = CPUHP_ONLINE; +} diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 0167679182c0..5f6ce931f1ea 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -1178,6 +1178,7 @@ static struct xol_area *__create_xol_area(unsigned long vaddr) goto free_area; area->xol_mapping.name = "[uprobes]"; + area->xol_mapping.fault = NULL; area->xol_mapping.pages = area->pages; area->pages[0] = alloc_page(GFP_HIGHUSER); if (!area->pages[0]) diff --git a/kernel/irq/Kconfig b/kernel/irq/Kconfig index 3b48dab80164..3bbfd6a9c475 100644 --- a/kernel/irq/Kconfig +++ b/kernel/irq/Kconfig @@ -64,6 +64,10 @@ config IRQ_DOMAIN_HIERARCHY bool select IRQ_DOMAIN +# Generic IRQ IPI support +config GENERIC_IRQ_IPI + bool + # Generic MSI interrupt support config GENERIC_MSI_IRQ bool diff --git a/kernel/irq/Makefile b/kernel/irq/Makefile index 2fc9cbdf35b6..2ee42e95a3ce 100644 --- a/kernel/irq/Makefile +++ b/kernel/irq/Makefile @@ -8,3 +8,4 @@ obj-$(CONFIG_GENERIC_PENDING_IRQ) += migration.o obj-$(CONFIG_GENERIC_IRQ_MIGRATION) += cpuhotplug.o obj-$(CONFIG_PM_SLEEP) += pm.o obj-$(CONFIG_GENERIC_MSI_IRQ) += msi.o +obj-$(CONFIG_GENERIC_IRQ_IPI) += ipi.o diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index 5797909f4e5b..2f9f2b0e79f2 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -961,6 +961,7 @@ void irq_chip_mask_parent(struct irq_data *data) data = data->parent_data; data->chip->irq_mask(data); } +EXPORT_SYMBOL_GPL(irq_chip_mask_parent); /** * irq_chip_unmask_parent - Unmask the parent interrupt @@ -971,6 +972,7 @@ void irq_chip_unmask_parent(struct irq_data *data) data = data->parent_data; data->chip->irq_unmask(data); } +EXPORT_SYMBOL_GPL(irq_chip_unmask_parent); /** * irq_chip_eoi_parent - Invoke EOI on the parent interrupt @@ -981,6 +983,7 @@ void irq_chip_eoi_parent(struct irq_data *data) data = data->parent_data; data->chip->irq_eoi(data); } +EXPORT_SYMBOL_GPL(irq_chip_eoi_parent); /** * irq_chip_set_affinity_parent - Set affinity on the parent interrupt @@ -1016,6 +1019,7 @@ int irq_chip_set_type_parent(struct irq_data *data, unsigned int type) return -ENOSYS; } +EXPORT_SYMBOL_GPL(irq_chip_set_type_parent); /** * irq_chip_retrigger_hierarchy - Retrigger an interrupt in hardware diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c index 57bff7857e87..a15b5485b446 100644 --- a/kernel/irq/handle.c +++ b/kernel/irq/handle.c @@ -136,10 +136,9 @@ irqreturn_t handle_irq_event_percpu(struct irq_desc *desc) { irqreturn_t retval = IRQ_NONE; unsigned int flags = 0, irq = desc->irq_data.irq; - struct irqaction *action = desc->action; + struct irqaction *action; - /* action might have become NULL since we dropped the lock */ - while (action) { + for_each_action_of_desc(desc, action) { irqreturn_t res; trace_irq_handler_entry(irq, action); @@ -173,7 +172,6 @@ irqreturn_t handle_irq_event_percpu(struct irq_desc *desc) } retval |= res; - action = action->next; } add_interrupt_randomness(irq, flags); diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h index fcab63c66905..09be2c903c6d 100644 --- a/kernel/irq/internals.h +++ b/kernel/irq/internals.h @@ -131,6 
+131,9 @@ static inline void chip_bus_sync_unlock(struct irq_desc *desc) #define IRQ_GET_DESC_CHECK_GLOBAL (_IRQ_DESC_CHECK) #define IRQ_GET_DESC_CHECK_PERCPU (_IRQ_DESC_CHECK | _IRQ_DESC_PERCPU) +#define for_each_action_of_desc(desc, act) \ + for (act = desc->act; act; act = act->next) + struct irq_desc * __irq_get_desc_lock(unsigned int irq, unsigned long *flags, bool bus, unsigned int check); @@ -160,6 +163,8 @@ irq_put_desc_unlock(struct irq_desc *desc, unsigned long flags) __irq_put_desc_unlock(desc, flags, false); } +#define __irqd_to_state(d) ACCESS_PRIVATE((d)->common, state_use_accessors) + /* * Manipulation functions for irq_data.state */ @@ -188,6 +193,8 @@ static inline bool irqd_has_set(struct irq_data *d, unsigned int mask) return __irqd_to_state(d) & mask; } +#undef __irqd_to_state + static inline void kstat_incr_irqs_this_cpu(struct irq_desc *desc) { __this_cpu_inc(*desc->kstat_irqs); diff --git a/kernel/irq/ipi.c b/kernel/irq/ipi.c new file mode 100644 index 000000000000..c37f34b00a11 --- /dev/null +++ b/kernel/irq/ipi.c @@ -0,0 +1,326 @@ +/* + * linux/kernel/irq/ipi.c + * + * Copyright (C) 2015 Imagination Technologies Ltd + * Author: Qais Yousef + * + * This file contains driver APIs to the IPI subsystem. + */ + +#define pr_fmt(fmt) "genirq/ipi: " fmt + +#include +#include + +/** + * irq_reserve_ipi() - Setup an IPI to destination cpumask + * @domain: IPI domain + * @dest: cpumask of cpus which can receive the IPI + * + * Allocate a virq that can be used to send IPI to any CPU in dest mask. + * + * On success it'll return linux irq number and 0 on failure + */ +unsigned int irq_reserve_ipi(struct irq_domain *domain, + const struct cpumask *dest) +{ + unsigned int nr_irqs, offset; + struct irq_data *data; + int virq, i; + + if (!domain ||!irq_domain_is_ipi(domain)) { + pr_warn("Reservation on a non IPI domain\n"); + return 0; + } + + if (!cpumask_subset(dest, cpu_possible_mask)) { + pr_warn("Reservation is not in possible_cpu_mask\n"); + return 0; + } + + nr_irqs = cpumask_weight(dest); + if (!nr_irqs) { + pr_warn("Reservation for empty destination mask\n"); + return 0; + } + + if (irq_domain_is_ipi_single(domain)) { + /* + * If the underlying implementation uses a single HW irq on + * all cpus then we only need a single Linux irq number for + * it. We have no restrictions vs. the destination mask. The + * underlying implementation can deal with holes nicely. + */ + nr_irqs = 1; + offset = 0; + } else { + unsigned int next; + + /* + * The IPI requires a seperate HW irq on each CPU. We require + * that the destination mask is consecutive. If an + * implementation needs to support holes, it can reserve + * several IPI ranges. + */ + offset = cpumask_first(dest); + /* + * Find a hole and if found look for another set bit after the + * hole. For now we don't support this scenario. 
+ */ + next = cpumask_next_zero(offset, dest); + if (next < nr_cpu_ids) + next = cpumask_next(next, dest); + if (next < nr_cpu_ids) { + pr_warn("Destination mask has holes\n"); + return 0; + } + } + + virq = irq_domain_alloc_descs(-1, nr_irqs, 0, NUMA_NO_NODE); + if (virq <= 0) { + pr_warn("Can't reserve IPI, failed to alloc descs\n"); + return 0; + } + + virq = __irq_domain_alloc_irqs(domain, virq, nr_irqs, NUMA_NO_NODE, + (void *) dest, true); + + if (virq <= 0) { + pr_warn("Can't reserve IPI, failed to alloc hw irqs\n"); + goto free_descs; + } + + for (i = 0; i < nr_irqs; i++) { + data = irq_get_irq_data(virq + i); + cpumask_copy(data->common->affinity, dest); + data->common->ipi_offset = offset; + } + return virq; + +free_descs: + irq_free_descs(virq, nr_irqs); + return 0; +} + +/** + * irq_destroy_ipi() - unreserve an IPI that was previously allocated + * @irq: linux irq number to be destroyed + * + * Return the IPIs allocated with irq_reserve_ipi() to the system destroying + * all virqs associated with them. + */ +void irq_destroy_ipi(unsigned int irq) +{ + struct irq_data *data = irq_get_irq_data(irq); + struct cpumask *ipimask = data ? irq_data_get_affinity_mask(data) : NULL; + struct irq_domain *domain; + unsigned int nr_irqs; + + if (!irq || !data || !ipimask) + return; + + domain = data->domain; + if (WARN_ON(domain == NULL)) + return; + + if (!irq_domain_is_ipi(domain)) { + pr_warn("Trying to destroy a non IPI domain!\n"); + return; + } + + if (irq_domain_is_ipi_per_cpu(domain)) + nr_irqs = cpumask_weight(ipimask); + else + nr_irqs = 1; + + irq_domain_free_irqs(irq, nr_irqs); +} + +/** + * ipi_get_hwirq - Get the hwirq associated with an IPI to a cpu + * @irq: linux irq number + * @cpu: the target cpu + * + * When dealing with coprocessors IPI, we need to inform the coprocessor of + * the hwirq it needs to use to receive and send IPIs. + * + * Returns hwirq value on success and INVALID_HWIRQ on failure. + */ +irq_hw_number_t ipi_get_hwirq(unsigned int irq, unsigned int cpu) +{ + struct irq_data *data = irq_get_irq_data(irq); + struct cpumask *ipimask = data ? irq_data_get_affinity_mask(data) : NULL; + + if (!data || !ipimask || cpu > nr_cpu_ids) + return INVALID_HWIRQ; + + if (!cpumask_test_cpu(cpu, ipimask)) + return INVALID_HWIRQ; + + /* + * Get the real hardware irq number if the underlying implementation + * uses a seperate irq per cpu. If the underlying implementation uses + * a single hardware irq for all cpus then the IPI send mechanism + * needs to take care of the cpu destinations. + */ + if (irq_domain_is_ipi_per_cpu(data->domain)) + data = irq_get_irq_data(irq + cpu - data->common->ipi_offset); + + return data ? irqd_to_hwirq(data) : INVALID_HWIRQ; +} +EXPORT_SYMBOL_GPL(ipi_get_hwirq); + +static int ipi_send_verify(struct irq_chip *chip, struct irq_data *data, + const struct cpumask *dest, unsigned int cpu) +{ + struct cpumask *ipimask = irq_data_get_affinity_mask(data); + + if (!chip || !ipimask) + return -EINVAL; + + if (!chip->ipi_send_single && !chip->ipi_send_mask) + return -EINVAL; + + if (cpu > nr_cpu_ids) + return -EINVAL; + + if (dest) { + if (!cpumask_subset(dest, ipimask)) + return -EINVAL; + } else { + if (!cpumask_test_cpu(cpu, ipimask)) + return -EINVAL; + } + return 0; +} + +/** + * __ipi_send_single - send an IPI to a target Linux SMP CPU + * @desc: pointer to irq_desc of the IRQ + * @cpu: destination CPU, must in the destination mask passed to + * irq_reserve_ipi() + * + * This function is for architecture or core code to speed up IPI sending. 
Not + * usable from driver code. + * + * Returns zero on success and negative error number on failure. + */ +int __ipi_send_single(struct irq_desc *desc, unsigned int cpu) +{ + struct irq_data *data = irq_desc_get_irq_data(desc); + struct irq_chip *chip = irq_data_get_irq_chip(data); + +#ifdef DEBUG + /* + * Minimise the overhead by omitting the checks for Linux SMP IPIs. + * Since the callers should be arch or core code which is generally + * trusted, only check for errors when debugging. + */ + if (WARN_ON_ONCE(ipi_send_verify(chip, data, NULL, cpu))) + return -EINVAL; +#endif + if (!chip->ipi_send_single) { + chip->ipi_send_mask(data, cpumask_of(cpu)); + return 0; + } + + /* FIXME: Store this information in irqdata flags */ + if (irq_domain_is_ipi_per_cpu(data->domain) && + cpu != data->common->ipi_offset) { + /* use the correct data for that cpu */ + unsigned irq = data->irq + cpu - data->common->ipi_offset; + + data = irq_get_irq_data(irq); + } + chip->ipi_send_single(data, cpu); + return 0; +} + +/** + * ipi_send_mask - send an IPI to target Linux SMP CPU(s) + * @desc: pointer to irq_desc of the IRQ + * @dest: dest CPU(s), must be a subset of the mask passed to + * irq_reserve_ipi() + * + * This function is for architecture or core code to speed up IPI sending. Not + * usable from driver code. + * + * Returns zero on success and negative error number on failure. + */ +int __ipi_send_mask(struct irq_desc *desc, const struct cpumask *dest) +{ + struct irq_data *data = irq_desc_get_irq_data(desc); + struct irq_chip *chip = irq_data_get_irq_chip(data); + unsigned int cpu; + +#ifdef DEBUG + /* + * Minimise the overhead by omitting the checks for Linux SMP IPIs. + * Since the callers should be arch or core code which is generally + * trusted, only check for errors when debugging. + */ + if (WARN_ON_ONCE(ipi_send_verify(chip, data, dest, 0))) + return -EINVAL; +#endif + if (chip->ipi_send_mask) { + chip->ipi_send_mask(data, dest); + return 0; + } + + if (irq_domain_is_ipi_per_cpu(data->domain)) { + unsigned int base = data->irq; + + for_each_cpu(cpu, dest) { + unsigned irq = base + cpu - data->common->ipi_offset; + + data = irq_get_irq_data(irq); + chip->ipi_send_single(data, cpu); + } + } else { + for_each_cpu(cpu, dest) + chip->ipi_send_single(data, cpu); + } + return 0; +} + +/** + * ipi_send_single - Send an IPI to a single CPU + * @virq: linux irq number from irq_reserve_ipi() + * @cpu: destination CPU, must in the destination mask passed to + * irq_reserve_ipi() + * + * Returns zero on success and negative error number on failure. + */ +int ipi_send_single(unsigned int virq, unsigned int cpu) +{ + struct irq_desc *desc = irq_to_desc(virq); + struct irq_data *data = desc ? irq_desc_get_irq_data(desc) : NULL; + struct irq_chip *chip = data ? irq_data_get_irq_chip(data) : NULL; + + if (WARN_ON_ONCE(ipi_send_verify(chip, data, NULL, cpu))) + return -EINVAL; + + return __ipi_send_single(desc, cpu); +} +EXPORT_SYMBOL_GPL(ipi_send_single); + +/** + * ipi_send_mask - Send an IPI to target CPU(s) + * @virq: linux irq number from irq_reserve_ipi() + * @dest: dest CPU(s), must be a subset of the mask passed to + * irq_reserve_ipi() + * + * Returns zero on success and negative error number on failure. + */ +int ipi_send_mask(unsigned int virq, const struct cpumask *dest) +{ + struct irq_desc *desc = irq_to_desc(virq); + struct irq_data *data = desc ? irq_desc_get_irq_data(desc) : NULL; + struct irq_chip *chip = data ? 
irq_data_get_irq_chip(data) : NULL; + + if (WARN_ON_ONCE(ipi_send_verify(chip, data, dest, 0))) + return -EINVAL; + + return __ipi_send_mask(desc, dest); +} +EXPORT_SYMBOL_GPL(ipi_send_mask); diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c index 0409da0bcc33..0ccd028817d7 100644 --- a/kernel/irq/irqdesc.c +++ b/kernel/irq/irqdesc.c @@ -24,10 +24,27 @@ static struct lock_class_key irq_desc_lock_class; #if defined(CONFIG_SMP) +static int __init irq_affinity_setup(char *str) +{ + zalloc_cpumask_var(&irq_default_affinity, GFP_NOWAIT); + cpulist_parse(str, irq_default_affinity); + /* + * Set at least the boot cpu. We don't want to end up with + * bugreports caused by random comandline masks + */ + cpumask_set_cpu(smp_processor_id(), irq_default_affinity); + return 1; +} +__setup("irqaffinity=", irq_affinity_setup); + static void __init init_irq_default_affinity(void) { - alloc_cpumask_var(&irq_default_affinity, GFP_NOWAIT); - cpumask_setall(irq_default_affinity); +#ifdef CONFIG_CPUMASK_OFFSTACK + if (!irq_default_affinity) + zalloc_cpumask_var(&irq_default_affinity, GFP_NOWAIT); +#endif + if (cpumask_empty(irq_default_affinity)) + cpumask_setall(irq_default_affinity); } #else static void __init init_irq_default_affinity(void) diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c index 3e56d2f03e24..3a519a01118b 100644 --- a/kernel/irq/irqdomain.c +++ b/kernel/irq/irqdomain.c @@ -23,8 +23,6 @@ static DEFINE_MUTEX(irq_domain_mutex); static DEFINE_MUTEX(revmap_trees_mutex); static struct irq_domain *irq_default_domain; -static int irq_domain_alloc_descs(int virq, unsigned int nr_irqs, - irq_hw_number_t hwirq, int node); static void irq_domain_check_hierarchy(struct irq_domain *domain); struct irqchip_fwid { @@ -840,8 +838,8 @@ const struct irq_domain_ops irq_domain_simple_ops = { }; EXPORT_SYMBOL_GPL(irq_domain_simple_ops); -static int irq_domain_alloc_descs(int virq, unsigned int cnt, - irq_hw_number_t hwirq, int node) +int irq_domain_alloc_descs(int virq, unsigned int cnt, irq_hw_number_t hwirq, + int node) { unsigned int hint; @@ -895,6 +893,7 @@ struct irq_domain *irq_domain_create_hierarchy(struct irq_domain *parent, return domain; } +EXPORT_SYMBOL_GPL(irq_domain_create_hierarchy); static void irq_domain_insert_irq(int virq) { @@ -1045,6 +1044,7 @@ int irq_domain_set_hwirq_and_chip(struct irq_domain *domain, unsigned int virq, return 0; } +EXPORT_SYMBOL_GPL(irq_domain_set_hwirq_and_chip); /** * irq_domain_set_info - Set the complete data for a @virq in @domain @@ -1078,6 +1078,7 @@ void irq_domain_reset_irq_data(struct irq_data *irq_data) irq_data->chip = &no_irq_chip; irq_data->chip_data = NULL; } +EXPORT_SYMBOL_GPL(irq_domain_reset_irq_data); /** * irq_domain_free_irqs_common - Clear irq_data and free the parent @@ -1275,6 +1276,7 @@ int irq_domain_alloc_irqs_parent(struct irq_domain *domain, nr_irqs, arg); return -ENOSYS; } +EXPORT_SYMBOL_GPL(irq_domain_alloc_irqs_parent); /** * irq_domain_free_irqs_parent - Free interrupts from parent domain @@ -1292,6 +1294,7 @@ void irq_domain_free_irqs_parent(struct irq_domain *domain, irq_domain_free_irqs_recursive(domain->parent, irq_base, nr_irqs); } +EXPORT_SYMBOL_GPL(irq_domain_free_irqs_parent); /** * irq_domain_activate_irq - Call domain_ops->activate recursively to activate diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 841187239adc..3ddd2297ee95 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -144,13 +144,11 @@ int irq_can_set_affinity(unsigned int irq) */ void irq_set_thread_affinity(struct irq_desc 
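A hedged sketch of how architecture code might consume the helpers from the new kernel/irq/ipi.c; the ipi_domain argument, the arch_* names and the omitted handler wiring are assumptions, and only the irq_reserve_ipi()/ipi_send_single()/ipi_send_mask()/irq_destroy_ipi() calls follow the interfaces added above.

#include <linux/cpumask.h>
#include <linux/errno.h>
#include <linux/init.h>
#include <linux/irq.h>
#include <linux/irqdomain.h>

static unsigned int test_ipi_virq;

/* Reserve one IPI that can target every possible CPU. */
int __init arch_reserve_test_ipi(struct irq_domain *ipi_domain)
{
	test_ipi_virq = irq_reserve_ipi(ipi_domain, cpu_possible_mask);
	if (!test_ipi_virq)
		return -ENODEV;
	/* An arch-specific handler would be wired up to test_ipi_virq here. */
	return 0;
}

/* Poke one remote CPU, or a whole set, through the generic helpers. */
void arch_kick_cpu(unsigned int cpu)
{
	ipi_send_single(test_ipi_virq, cpu);
}

void arch_kick_cpus(const struct cpumask *mask)
{
	ipi_send_mask(test_ipi_virq, mask);
}

void arch_release_test_ipi(void)
{
	irq_destroy_ipi(test_ipi_virq);
}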
*desc) { - struct irqaction *action = desc->action; + struct irqaction *action; - while (action) { + for_each_action_of_desc(desc, action) if (action->thread) set_bit(IRQTF_AFFINITY, &action->thread_flags); - action = action->next; - } } #ifdef CONFIG_GENERIC_PENDING_IRQ @@ -994,7 +992,7 @@ void irq_wake_thread(unsigned int irq, void *dev_id) return; raw_spin_lock_irqsave(&desc->lock, flags); - for (action = desc->action; action; action = action->next) { + for_each_action_of_desc(desc, action) { if (action->dev_id == dev_id) { if (action->thread) __irq_wake_thread(desc, action); diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c index a2c02fd5d6d0..4e1b94726818 100644 --- a/kernel/irq/proc.c +++ b/kernel/irq/proc.c @@ -291,7 +291,7 @@ static int name_unique(unsigned int irq, struct irqaction *new_action) int ret = 1; raw_spin_lock_irqsave(&desc->lock, flags); - for (action = desc->action ; action; action = action->next) { + for_each_action_of_desc(desc, action) { if ((action != new_action) && action->name && !strcmp(new_action->name, action->name)) { ret = 0; diff --git a/kernel/irq/spurious.c b/kernel/irq/spurious.c index 32144175458d..5707f97a3e6a 100644 --- a/kernel/irq/spurious.c +++ b/kernel/irq/spurious.c @@ -211,14 +211,12 @@ static void __report_bad_irq(struct irq_desc *desc, irqreturn_t action_ret) * desc->lock here. See synchronize_irq(). */ raw_spin_lock_irqsave(&desc->lock, flags); - action = desc->action; - while (action) { + for_each_action_of_desc(desc, action) { printk(KERN_ERR "[<%p>] %pf", action->handler, action->handler); if (action->thread_fn) printk(KERN_CONT " threaded [<%p>] %pf", action->thread_fn, action->thread_fn); printk(KERN_CONT "\n"); - action = action->next; } raw_spin_unlock_irqrestore(&desc->lock, flags); } diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index d2988d047d66..65ae0e5c35da 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -932,12 +932,14 @@ rcu_torture_writer(void *arg) int nsynctypes = 0; VERBOSE_TOROUT_STRING("rcu_torture_writer task started"); - pr_alert("%s" TORTURE_FLAG - " Grace periods expedited from boot/sysfs for %s,\n", - torture_type, cur_ops->name); - pr_alert("%s" TORTURE_FLAG - " Testing of dynamic grace-period expediting diabled.\n", - torture_type); + if (!can_expedite) { + pr_alert("%s" TORTURE_FLAG + " Grace periods expedited from boot/sysfs for %s,\n", + torture_type, cur_ops->name); + pr_alert("%s" TORTURE_FLAG + " Disabled dynamic grace-period expediting.\n", + torture_type); + } /* Initialize synctype[] array. If none set, take default. */ if (!gp_cond1 && !gp_exp1 && !gp_normal1 && !gp_sync1) diff --git a/kernel/rcu/tiny_plugin.h b/kernel/rcu/tiny_plugin.h index e492a5253e0f..196f0302e2f4 100644 --- a/kernel/rcu/tiny_plugin.h +++ b/kernel/rcu/tiny_plugin.h @@ -23,7 +23,7 @@ */ #include -#include +#include #include #include @@ -122,18 +122,7 @@ free_out: debugfs_remove_recursive(rcudir); return 1; } - -static void __exit rcutiny_trace_cleanup(void) -{ - debugfs_remove_recursive(rcudir); -} - -module_init(rcutiny_trace_init); -module_exit(rcutiny_trace_cleanup); - -MODULE_AUTHOR("Paul E. 
McKenney"); -MODULE_DESCRIPTION("Read-Copy Update tracing for tiny implementation"); -MODULE_LICENSE("GPL"); +device_initcall(rcutiny_trace_init); static void check_cpu_stall(struct rcu_ctrlblk *rcp) { diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 9fd5b628a88d..9a535a86e732 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -108,7 +108,6 @@ RCU_STATE_INITIALIZER(rcu_sched, 's', call_rcu_sched); RCU_STATE_INITIALIZER(rcu_bh, 'b', call_rcu_bh); static struct rcu_state *const rcu_state_p; -static struct rcu_data __percpu *const rcu_data_p; LIST_HEAD(rcu_struct_flavors); /* Dump rcu_node combining tree at boot to verify correct setup. */ @@ -1083,13 +1082,12 @@ static int dyntick_save_progress_counter(struct rcu_data *rdp, rcu_sysidle_check_cpu(rdp, isidle, maxj); if ((rdp->dynticks_snap & 0x1) == 0) { trace_rcu_fqs(rdp->rsp->name, rdp->gpnum, rdp->cpu, TPS("dti")); - return 1; - } else { if (ULONG_CMP_LT(READ_ONCE(rdp->gpnum) + ULONG_MAX / 4, rdp->mynode->gpnum)) WRITE_ONCE(rdp->gpwrap, true); - return 0; + return 1; } + return 0; } /* @@ -1173,15 +1171,16 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp, smp_mb(); /* ->cond_resched_completed before *rcrmp. */ WRITE_ONCE(*rcrmp, READ_ONCE(*rcrmp) + rdp->rsp->flavor_mask); - resched_cpu(rdp->cpu); /* Force CPU into scheduler. */ - rdp->rsp->jiffies_resched += 5; /* Enable beating. */ - } else if (ULONG_CMP_GE(jiffies, rdp->rsp->jiffies_resched)) { - /* Time to beat on that CPU again! */ - resched_cpu(rdp->cpu); /* Force CPU into scheduler. */ - rdp->rsp->jiffies_resched += 5; /* Re-enable beating. */ } + rdp->rsp->jiffies_resched += 5; /* Re-enable beating. */ } + /* And if it has been a really long time, kick the CPU as well. */ + if (ULONG_CMP_GE(jiffies, + rdp->rsp->gp_start + 2 * jiffies_till_sched_qs) || + ULONG_CMP_GE(jiffies, rdp->rsp->gp_start + jiffies_till_sched_qs)) + resched_cpu(rdp->cpu); /* Force CPU into scheduler. */ + return 0; } @@ -1246,7 +1245,7 @@ static void rcu_dump_cpu_stacks(struct rcu_state *rsp) if (rnp->qsmask & (1UL << cpu)) dump_cpu_task(rnp->grplo + cpu); } - raw_spin_unlock_irqrestore(&rnp->lock, flags); + raw_spin_unlock_irqrestore_rcu_node(rnp, flags); } } @@ -1266,12 +1265,12 @@ static void print_other_cpu_stall(struct rcu_state *rsp, unsigned long gpnum) raw_spin_lock_irqsave_rcu_node(rnp, flags); delta = jiffies - READ_ONCE(rsp->jiffies_stall); if (delta < RCU_STALL_RAT_DELAY || !rcu_gp_in_progress(rsp)) { - raw_spin_unlock_irqrestore(&rnp->lock, flags); + raw_spin_unlock_irqrestore_rcu_node(rnp, flags); return; } WRITE_ONCE(rsp->jiffies_stall, jiffies + 3 * rcu_jiffies_till_stall_check() + 3); - raw_spin_unlock_irqrestore(&rnp->lock, flags); + raw_spin_unlock_irqrestore_rcu_node(rnp, flags); /* * OK, time to rat on our buddy... @@ -1292,7 +1291,7 @@ static void print_other_cpu_stall(struct rcu_state *rsp, unsigned long gpnum) ndetected++; } } - raw_spin_unlock_irqrestore(&rnp->lock, flags); + raw_spin_unlock_irqrestore_rcu_node(rnp, flags); } print_cpu_stall_info_end(); @@ -1357,7 +1356,7 @@ static void print_cpu_stall(struct rcu_state *rsp) if (ULONG_CMP_GE(jiffies, READ_ONCE(rsp->jiffies_stall))) WRITE_ONCE(rsp->jiffies_stall, jiffies + 3 * rcu_jiffies_till_stall_check() + 3); - raw_spin_unlock_irqrestore(&rnp->lock, flags); + raw_spin_unlock_irqrestore_rcu_node(rnp, flags); /* * Attempt to revive the RCU machinery by forcing a context switch. 
@@ -1595,7 +1594,7 @@ rcu_start_future_gp(struct rcu_node *rnp, struct rcu_data *rdp, } unlock_out: if (rnp != rnp_root) - raw_spin_unlock(&rnp_root->lock); + raw_spin_unlock_rcu_node(rnp_root); out: if (c_out != NULL) *c_out = c; @@ -1814,7 +1813,7 @@ static void note_gp_changes(struct rcu_state *rsp, struct rcu_data *rdp) return; } needwake = __note_gp_changes(rsp, rnp, rdp); - raw_spin_unlock_irqrestore(&rnp->lock, flags); + raw_spin_unlock_irqrestore_rcu_node(rnp, flags); if (needwake) rcu_gp_kthread_wake(rsp); } @@ -1839,7 +1838,7 @@ static bool rcu_gp_init(struct rcu_state *rsp) raw_spin_lock_irq_rcu_node(rnp); if (!READ_ONCE(rsp->gp_flags)) { /* Spurious wakeup, tell caller to go back to sleep. */ - raw_spin_unlock_irq(&rnp->lock); + raw_spin_unlock_irq_rcu_node(rnp); return false; } WRITE_ONCE(rsp->gp_flags, 0); /* Clear all flags: New grace period. */ @@ -1849,7 +1848,7 @@ static bool rcu_gp_init(struct rcu_state *rsp) * Grace period already in progress, don't start another. * Not supposed to be able to happen. */ - raw_spin_unlock_irq(&rnp->lock); + raw_spin_unlock_irq_rcu_node(rnp); return false; } @@ -1858,7 +1857,7 @@ static bool rcu_gp_init(struct rcu_state *rsp) /* Record GP times before starting GP, hence smp_store_release(). */ smp_store_release(&rsp->gpnum, rsp->gpnum + 1); trace_rcu_grace_period(rsp->name, rsp->gpnum, TPS("start")); - raw_spin_unlock_irq(&rnp->lock); + raw_spin_unlock_irq_rcu_node(rnp); /* * Apply per-leaf buffered online and offline operations to the @@ -1872,7 +1871,7 @@ static bool rcu_gp_init(struct rcu_state *rsp) if (rnp->qsmaskinit == rnp->qsmaskinitnext && !rnp->wait_blkd_tasks) { /* Nothing to do on this leaf rcu_node structure. */ - raw_spin_unlock_irq(&rnp->lock); + raw_spin_unlock_irq_rcu_node(rnp); continue; } @@ -1906,7 +1905,7 @@ static bool rcu_gp_init(struct rcu_state *rsp) rcu_cleanup_dead_rnp(rnp); } - raw_spin_unlock_irq(&rnp->lock); + raw_spin_unlock_irq_rcu_node(rnp); } /* @@ -1937,7 +1936,7 @@ static bool rcu_gp_init(struct rcu_state *rsp) trace_rcu_grace_period_init(rsp->name, rnp->gpnum, rnp->level, rnp->grplo, rnp->grphi, rnp->qsmask); - raw_spin_unlock_irq(&rnp->lock); + raw_spin_unlock_irq_rcu_node(rnp); cond_resched_rcu_qs(); WRITE_ONCE(rsp->gp_activity, jiffies); } @@ -1995,7 +1994,7 @@ static void rcu_gp_fqs(struct rcu_state *rsp, bool first_time) raw_spin_lock_irq_rcu_node(rnp); WRITE_ONCE(rsp->gp_flags, READ_ONCE(rsp->gp_flags) & ~RCU_GP_FLAG_FQS); - raw_spin_unlock_irq(&rnp->lock); + raw_spin_unlock_irq_rcu_node(rnp); } } @@ -2025,7 +2024,7 @@ static void rcu_gp_cleanup(struct rcu_state *rsp) * safe for us to drop the lock in order to mark the grace * period as completed in all of the rcu_node structures. */ - raw_spin_unlock_irq(&rnp->lock); + raw_spin_unlock_irq_rcu_node(rnp); /* * Propagate new ->completed value to rcu_node structures so @@ -2047,7 +2046,7 @@ static void rcu_gp_cleanup(struct rcu_state *rsp) /* smp_mb() provided by prior unlock-lock pair. 
*/ nocb += rcu_future_gp_cleanup(rsp, rnp); sq = rcu_nocb_gp_get(rnp); - raw_spin_unlock_irq(&rnp->lock); + raw_spin_unlock_irq_rcu_node(rnp); rcu_nocb_gp_cleanup(sq); cond_resched_rcu_qs(); WRITE_ONCE(rsp->gp_activity, jiffies); @@ -2070,7 +2069,7 @@ static void rcu_gp_cleanup(struct rcu_state *rsp) READ_ONCE(rsp->gpnum), TPS("newreq")); } - raw_spin_unlock_irq(&rnp->lock); + raw_spin_unlock_irq_rcu_node(rnp); } /* @@ -2236,18 +2235,20 @@ static bool rcu_start_gp(struct rcu_state *rsp) } /* - * Report a full set of quiescent states to the specified rcu_state - * data structure. This involves cleaning up after the prior grace - * period and letting rcu_start_gp() start up the next grace period - * if one is needed. Note that the caller must hold rnp->lock, which - * is released before return. + * Report a full set of quiescent states to the specified rcu_state data + * structure. Invoke rcu_gp_kthread_wake() to awaken the grace-period + * kthread if another grace period is required. Whether we wake + * the grace-period kthread or it awakens itself for the next round + * of quiescent-state forcing, that kthread will clean up after the + * just-completed grace period. Note that the caller must hold rnp->lock, + * which is released before return. */ static void rcu_report_qs_rsp(struct rcu_state *rsp, unsigned long flags) __releases(rcu_get_root(rsp)->lock) { WARN_ON_ONCE(!rcu_gp_in_progress(rsp)); WRITE_ONCE(rsp->gp_flags, READ_ONCE(rsp->gp_flags) | RCU_GP_FLAG_FQS); - raw_spin_unlock_irqrestore(&rcu_get_root(rsp)->lock, flags); + raw_spin_unlock_irqrestore_rcu_node(rcu_get_root(rsp), flags); swake_up(&rsp->gp_wq); /* Memory barrier implied by swake_up() path. */ } @@ -2277,7 +2278,7 @@ rcu_report_qs_rnp(unsigned long mask, struct rcu_state *rsp, * Our bit has already been cleared, or the * relevant grace period is already over, so done. */ - raw_spin_unlock_irqrestore(&rnp->lock, flags); + raw_spin_unlock_irqrestore_rcu_node(rnp, flags); return; } WARN_ON_ONCE(oldmask); /* Any child must be all zeroed! */ @@ -2289,7 +2290,7 @@ rcu_report_qs_rnp(unsigned long mask, struct rcu_state *rsp, if (rnp->qsmask != 0 || rcu_preempt_blocked_readers_cgp(rnp)) { /* Other bits still set at this level, so done. */ - raw_spin_unlock_irqrestore(&rnp->lock, flags); + raw_spin_unlock_irqrestore_rcu_node(rnp, flags); return; } mask = rnp->grpmask; @@ -2299,7 +2300,7 @@ rcu_report_qs_rnp(unsigned long mask, struct rcu_state *rsp, break; } - raw_spin_unlock_irqrestore(&rnp->lock, flags); + raw_spin_unlock_irqrestore_rcu_node(rnp, flags); rnp_c = rnp; rnp = rnp->parent; raw_spin_lock_irqsave_rcu_node(rnp, flags); @@ -2331,7 +2332,7 @@ static void rcu_report_unblock_qs_rnp(struct rcu_state *rsp, if (rcu_state_p == &rcu_sched_state || rsp != rcu_state_p || rnp->qsmask != 0 || rcu_preempt_blocked_readers_cgp(rnp)) { - raw_spin_unlock_irqrestore(&rnp->lock, flags); + raw_spin_unlock_irqrestore_rcu_node(rnp, flags); return; /* Still need more quiescent states! */ } @@ -2348,19 +2349,14 @@ static void rcu_report_unblock_qs_rnp(struct rcu_state *rsp, /* Report up the rest of the hierarchy, tracking current ->gpnum. */ gps = rnp->gpnum; mask = rnp->grpmask; - raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ + raw_spin_unlock_rcu_node(rnp); /* irqs remain disabled. */ raw_spin_lock_rcu_node(rnp_p); /* irqs already disabled. */ rcu_report_qs_rnp(mask, rsp, rnp_p, gps, flags); } /* * Record a quiescent state for the specified CPU to that CPU's rcu_data - * structure. 
This must be either called from the specified CPU, or - * called when the specified CPU is known to be offline (and when it is - * also known that no other CPU is concurrently trying to help the offline - * CPU). The lastcomp argument is used to make sure we are still in the - * grace period of interest. We don't want to end the current grace period - * based on quiescent states detected in an earlier grace period! + * structure. This must be called from the specified CPU. */ static void rcu_report_qs_rdp(int cpu, struct rcu_state *rsp, struct rcu_data *rdp) @@ -2385,14 +2381,14 @@ rcu_report_qs_rdp(int cpu, struct rcu_state *rsp, struct rcu_data *rdp) */ rdp->cpu_no_qs.b.norm = true; /* need qs for new gp. */ rdp->rcu_qs_ctr_snap = __this_cpu_read(rcu_qs_ctr); - raw_spin_unlock_irqrestore(&rnp->lock, flags); + raw_spin_unlock_irqrestore_rcu_node(rnp, flags); return; } mask = rdp->grpmask; if ((rnp->qsmask & mask) == 0) { - raw_spin_unlock_irqrestore(&rnp->lock, flags); + raw_spin_unlock_irqrestore_rcu_node(rnp, flags); } else { - rdp->core_needs_qs = 0; + rdp->core_needs_qs = false; /* * This GP can't end until cpu checks in, so all of our @@ -2601,35 +2597,14 @@ static void rcu_cleanup_dead_rnp(struct rcu_node *rnp_leaf) rnp->qsmaskinit &= ~mask; rnp->qsmask &= ~mask; if (rnp->qsmaskinit) { - raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ + raw_spin_unlock_rcu_node(rnp); + /* irqs remain disabled. */ return; } - raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ + raw_spin_unlock_rcu_node(rnp); /* irqs remain disabled. */ } } -/* - * The CPU is exiting the idle loop into the arch_cpu_idle_dead() - * function. We now remove it from the rcu_node tree's ->qsmaskinit - * bit masks. - */ -static void rcu_cleanup_dying_idle_cpu(int cpu, struct rcu_state *rsp) -{ - unsigned long flags; - unsigned long mask; - struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu); - struct rcu_node *rnp = rdp->mynode; /* Outgoing CPU's rdp & rnp. */ - - if (!IS_ENABLED(CONFIG_HOTPLUG_CPU)) - return; - - /* Remove outgoing CPU from mask in the leaf rcu_node structure. */ - mask = rdp->grpmask; - raw_spin_lock_irqsave_rcu_node(rnp, flags); /* Enforce GP memory-order guarantee. */ - rnp->qsmaskinitnext &= ~mask; - raw_spin_unlock_irqrestore(&rnp->lock, flags); -} - /* * The CPU has been completely removed, and some other CPU is reporting * this fact from process context. Do the remainder of the cleanup, @@ -2861,7 +2836,7 @@ static void force_qs_rnp(struct rcu_state *rsp, rcu_report_qs_rnp(mask, rsp, rnp, rnp->gpnum, flags); } else { /* Nothing to do here, so just drop the lock. */ - raw_spin_unlock_irqrestore(&rnp->lock, flags); + raw_spin_unlock_irqrestore_rcu_node(rnp, flags); } } } @@ -2897,11 +2872,11 @@ static void force_quiescent_state(struct rcu_state *rsp) raw_spin_unlock(&rnp_old->fqslock); if (READ_ONCE(rsp->gp_flags) & RCU_GP_FLAG_FQS) { rsp->n_force_qs_lh++; - raw_spin_unlock_irqrestore(&rnp_old->lock, flags); + raw_spin_unlock_irqrestore_rcu_node(rnp_old, flags); return; /* Someone beat us to it. */ } WRITE_ONCE(rsp->gp_flags, READ_ONCE(rsp->gp_flags) | RCU_GP_FLAG_FQS); - raw_spin_unlock_irqrestore(&rnp_old->lock, flags); + raw_spin_unlock_irqrestore_rcu_node(rnp_old, flags); swake_up(&rsp->gp_wq); /* Memory barrier implied by swake_up() path. */ } @@ -2927,7 +2902,7 @@ __rcu_process_callbacks(struct rcu_state *rsp) if (cpu_needs_another_gp(rsp, rdp)) { raw_spin_lock_rcu_node(rcu_get_root(rsp)); /* irqs disabled. 
*/ needwake = rcu_start_gp(rsp); - raw_spin_unlock_irqrestore(&rcu_get_root(rsp)->lock, flags); + raw_spin_unlock_irqrestore_rcu_node(rcu_get_root(rsp), flags); if (needwake) rcu_gp_kthread_wake(rsp); } else { @@ -3018,7 +2993,7 @@ static void __call_rcu_core(struct rcu_state *rsp, struct rcu_data *rdp, raw_spin_lock_rcu_node(rnp_root); needwake = rcu_start_gp(rsp); - raw_spin_unlock(&rnp_root->lock); + raw_spin_unlock_rcu_node(rnp_root); if (needwake) rcu_gp_kthread_wake(rsp); } else { @@ -3438,14 +3413,14 @@ static void sync_exp_reset_tree_hotplug(struct rcu_state *rsp) rcu_for_each_leaf_node(rsp, rnp) { raw_spin_lock_irqsave_rcu_node(rnp, flags); if (rnp->expmaskinit == rnp->expmaskinitnext) { - raw_spin_unlock_irqrestore(&rnp->lock, flags); + raw_spin_unlock_irqrestore_rcu_node(rnp, flags); continue; /* No new CPUs, nothing to do. */ } /* Update this node's mask, track old value for propagation. */ oldmask = rnp->expmaskinit; rnp->expmaskinit = rnp->expmaskinitnext; - raw_spin_unlock_irqrestore(&rnp->lock, flags); + raw_spin_unlock_irqrestore_rcu_node(rnp, flags); /* If was already nonzero, nothing to propagate. */ if (oldmask) @@ -3460,7 +3435,7 @@ static void sync_exp_reset_tree_hotplug(struct rcu_state *rsp) if (rnp_up->expmaskinit) done = true; rnp_up->expmaskinit |= mask; - raw_spin_unlock_irqrestore(&rnp_up->lock, flags); + raw_spin_unlock_irqrestore_rcu_node(rnp_up, flags); if (done) break; mask = rnp_up->grpmask; @@ -3483,7 +3458,7 @@ static void __maybe_unused sync_exp_reset_tree(struct rcu_state *rsp) raw_spin_lock_irqsave_rcu_node(rnp, flags); WARN_ON_ONCE(rnp->expmask); rnp->expmask = rnp->expmaskinit; - raw_spin_unlock_irqrestore(&rnp->lock, flags); + raw_spin_unlock_irqrestore_rcu_node(rnp, flags); } } @@ -3524,11 +3499,11 @@ static void __rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp, if (!rnp->expmask) rcu_initiate_boost(rnp, flags); else - raw_spin_unlock_irqrestore(&rnp->lock, flags); + raw_spin_unlock_irqrestore_rcu_node(rnp, flags); break; } if (rnp->parent == NULL) { - raw_spin_unlock_irqrestore(&rnp->lock, flags); + raw_spin_unlock_irqrestore_rcu_node(rnp, flags); if (wake) { smp_mb(); /* EGP done before wake_up(). */ swake_up(&rsp->expedited_wq); @@ -3536,7 +3511,7 @@ static void __rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp, break; } mask = rnp->grpmask; - raw_spin_unlock(&rnp->lock); /* irqs remain disabled */ + raw_spin_unlock_rcu_node(rnp); /* irqs remain disabled */ rnp = rnp->parent; raw_spin_lock_rcu_node(rnp); /* irqs already disabled */ WARN_ON_ONCE(!(rnp->expmask & mask)); @@ -3571,7 +3546,7 @@ static void rcu_report_exp_cpu_mult(struct rcu_state *rsp, struct rcu_node *rnp, raw_spin_lock_irqsave_rcu_node(rnp, flags); if (!(rnp->expmask & mask)) { - raw_spin_unlock_irqrestore(&rnp->lock, flags); + raw_spin_unlock_irqrestore_rcu_node(rnp, flags); return; } rnp->expmask &= ~mask; @@ -3732,7 +3707,7 @@ static void sync_rcu_exp_select_cpus(struct rcu_state *rsp, */ if (rcu_preempt_has_tasks(rnp)) rnp->exp_tasks = rnp->blkd_tasks.next; - raw_spin_unlock_irqrestore(&rnp->lock, flags); + raw_spin_unlock_irqrestore_rcu_node(rnp, flags); /* IPI the remaining CPUs for expedited quiescent state. 
*/ mask = 1; @@ -3749,7 +3724,7 @@ retry_ipi: raw_spin_lock_irqsave_rcu_node(rnp, flags); if (cpu_online(cpu) && (rnp->expmask & mask)) { - raw_spin_unlock_irqrestore(&rnp->lock, flags); + raw_spin_unlock_irqrestore_rcu_node(rnp, flags); schedule_timeout_uninterruptible(1); if (cpu_online(cpu) && (rnp->expmask & mask)) @@ -3758,7 +3733,7 @@ retry_ipi: } if (!(rnp->expmask & mask)) mask_ofl_ipi &= ~mask; - raw_spin_unlock_irqrestore(&rnp->lock, flags); + raw_spin_unlock_irqrestore_rcu_node(rnp, flags); } /* Report quiescent states for those that went offline. */ mask_ofl_test |= mask_ofl_ipi; @@ -4165,7 +4140,7 @@ static void rcu_init_new_rnp(struct rcu_node *rnp_leaf) return; raw_spin_lock_rcu_node(rnp); /* Interrupts already disabled. */ rnp->qsmaskinit |= mask; - raw_spin_unlock(&rnp->lock); /* Interrupts remain disabled. */ + raw_spin_unlock_rcu_node(rnp); /* Interrupts remain disabled. */ } } @@ -4189,7 +4164,7 @@ rcu_boot_init_percpu_data(int cpu, struct rcu_state *rsp) rdp->rsp = rsp; mutex_init(&rdp->exp_funnel_mutex); rcu_boot_init_nocb_percpu_data(rdp); - raw_spin_unlock_irqrestore(&rnp->lock, flags); + raw_spin_unlock_irqrestore_rcu_node(rnp, flags); } /* @@ -4217,7 +4192,7 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp) rcu_sysidle_init_percpu_data(rdp->dynticks); atomic_set(&rdp->dynticks->dynticks, (atomic_read(&rdp->dynticks->dynticks) & ~0x1) + 1); - raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ + raw_spin_unlock_rcu_node(rnp); /* irqs remain disabled. */ /* * Add CPU to leaf rcu_node pending-online bitmask. Any needed @@ -4238,7 +4213,7 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp) rdp->rcu_qs_ctr_snap = per_cpu(rcu_qs_ctr, cpu); rdp->core_needs_qs = false; trace_rcu_grace_period(rsp->name, rdp->gpnum, TPS("cpuonl")); - raw_spin_unlock_irqrestore(&rnp->lock, flags); + raw_spin_unlock_irqrestore_rcu_node(rnp, flags); } static void rcu_prepare_cpu(int cpu) @@ -4249,6 +4224,46 @@ static void rcu_prepare_cpu(int cpu) rcu_init_percpu_data(cpu, rsp); } +#ifdef CONFIG_HOTPLUG_CPU +/* + * The CPU is exiting the idle loop into the arch_cpu_idle_dead() + * function. We now remove it from the rcu_node tree's ->qsmaskinit + * bit masks. + * The CPU is exiting the idle loop into the arch_cpu_idle_dead() + * function. We now remove it from the rcu_node tree's ->qsmaskinit + * bit masks. + */ +static void rcu_cleanup_dying_idle_cpu(int cpu, struct rcu_state *rsp) +{ + unsigned long flags; + unsigned long mask; + struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu); + struct rcu_node *rnp = rdp->mynode; /* Outgoing CPU's rdp & rnp. */ + + if (!IS_ENABLED(CONFIG_HOTPLUG_CPU)) + return; + + /* Remove outgoing CPU from mask in the leaf rcu_node structure. */ + mask = rdp->grpmask; + raw_spin_lock_irqsave_rcu_node(rnp, flags); /* Enforce GP memory-order guarantee. */ + rnp->qsmaskinitnext &= ~mask; + raw_spin_unlock_irqrestore_rcu_node(rnp, flags); +} + +void rcu_report_dead(unsigned int cpu) +{ + struct rcu_state *rsp; + + /* QS for any half-done expedited RCU-sched GP. */ + preempt_disable(); + rcu_report_exp_rdp(&rcu_sched_state, + this_cpu_ptr(rcu_sched_state.rda), true); + preempt_enable(); + for_each_rcu_flavor(rsp) + rcu_cleanup_dying_idle_cpu(cpu, rsp); +} +#endif + /* * Handle CPU online/offline notification events. */ @@ -4280,17 +4295,6 @@ int rcu_cpu_notify(struct notifier_block *self, for_each_rcu_flavor(rsp) rcu_cleanup_dying_cpu(rsp); break; - case CPU_DYING_IDLE: - /* QS for any half-done expedited RCU-sched GP. 
*/ - preempt_disable(); - rcu_report_exp_rdp(&rcu_sched_state, - this_cpu_ptr(rcu_sched_state.rda), true); - preempt_enable(); - - for_each_rcu_flavor(rsp) { - rcu_cleanup_dying_idle_cpu(cpu, rsp); - } - break; case CPU_DEAD: case CPU_DEAD_FROZEN: case CPU_UP_CANCELED: @@ -4360,7 +4364,7 @@ static int __init rcu_spawn_gp_kthread(void) sp.sched_priority = kthread_prio; sched_setscheduler_nocheck(t, SCHED_FIFO, &sp); } - raw_spin_unlock_irqrestore(&rnp->lock, flags); + raw_spin_unlock_irqrestore_rcu_node(rnp, flags); wake_up_process(t); } rcu_spawn_nocb_kthreads(); @@ -4451,8 +4455,8 @@ static void __init rcu_init_one(struct rcu_state *rsp) cpustride *= levelspread[i]; rnp = rsp->level[i]; for (j = 0; j < levelcnt[i]; j++, rnp++) { - raw_spin_lock_init(&rnp->lock); - lockdep_set_class_and_name(&rnp->lock, + raw_spin_lock_init(&ACCESS_PRIVATE(rnp, lock)); + lockdep_set_class_and_name(&ACCESS_PRIVATE(rnp, lock), &rcu_node_class[i], buf[i]); raw_spin_lock_init(&rnp->fqslock); lockdep_set_class_and_name(&rnp->fqslock, diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index bbd235d0e71f..df668c0f9e64 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -150,8 +150,9 @@ struct rcu_dynticks { * Definition for node within the RCU grace-period-detection hierarchy. */ struct rcu_node { - raw_spinlock_t lock; /* Root rcu_node's lock protects some */ - /* rcu_state fields as well as following. */ + raw_spinlock_t __private lock; /* Root rcu_node's lock protects */ + /* some rcu_state fields as well as */ + /* following. */ unsigned long gpnum; /* Current grace period for this node. */ /* This will either be equal to or one */ /* behind the root rcu_node's gpnum. */ @@ -682,7 +683,7 @@ static inline void rcu_nocb_q_lengths(struct rcu_data *rdp, long *ql, long *qll) #endif /* #else #ifdef CONFIG_PPC */ /* - * Wrappers for the rcu_node::lock acquire. + * Wrappers for the rcu_node::lock acquire and release. * * Because the rcu_nodes form a tree, the tree traversal locking will observe * different lock values, this in turn means that an UNLOCK of one level @@ -691,29 +692,48 @@ static inline void rcu_nocb_q_lengths(struct rcu_data *rdp, long *ql, long *qll) * * In order to restore full ordering between tree levels, augment the regular * lock acquire functions with smp_mb__after_unlock_lock(). + * + * As ->lock of struct rcu_node is a __private field, therefore one should use + * these wrappers rather than directly call raw_spin_{lock,unlock}* on ->lock. 
*/ static inline void raw_spin_lock_rcu_node(struct rcu_node *rnp) { - raw_spin_lock(&rnp->lock); + raw_spin_lock(&ACCESS_PRIVATE(rnp, lock)); smp_mb__after_unlock_lock(); } +static inline void raw_spin_unlock_rcu_node(struct rcu_node *rnp) +{ + raw_spin_unlock(&ACCESS_PRIVATE(rnp, lock)); +} + static inline void raw_spin_lock_irq_rcu_node(struct rcu_node *rnp) { - raw_spin_lock_irq(&rnp->lock); + raw_spin_lock_irq(&ACCESS_PRIVATE(rnp, lock)); smp_mb__after_unlock_lock(); } -#define raw_spin_lock_irqsave_rcu_node(rnp, flags) \ -do { \ - typecheck(unsigned long, flags); \ - raw_spin_lock_irqsave(&(rnp)->lock, flags); \ - smp_mb__after_unlock_lock(); \ +static inline void raw_spin_unlock_irq_rcu_node(struct rcu_node *rnp) +{ + raw_spin_unlock_irq(&ACCESS_PRIVATE(rnp, lock)); +} + +#define raw_spin_lock_irqsave_rcu_node(rnp, flags) \ +do { \ + typecheck(unsigned long, flags); \ + raw_spin_lock_irqsave(&ACCESS_PRIVATE(rnp, lock), flags); \ + smp_mb__after_unlock_lock(); \ +} while (0) + +#define raw_spin_unlock_irqrestore_rcu_node(rnp, flags) \ +do { \ + typecheck(unsigned long, flags); \ + raw_spin_unlock_irqrestore(&ACCESS_PRIVATE(rnp, lock), flags); \ } while (0) static inline bool raw_spin_trylock_rcu_node(struct rcu_node *rnp) { - bool locked = raw_spin_trylock(&rnp->lock); + bool locked = raw_spin_trylock(&ACCESS_PRIVATE(rnp, lock)); if (locked) smp_mb__after_unlock_lock(); diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 080bd202d360..efdf7b61ce12 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -235,7 +235,7 @@ static void rcu_preempt_ctxt_queue(struct rcu_node *rnp, struct rcu_data *rdp) rnp->gp_tasks = &t->rcu_node_entry; if (!rnp->exp_tasks && (blkd_state & RCU_EXP_BLKD)) rnp->exp_tasks = &t->rcu_node_entry; - raw_spin_unlock(&rnp->lock); /* rrupts remain disabled. */ + raw_spin_unlock_rcu_node(rnp); /* interrupts remain disabled. */ /* * Report the quiescent state for the expedited GP. This expedited @@ -489,7 +489,7 @@ void rcu_read_unlock_special(struct task_struct *t) !!rnp->gp_tasks); rcu_report_unblock_qs_rnp(rcu_state_p, rnp, flags); } else { - raw_spin_unlock_irqrestore(&rnp->lock, flags); + raw_spin_unlock_irqrestore_rcu_node(rnp, flags); } /* Unboost if we were boosted. */ @@ -518,14 +518,14 @@ static void rcu_print_detail_task_stall_rnp(struct rcu_node *rnp) raw_spin_lock_irqsave_rcu_node(rnp, flags); if (!rcu_preempt_blocked_readers_cgp(rnp)) { - raw_spin_unlock_irqrestore(&rnp->lock, flags); + raw_spin_unlock_irqrestore_rcu_node(rnp, flags); return; } t = list_entry(rnp->gp_tasks->prev, struct task_struct, rcu_node_entry); list_for_each_entry_continue(t, &rnp->blkd_tasks, rcu_node_entry) sched_show_task(t); - raw_spin_unlock_irqrestore(&rnp->lock, flags); + raw_spin_unlock_irqrestore_rcu_node(rnp, flags); } /* @@ -807,7 +807,6 @@ void exit_rcu(void) #else /* #ifdef CONFIG_PREEMPT_RCU */ static struct rcu_state *const rcu_state_p = &rcu_sched_state; -static struct rcu_data __percpu *const rcu_data_p = &rcu_sched_data; /* * Tell them what RCU they are running. @@ -991,7 +990,7 @@ static int rcu_boost(struct rcu_node *rnp) * might exit their RCU read-side critical sections on their own. 
*/ if (rnp->exp_tasks == NULL && rnp->boost_tasks == NULL) { - raw_spin_unlock_irqrestore(&rnp->lock, flags); + raw_spin_unlock_irqrestore_rcu_node(rnp, flags); return 0; } @@ -1028,7 +1027,7 @@ static int rcu_boost(struct rcu_node *rnp) */ t = container_of(tb, struct task_struct, rcu_node_entry); rt_mutex_init_proxy_locked(&rnp->boost_mtx, t); - raw_spin_unlock_irqrestore(&rnp->lock, flags); + raw_spin_unlock_irqrestore_rcu_node(rnp, flags); /* Lock only for side effect: boosts task t's priority. */ rt_mutex_lock(&rnp->boost_mtx); rt_mutex_unlock(&rnp->boost_mtx); /* Then keep lockdep happy. */ @@ -1088,7 +1087,7 @@ static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags) if (!rcu_preempt_blocked_readers_cgp(rnp) && rnp->exp_tasks == NULL) { rnp->n_balk_exp_gp_tasks++; - raw_spin_unlock_irqrestore(&rnp->lock, flags); + raw_spin_unlock_irqrestore_rcu_node(rnp, flags); return; } if (rnp->exp_tasks != NULL || @@ -1098,13 +1097,13 @@ static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags) ULONG_CMP_GE(jiffies, rnp->boost_time))) { if (rnp->exp_tasks == NULL) rnp->boost_tasks = rnp->gp_tasks; - raw_spin_unlock_irqrestore(&rnp->lock, flags); + raw_spin_unlock_irqrestore_rcu_node(rnp, flags); t = rnp->boost_kthread_task; if (t) rcu_wake_cond(t, rnp->boost_kthread_status); } else { rcu_initiate_boost_trace(rnp); - raw_spin_unlock_irqrestore(&rnp->lock, flags); + raw_spin_unlock_irqrestore_rcu_node(rnp, flags); } } @@ -1172,7 +1171,7 @@ static int rcu_spawn_one_boost_kthread(struct rcu_state *rsp, return PTR_ERR(t); raw_spin_lock_irqsave_rcu_node(rnp, flags); rnp->boost_kthread_task = t; - raw_spin_unlock_irqrestore(&rnp->lock, flags); + raw_spin_unlock_irqrestore_rcu_node(rnp, flags); sp.sched_priority = kthread_prio; sched_setscheduler_nocheck(t, SCHED_FIFO, &sp); wake_up_process(t); /* get to TASK_INTERRUPTIBLE quickly. */ @@ -1308,7 +1307,7 @@ static void rcu_prepare_kthreads(int cpu) static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags) __releases(rnp->lock) { - raw_spin_unlock_irqrestore(&rnp->lock, flags); + raw_spin_unlock_irqrestore_rcu_node(rnp, flags); } static void invoke_rcu_callbacks_kthread(void) @@ -1559,7 +1558,7 @@ static void rcu_prepare_for_idle(void) rnp = rdp->mynode; raw_spin_lock_rcu_node(rnp); /* irqs already disabled. */ needwake = rcu_accelerate_cbs(rsp, rnp, rdp); - raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ + raw_spin_unlock_rcu_node(rnp); /* irqs remain disabled. */ if (needwake) rcu_gp_kthread_wake(rsp); } @@ -2064,7 +2063,7 @@ static void rcu_nocb_wait_gp(struct rcu_data *rdp) raw_spin_lock_irqsave_rcu_node(rnp, flags); needwake = rcu_start_future_gp(rnp, rdp, &c); - raw_spin_unlock_irqrestore(&rnp->lock, flags); + raw_spin_unlock_irqrestore_rcu_node(rnp, flags); if (needwake) rcu_gp_kthread_wake(rdp->rsp); diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c index 76b94e19430b..ca828b41c938 100644 --- a/kernel/rcu/update.c +++ b/kernel/rcu/update.c @@ -128,6 +128,7 @@ bool rcu_gp_is_normal(void) { return READ_ONCE(rcu_normal); } +EXPORT_SYMBOL_GPL(rcu_gp_is_normal); static atomic_t rcu_expedited_nesting = ATOMIC_INIT(IS_ENABLED(CONFIG_RCU_EXPEDITE_BOOT) ? 
1 : 0); diff --git a/kernel/sched/core.c b/kernel/sched/core.c index e5725b931bee..ea8f49ae0062 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -5434,16 +5434,6 @@ static int sched_cpu_active(struct notifier_block *nfb, set_cpu_rq_start_time(); return NOTIFY_OK; - case CPU_ONLINE: - /* - * At this point a starting CPU has marked itself as online via - * set_cpu_online(). But it might not yet have marked itself - * as active, which is essential from here on. - */ - set_cpu_active(cpu, true); - stop_machine_unpark(cpu); - return NOTIFY_OK; - case CPU_DOWN_FAILED: set_cpu_active(cpu, true); return NOTIFY_OK; diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c index 544a7133cbd1..bd12c6c714ec 100644 --- a/kernel/sched/idle.c +++ b/kernel/sched/idle.c @@ -4,6 +4,7 @@ #include #include #include +#include #include #include #include @@ -193,8 +194,6 @@ exit_idle: rcu_idle_exit(); } -DEFINE_PER_CPU(bool, cpu_dead_idle); - /* * Generic idle loop implementation * @@ -221,10 +220,7 @@ static void cpu_idle_loop(void) rmb(); if (cpu_is_offline(smp_processor_id())) { - rcu_cpu_notify(NULL, CPU_DYING_IDLE, - (void *)(long)smp_processor_id()); - smp_mb(); /* all activity before dead. */ - this_cpu_write(cpu_dead_idle, true); + cpuhp_report_idle_dead(); arch_cpu_idle_dead(); } @@ -291,5 +287,6 @@ void cpu_startup_entry(enum cpuhp_state state) boot_init_stack_canary(); #endif arch_cpu_idle_prepare(); + cpuhp_online_idle(state); cpu_idle_loop(); } diff --git a/kernel/smp.c b/kernel/smp.c index 300d29391e07..74165443c240 100644 --- a/kernel/smp.c +++ b/kernel/smp.c @@ -568,6 +568,7 @@ void __init smp_init(void) unsigned int cpu; idle_threads_init(); + cpuhp_threads_init(); /* FIXME: This should be done in userspace --RR */ for_each_present_cpu(cpu) { diff --git a/kernel/smpboot.c b/kernel/smpboot.c index d264f59bff56..13bc43d1fb22 100644 --- a/kernel/smpboot.c +++ b/kernel/smpboot.c @@ -226,7 +226,7 @@ static void smpboot_unpark_thread(struct smp_hotplug_thread *ht, unsigned int cp kthread_unpark(tsk); } -void smpboot_unpark_threads(unsigned int cpu) +int smpboot_unpark_threads(unsigned int cpu) { struct smp_hotplug_thread *cur; @@ -235,6 +235,7 @@ void smpboot_unpark_threads(unsigned int cpu) if (cpumask_test_cpu(cpu, cur->cpumask)) smpboot_unpark_thread(cur, cpu); mutex_unlock(&smpboot_threads_lock); + return 0; } static void smpboot_park_thread(struct smp_hotplug_thread *ht, unsigned int cpu) @@ -245,7 +246,7 @@ static void smpboot_park_thread(struct smp_hotplug_thread *ht, unsigned int cpu) kthread_park(tsk); } -void smpboot_park_threads(unsigned int cpu) +int smpboot_park_threads(unsigned int cpu) { struct smp_hotplug_thread *cur; @@ -253,6 +254,7 @@ void smpboot_park_threads(unsigned int cpu) list_for_each_entry_reverse(cur, &hotplug_threads, list) smpboot_park_thread(cur, cpu); mutex_unlock(&smpboot_threads_lock); + return 0; } static void smpboot_destroy_threads(struct smp_hotplug_thread *ht) diff --git a/kernel/smpboot.h b/kernel/smpboot.h index 72415a0eb955..485b81cfab34 100644 --- a/kernel/smpboot.h +++ b/kernel/smpboot.h @@ -14,7 +14,9 @@ static inline void idle_threads_init(void) { } #endif int smpboot_create_threads(unsigned int cpu); -void smpboot_park_threads(unsigned int cpu); -void smpboot_unpark_threads(unsigned int cpu); +int smpboot_park_threads(unsigned int cpu); +int smpboot_unpark_threads(unsigned int cpu); + +void __init cpuhp_threads_init(void); #endif diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c index 664de539299b..56ece145a814 100644 --- 
a/kernel/time/clocksource.c +++ b/kernel/time/clocksource.c @@ -323,13 +323,42 @@ static void clocksource_enqueue_watchdog(struct clocksource *cs) /* cs is a watchdog. */ if (cs->flags & CLOCK_SOURCE_IS_CONTINUOUS) cs->flags |= CLOCK_SOURCE_VALID_FOR_HRES; - /* Pick the best watchdog. */ - if (!watchdog || cs->rating > watchdog->rating) { - watchdog = cs; - /* Reset watchdog cycles */ - clocksource_reset_watchdog(); - } } + spin_unlock_irqrestore(&watchdog_lock, flags); +} + +static void clocksource_select_watchdog(bool fallback) +{ + struct clocksource *cs, *old_wd; + unsigned long flags; + + spin_lock_irqsave(&watchdog_lock, flags); + /* save current watchdog */ + old_wd = watchdog; + if (fallback) + watchdog = NULL; + + list_for_each_entry(cs, &clocksource_list, list) { + /* cs is a clocksource to be watched. */ + if (cs->flags & CLOCK_SOURCE_MUST_VERIFY) + continue; + + /* Skip current if we were requested for a fallback. */ + if (fallback && cs == old_wd) + continue; + + /* Pick the best watchdog. */ + if (!watchdog || cs->rating > watchdog->rating) + watchdog = cs; + } + /* If we failed to find a fallback restore the old one. */ + if (!watchdog) + watchdog = old_wd; + + /* If we changed the watchdog we need to reset cycles. */ + if (watchdog != old_wd) + clocksource_reset_watchdog(); + /* Check if the watchdog timer needs to be started. */ clocksource_start_watchdog(); spin_unlock_irqrestore(&watchdog_lock, flags); @@ -404,6 +433,7 @@ static void clocksource_enqueue_watchdog(struct clocksource *cs) cs->flags |= CLOCK_SOURCE_VALID_FOR_HRES; } +static void clocksource_select_watchdog(bool fallback) { } static inline void clocksource_dequeue_watchdog(struct clocksource *cs) { } static inline void clocksource_resume_watchdog(void) { } static inline int __clocksource_watchdog_kthread(void) { return 0; } @@ -736,6 +766,7 @@ int __clocksource_register_scale(struct clocksource *cs, u32 scale, u32 freq) clocksource_enqueue(cs); clocksource_enqueue_watchdog(cs); clocksource_select(); + clocksource_select_watchdog(false); mutex_unlock(&clocksource_mutex); return 0; } @@ -758,6 +789,7 @@ void clocksource_change_rating(struct clocksource *cs, int rating) mutex_lock(&clocksource_mutex); __clocksource_change_rating(cs, rating); clocksource_select(); + clocksource_select_watchdog(false); mutex_unlock(&clocksource_mutex); } EXPORT_SYMBOL(clocksource_change_rating); @@ -767,12 +799,12 @@ EXPORT_SYMBOL(clocksource_change_rating); */ static int clocksource_unbind(struct clocksource *cs) { - /* - * I really can't convince myself to support this on hardware - * designed by lobotomized monkeys. - */ - if (clocksource_is_watchdog(cs)) - return -EBUSY; + if (clocksource_is_watchdog(cs)) { + /* Select and try to install a replacement watchdog. 
*/ + clocksource_select_watchdog(true); + if (clocksource_is_watchdog(cs)) + return -EBUSY; + } if (cs == curr_clocksource) { /* Select and try to install a replacement clock source */ diff --git a/kernel/time/jiffies.c b/kernel/time/jiffies.c index 347fecf86a3f..555e21f7b966 100644 --- a/kernel/time/jiffies.c +++ b/kernel/time/jiffies.c @@ -68,7 +68,7 @@ static struct clocksource clocksource_jiffies = { .name = "jiffies", .rating = 1, /* lowest valid rating*/ .read = jiffies_read, - .mask = 0xffffffff, /*32bits*/ + .mask = CLOCKSOURCE_MASK(32), .mult = NSEC_PER_JIFFY << JIFFIES_SHIFT, /* details above */ .shift = JIFFIES_SHIFT, .max_cycles = 10, diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 34b4cedfa80d..9c629bbed572 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -233,6 +233,7 @@ static void tk_setup_internals(struct timekeeper *tk, struct clocksource *clock) u64 tmp, ntpinterval; struct clocksource *old_clock; + ++tk->cs_was_changed_seq; old_clock = tk->tkr_mono.clock; tk->tkr_mono.clock = clock; tk->tkr_mono.read = clock->read; @@ -298,19 +299,36 @@ u32 (*arch_gettimeoffset)(void) = default_arch_gettimeoffset; static inline u32 arch_gettimeoffset(void) { return 0; } #endif -static inline s64 timekeeping_get_ns(struct tk_read_base *tkr) +static inline s64 timekeeping_delta_to_ns(struct tk_read_base *tkr, + cycle_t delta) { - cycle_t delta; s64 nsec; - delta = timekeeping_get_delta(tkr); - - nsec = (delta * tkr->mult + tkr->xtime_nsec) >> tkr->shift; + nsec = delta * tkr->mult + tkr->xtime_nsec; + nsec >>= tkr->shift; /* If arch requires, add in get_arch_timeoffset() */ return nsec + arch_gettimeoffset(); } +static inline s64 timekeeping_get_ns(struct tk_read_base *tkr) +{ + cycle_t delta; + + delta = timekeeping_get_delta(tkr); + return timekeeping_delta_to_ns(tkr, delta); +} + +static inline s64 timekeeping_cycles_to_ns(struct tk_read_base *tkr, + cycle_t cycles) +{ + cycle_t delta; + + /* calculate the delta since the last update_wall_time */ + delta = clocksource_delta(cycles, tkr->cycle_last, tkr->mask); + return timekeeping_delta_to_ns(tkr, delta); +} + /** * update_fast_timekeeper - Update the fast and NMI safe monotonic timekeeper. * @tkr: Timekeeping readout base from which we take the update @@ -857,44 +875,262 @@ time64_t __ktime_get_real_seconds(void) return tk->xtime_sec; } - -#ifdef CONFIG_NTP_PPS - /** - * ktime_get_raw_and_real_ts64 - get day and raw monotonic time in timespec format - * @ts_raw: pointer to the timespec to be set to raw monotonic time - * @ts_real: pointer to the timespec to be set to the time of day - * - * This function reads both the time of day and raw monotonic time at the - * same time atomically and stores the resulting timestamps in timespec - * format. 
+ * ktime_get_snapshot - snapshots the realtime/monotonic raw clocks with counter + * @systime_snapshot: pointer to struct receiving the system time snapshot */ -void ktime_get_raw_and_real_ts64(struct timespec64 *ts_raw, struct timespec64 *ts_real) +void ktime_get_snapshot(struct system_time_snapshot *systime_snapshot) { struct timekeeper *tk = &tk_core.timekeeper; unsigned long seq; - s64 nsecs_raw, nsecs_real; + ktime_t base_raw; + ktime_t base_real; + s64 nsec_raw; + s64 nsec_real; + cycle_t now; WARN_ON_ONCE(timekeeping_suspended); do { seq = read_seqcount_begin(&tk_core.seq); - *ts_raw = tk->raw_time; - ts_real->tv_sec = tk->xtime_sec; - ts_real->tv_nsec = 0; - - nsecs_raw = timekeeping_get_ns(&tk->tkr_raw); - nsecs_real = timekeeping_get_ns(&tk->tkr_mono); - + now = tk->tkr_mono.read(tk->tkr_mono.clock); + systime_snapshot->cs_was_changed_seq = tk->cs_was_changed_seq; + systime_snapshot->clock_was_set_seq = tk->clock_was_set_seq; + base_real = ktime_add(tk->tkr_mono.base, + tk_core.timekeeper.offs_real); + base_raw = tk->tkr_raw.base; + nsec_real = timekeeping_cycles_to_ns(&tk->tkr_mono, now); + nsec_raw = timekeeping_cycles_to_ns(&tk->tkr_raw, now); } while (read_seqcount_retry(&tk_core.seq, seq)); - timespec64_add_ns(ts_raw, nsecs_raw); - timespec64_add_ns(ts_real, nsecs_real); + systime_snapshot->cycles = now; + systime_snapshot->real = ktime_add_ns(base_real, nsec_real); + systime_snapshot->raw = ktime_add_ns(base_raw, nsec_raw); } -EXPORT_SYMBOL(ktime_get_raw_and_real_ts64); +EXPORT_SYMBOL_GPL(ktime_get_snapshot); -#endif /* CONFIG_NTP_PPS */ +/* Scale base by mult/div checking for overflow */ +static int scale64_check_overflow(u64 mult, u64 div, u64 *base) +{ + u64 tmp, rem; + + tmp = div64_u64_rem(*base, div, &rem); + + if (((int)sizeof(u64)*8 - fls64(mult) < fls64(tmp)) || + ((int)sizeof(u64)*8 - fls64(mult) < fls64(rem))) + return -EOVERFLOW; + tmp *= mult; + rem *= mult; + + do_div(rem, div); + *base = tmp + rem; + return 0; +} + +/** + * adjust_historical_crosststamp - adjust crosstimestamp previous to current interval + * @history: Snapshot representing start of history + * @partial_history_cycles: Cycle offset into history (fractional part) + * @total_history_cycles: Total history length in cycles + * @discontinuity: True indicates clock was set on history period + * @ts: Cross timestamp that should be adjusted using + * partial/total ratio + * + * Helper function used by get_device_system_crosststamp() to correct the + * crosstimestamp corresponding to the start of the current interval to the + * system counter value (timestamp point) provided by the driver. The + * total_history_* quantities are the total history starting at the provided + * reference point and ending at the start of the current interval. The cycle + * count between the driver timestamp point and the start of the current + * interval is partial_history_cycles. + */ +static int adjust_historical_crosststamp(struct system_time_snapshot *history, + cycle_t partial_history_cycles, + cycle_t total_history_cycles, + bool discontinuity, + struct system_device_crosststamp *ts) +{ + struct timekeeper *tk = &tk_core.timekeeper; + u64 corr_raw, corr_real; + bool interp_forward; + int ret; + + if (total_history_cycles == 0 || partial_history_cycles == 0) + return 0; + + /* Interpolate shortest distance from beginning or end of history */ + interp_forward = partial_history_cycles > total_history_cycles/2 ? + true : false; + partial_history_cycles = interp_forward ? 
+ total_history_cycles - partial_history_cycles : + partial_history_cycles; + + /* + * Scale the monotonic raw time delta by: + * partial_history_cycles / total_history_cycles + */ + corr_raw = (u64)ktime_to_ns( + ktime_sub(ts->sys_monoraw, history->raw)); + ret = scale64_check_overflow(partial_history_cycles, + total_history_cycles, &corr_raw); + if (ret) + return ret; + + /* + * If there is a discontinuity in the history, scale monotonic raw + * correction by: + * mult(real)/mult(raw) yielding the realtime correction + * Otherwise, calculate the realtime correction similar to monotonic + * raw calculation + */ + if (discontinuity) { + corr_real = mul_u64_u32_div + (corr_raw, tk->tkr_mono.mult, tk->tkr_raw.mult); + } else { + corr_real = (u64)ktime_to_ns( + ktime_sub(ts->sys_realtime, history->real)); + ret = scale64_check_overflow(partial_history_cycles, + total_history_cycles, &corr_real); + if (ret) + return ret; + } + + /* Fixup monotonic raw and real time time values */ + if (interp_forward) { + ts->sys_monoraw = ktime_add_ns(history->raw, corr_raw); + ts->sys_realtime = ktime_add_ns(history->real, corr_real); + } else { + ts->sys_monoraw = ktime_sub_ns(ts->sys_monoraw, corr_raw); + ts->sys_realtime = ktime_sub_ns(ts->sys_realtime, corr_real); + } + + return 0; +} + +/* + * cycle_between - true if test occurs chronologically between before and after + */ +static bool cycle_between(cycle_t before, cycle_t test, cycle_t after) +{ + if (test > before && test < after) + return true; + if (test < before && before > after) + return true; + return false; +} + +/** + * get_device_system_crosststamp - Synchronously capture system/device timestamp + * @get_time_fn: Callback to get simultaneous device time and + * system counter from the device driver + * @ctx: Context passed to get_time_fn() + * @history_begin: Historical reference point used to interpolate system + * time when counter provided by the driver is before the current interval + * @xtstamp: Receives simultaneously captured system and device time + * + * Reads a timestamp from a device and correlates it to system time + */ +int get_device_system_crosststamp(int (*get_time_fn) + (ktime_t *device_time, + struct system_counterval_t *sys_counterval, + void *ctx), + void *ctx, + struct system_time_snapshot *history_begin, + struct system_device_crosststamp *xtstamp) +{ + struct system_counterval_t system_counterval; + struct timekeeper *tk = &tk_core.timekeeper; + cycle_t cycles, now, interval_start; + unsigned int clock_was_set_seq = 0; + ktime_t base_real, base_raw; + s64 nsec_real, nsec_raw; + u8 cs_was_changed_seq; + unsigned long seq; + bool do_interp; + int ret; + + do { + seq = read_seqcount_begin(&tk_core.seq); + /* + * Try to synchronously capture device time and a system + * counter value calling back into the device driver + */ + ret = get_time_fn(&xtstamp->device, &system_counterval, ctx); + if (ret) + return ret; + + /* + * Verify that the clocksource associated with the captured + * system counter value is the same as the currently installed + * timekeeper clocksource + */ + if (tk->tkr_mono.clock != system_counterval.cs) + return -ENODEV; + cycles = system_counterval.cycles; + + /* + * Check whether the system counter value provided by the + * device driver is on the current timekeeping interval. 
+ */ + now = tk->tkr_mono.read(tk->tkr_mono.clock); + interval_start = tk->tkr_mono.cycle_last; + if (!cycle_between(interval_start, cycles, now)) { + clock_was_set_seq = tk->clock_was_set_seq; + cs_was_changed_seq = tk->cs_was_changed_seq; + cycles = interval_start; + do_interp = true; + } else { + do_interp = false; + } + + base_real = ktime_add(tk->tkr_mono.base, + tk_core.timekeeper.offs_real); + base_raw = tk->tkr_raw.base; + + nsec_real = timekeeping_cycles_to_ns(&tk->tkr_mono, + system_counterval.cycles); + nsec_raw = timekeeping_cycles_to_ns(&tk->tkr_raw, + system_counterval.cycles); + } while (read_seqcount_retry(&tk_core.seq, seq)); + + xtstamp->sys_realtime = ktime_add_ns(base_real, nsec_real); + xtstamp->sys_monoraw = ktime_add_ns(base_raw, nsec_raw); + + /* + * Interpolate if necessary, adjusting back from the start of the + * current interval + */ + if (do_interp) { + cycle_t partial_history_cycles, total_history_cycles; + bool discontinuity; + + /* + * Check that the counter value occurs after the provided + * history reference and that the history doesn't cross a + * clocksource change + */ + if (!history_begin || + !cycle_between(history_begin->cycles, + system_counterval.cycles, cycles) || + history_begin->cs_was_changed_seq != cs_was_changed_seq) + return -EINVAL; + partial_history_cycles = cycles - system_counterval.cycles; + total_history_cycles = cycles - history_begin->cycles; + discontinuity = + history_begin->clock_was_set_seq != clock_was_set_seq; + + ret = adjust_historical_crosststamp(history_begin, + partial_history_cycles, + total_history_cycles, + discontinuity, xtstamp); + if (ret) + return ret; + } + + return 0; +} +EXPORT_SYMBOL_GPL(get_device_system_crosststamp); /** * do_gettimeofday - Returns the time of day in a timeval diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index 8bfd1aca7a3d..f28f7fad452f 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -1442,6 +1442,19 @@ config DEBUG_BLOCK_EXT_DEVT Say N if you are unsure. +config CPU_HOTPLUG_STATE_CONTROL + bool "Enable CPU hotplug state control" + depends on DEBUG_KERNEL + depends on HOTPLUG_CPU + default n + help + Allows writing individual steps between "offline" and "online" to the + CPU's sysfs target file so hotplug states can be stepped through one + at a time. This is a debug option for now as the hotplug machinery + cannot be stopped and restarted at arbitrary points yet. + + Say N if you are unsure. + config NOTIFIER_ERROR_INJECTION tristate "Notifier error injection" depends on DEBUG_KERNEL diff --git a/lib/atomic64_test.c b/lib/atomic64_test.c index d62de8bf022d..123481814320 100644 --- a/lib/atomic64_test.c +++ b/lib/atomic64_test.c @@ -17,7 +17,7 @@ #include #ifdef CONFIG_X86 -#include /* for boot_cpu_has below */ +#include /* for boot_cpu_has below */ #endif #define TEST(bit, op, c_op, val) \ diff --git a/mm/memory.c b/mm/memory.c index 8132787ae4d5..906d8e3b42c0 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -1550,9 +1550,30 @@ out: */ int vm_insert_pfn(struct vm_area_struct *vma, unsigned long addr, unsigned long pfn) +{ + return vm_insert_pfn_prot(vma, addr, pfn, vma->vm_page_prot); +} +EXPORT_SYMBOL(vm_insert_pfn); + +/** + * vm_insert_pfn_prot - insert single pfn into user vma with specified pgprot + * @vma: user vma to map to + * @addr: target user address of this page + * @pfn: source kernel pfn + * @pgprot: pgprot flags for the inserted page + * + * This is exactly like vm_insert_pfn, except that it allows drivers to + * override pgprot on a per-page basis. 
+ * + * This only makes sense for IO mappings, and it makes no sense for + * cow mappings. In general, using multiple vmas is preferable; + * vm_insert_pfn_prot should only be used if using multiple VMAs is + * impractical. + */ +int vm_insert_pfn_prot(struct vm_area_struct *vma, unsigned long addr, + unsigned long pfn, pgprot_t pgprot) { int ret; - pgprot_t pgprot = vma->vm_page_prot; /* * Technically, architectures with pte_special can avoid all these * restrictions (same for remap_pfn_range). However we would like @@ -1574,7 +1595,7 @@ int vm_insert_pfn(struct vm_area_struct *vma, unsigned long addr, return ret; } -EXPORT_SYMBOL(vm_insert_pfn); +EXPORT_SYMBOL(vm_insert_pfn_prot); int vm_insert_mixed(struct vm_area_struct *vma, unsigned long addr, pfn_t pfn) diff --git a/mm/mmap.c b/mm/mmap.c index 76d1ec29149b..90e3b869a8b9 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -3066,11 +3066,16 @@ static int special_mapping_fault(struct vm_area_struct *vma, pgoff_t pgoff; struct page **pages; - if (vma->vm_ops == &legacy_special_mapping_vmops) + if (vma->vm_ops == &legacy_special_mapping_vmops) { pages = vma->vm_private_data; - else - pages = ((struct vm_special_mapping *)vma->vm_private_data)-> - pages; + } else { + struct vm_special_mapping *sm = vma->vm_private_data; + + if (sm->fault) + return sm->fault(sm, vma, vmf); + + pages = sm->pages; + } for (pgoff = vmf->pgoff; pgoff && *pages; ++pages) pgoff--; diff --git a/scripts/checkpatch.pl b/scripts/checkpatch.pl index 0147c91fa549..874132b26d23 100755 --- a/scripts/checkpatch.pl +++ b/scripts/checkpatch.pl @@ -269,7 +269,8 @@ our $Sparse = qr{ __init_refok| __kprobes| __ref| - __rcu + __rcu| + __private }x; our $InitAttributePrefix = qr{__(?:mem|cpu|dev|net_|)}; our $InitAttributeData = qr{$InitAttributePrefix(?:initdata\b)}; diff --git a/tools/testing/selftests/rcutorture/bin/parse-console.sh b/tools/testing/selftests/rcutorture/bin/parse-console.sh index 844787a0d7be..5eb49b7f864c 100755 --- a/tools/testing/selftests/rcutorture/bin/parse-console.sh +++ b/tools/testing/selftests/rcutorture/bin/parse-console.sh @@ -33,7 +33,7 @@ if grep -Pq '\x00' < $file then print_warning Console output contains nul bytes, old qemu still running? fi -egrep 'Badness|WARNING:|Warn|BUG|===========|Call Trace:|Oops:|detected stalls on CPUs/tasks:|Stall ended before state dump start' < $file | grep -v 'ODEBUG: ' | grep -v 'Warning: unable to open an initial console' > $1.diags +egrep 'Badness|WARNING:|Warn|BUG|===========|Call Trace:|Oops:|detected stalls on CPUs/tasks:|self-detected stall on CPU|Stall ended before state dump start|\?\?\? Writer stall state' < $file | grep -v 'ODEBUG: ' | grep -v 'Warning: unable to open an initial console' > $1.diags if test -s $1.diags then print_warning Assertion failure in $file $title @@ -64,10 +64,12 @@ then then summary="$summary lockdep: $n_badness" fi - n_stalls=`egrep -c 'detected stalls on CPUs/tasks:|Stall ended before state dump start' $1` + n_stalls=`egrep -c 'detected stalls on CPUs/tasks:|self-detected stall on CPU|Stall ended before state dump start|\?\?\? 
Writer stall state' $1` if test "$n_stalls" -ne 0 then summary="$summary Stalls: $n_stalls" fi print_warning Summary: $summary +else + rm $1.diags fi diff --git a/tools/testing/selftests/x86/Makefile b/tools/testing/selftests/x86/Makefile index d0c473f65850..d5ce7d7aae3e 100644 --- a/tools/testing/selftests/x86/Makefile +++ b/tools/testing/selftests/x86/Makefile @@ -4,15 +4,16 @@ include ../lib.mk .PHONY: all all_32 all_64 warn_32bit_failure clean -TARGETS_C_BOTHBITS := single_step_syscall sysret_ss_attrs syscall_nt ptrace_syscall -TARGETS_C_32BIT_ONLY := entry_from_vm86 syscall_arg_fault sigreturn test_syscall_vdso unwind_vdso \ +TARGETS_C_BOTHBITS := single_step_syscall sysret_ss_attrs syscall_nt ptrace_syscall \ + check_initial_reg_state sigreturn ldt_gdt +TARGETS_C_32BIT_ONLY := entry_from_vm86 syscall_arg_fault test_syscall_vdso unwind_vdso \ test_FCMOV test_FCOMI test_FISTTP \ - ldt_gdt \ vdso_restorer TARGETS_C_32BIT_ALL := $(TARGETS_C_BOTHBITS) $(TARGETS_C_32BIT_ONLY) +TARGETS_C_64BIT_ALL := $(TARGETS_C_BOTHBITS) $(TARGETS_C_64BIT_ONLY) BINARIES_32 := $(TARGETS_C_32BIT_ALL:%=%_32) -BINARIES_64 := $(TARGETS_C_BOTHBITS:%=%_64) +BINARIES_64 := $(TARGETS_C_64BIT_ALL:%=%_64) CFLAGS := -O2 -g -std=gnu99 -pthread -Wall @@ -40,7 +41,7 @@ clean: $(TARGETS_C_32BIT_ALL:%=%_32): %_32: %.c $(CC) -m32 -o $@ $(CFLAGS) $(EXTRA_CFLAGS) $^ -lrt -ldl -lm -$(TARGETS_C_BOTHBITS:%=%_64): %_64: %.c +$(TARGETS_C_64BIT_ALL:%=%_64): %_64: %.c $(CC) -m64 -o $@ $(CFLAGS) $(EXTRA_CFLAGS) $^ -lrt -ldl # x86_64 users should be encouraged to install 32-bit libraries @@ -65,3 +66,9 @@ endif sysret_ss_attrs_64: thunks.S ptrace_syscall_32: raw_syscall_helper_32.S test_syscall_vdso_32: thunks_32.S + +# check_initial_reg_state is special: it needs a custom entry, and it +# needs to be static so that its interpreter doesn't destroy its initial +# state. +check_initial_reg_state_32: CFLAGS += -Wl,-ereal_start -static +check_initial_reg_state_64: CFLAGS += -Wl,-ereal_start -static diff --git a/tools/testing/selftests/x86/check_initial_reg_state.c b/tools/testing/selftests/x86/check_initial_reg_state.c new file mode 100644 index 000000000000..6aaed9b85baf --- /dev/null +++ b/tools/testing/selftests/x86/check_initial_reg_state.c @@ -0,0 +1,109 @@ +/* + * check_initial_reg_state.c - check that execve sets the correct state + * Copyright (c) 2014-2016 Andrew Lutomirski + * + * This program is free software; you can redistribute it and/or modify + * it under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. 
+ */ + +#define _GNU_SOURCE + +#include + +unsigned long ax, bx, cx, dx, si, di, bp, sp, flags; +unsigned long r8, r9, r10, r11, r12, r13, r14, r15; + +asm ( + ".pushsection .text\n\t" + ".type real_start, @function\n\t" + ".global real_start\n\t" + "real_start:\n\t" +#ifdef __x86_64__ + "mov %rax, ax\n\t" + "mov %rbx, bx\n\t" + "mov %rcx, cx\n\t" + "mov %rdx, dx\n\t" + "mov %rsi, si\n\t" + "mov %rdi, di\n\t" + "mov %rbp, bp\n\t" + "mov %rsp, sp\n\t" + "mov %r8, r8\n\t" + "mov %r9, r9\n\t" + "mov %r10, r10\n\t" + "mov %r11, r11\n\t" + "mov %r12, r12\n\t" + "mov %r13, r13\n\t" + "mov %r14, r14\n\t" + "mov %r15, r15\n\t" + "pushfq\n\t" + "popq flags\n\t" +#else + "mov %eax, ax\n\t" + "mov %ebx, bx\n\t" + "mov %ecx, cx\n\t" + "mov %edx, dx\n\t" + "mov %esi, si\n\t" + "mov %edi, di\n\t" + "mov %ebp, bp\n\t" + "mov %esp, sp\n\t" + "pushfl\n\t" + "popl flags\n\t" +#endif + "jmp _start\n\t" + ".size real_start, . - real_start\n\t" + ".popsection"); + +int main() +{ + int nerrs = 0; + + if (sp == 0) { + printf("[FAIL]\tTest was built incorrectly\n"); + return 1; + } + + if (ax || bx || cx || dx || si || di || bp +#ifdef __x86_64__ + || r8 || r9 || r10 || r11 || r12 || r13 || r14 || r15 +#endif + ) { + printf("[FAIL]\tAll GPRs except SP should be 0\n"); +#define SHOW(x) printf("\t" #x " = 0x%lx\n", x); + SHOW(ax); + SHOW(bx); + SHOW(cx); + SHOW(dx); + SHOW(si); + SHOW(di); + SHOW(bp); + SHOW(sp); +#ifdef __x86_64__ + SHOW(r8); + SHOW(r9); + SHOW(r10); + SHOW(r11); + SHOW(r12); + SHOW(r13); + SHOW(r14); + SHOW(r15); +#endif + nerrs++; + } else { + printf("[OK]\tAll GPRs except SP are 0\n"); + } + + if (flags != 0x202) { + printf("[FAIL]\tFLAGS is 0x%lx, but it should be 0x202\n", flags); + nerrs++; + } else { + printf("[OK]\tFLAGS is 0x202\n"); + } + + return nerrs ? 
1 : 0; +} diff --git a/tools/testing/selftests/x86/ptrace_syscall.c b/tools/testing/selftests/x86/ptrace_syscall.c index 5105b49cd8aa..421456784bc6 100644 --- a/tools/testing/selftests/x86/ptrace_syscall.c +++ b/tools/testing/selftests/x86/ptrace_syscall.c @@ -103,6 +103,17 @@ static void sethandler(int sig, void (*handler)(int, siginfo_t *, void *), err(1, "sigaction"); } +static void setsigign(int sig, int flags) +{ + struct sigaction sa; + memset(&sa, 0, sizeof(sa)); + sa.sa_sigaction = (void *)SIG_IGN; + sa.sa_flags = flags; + sigemptyset(&sa.sa_mask); + if (sigaction(sig, &sa, 0)) + err(1, "sigaction"); +} + static void clearhandler(int sig) { struct sigaction sa; @@ -187,7 +198,7 @@ static void test_ptrace_syscall_restart(void) printf("[RUN]\tSYSEMU\n"); if (ptrace(PTRACE_SYSEMU, chld, 0, 0) != 0) - err(1, "PTRACE_SYSCALL"); + err(1, "PTRACE_SYSEMU"); wait_trap(chld); if (ptrace(PTRACE_GETREGS, chld, 0, ®s) != 0) @@ -218,7 +229,7 @@ static void test_ptrace_syscall_restart(void) err(1, "PTRACE_SETREGS"); if (ptrace(PTRACE_SYSEMU, chld, 0, 0) != 0) - err(1, "PTRACE_SYSCALL"); + err(1, "PTRACE_SYSEMU"); wait_trap(chld); if (ptrace(PTRACE_GETREGS, chld, 0, ®s) != 0) @@ -250,7 +261,7 @@ static void test_ptrace_syscall_restart(void) err(1, "PTRACE_SETREGS"); if (ptrace(PTRACE_SYSEMU, chld, 0, 0) != 0) - err(1, "PTRACE_SYSCALL"); + err(1, "PTRACE_SYSEMU"); wait_trap(chld); if (ptrace(PTRACE_GETREGS, chld, 0, ®s) != 0) @@ -277,6 +288,119 @@ static void test_ptrace_syscall_restart(void) } } +static void test_restart_under_ptrace(void) +{ + printf("[RUN]\tkernel syscall restart under ptrace\n"); + pid_t chld = fork(); + if (chld < 0) + err(1, "fork"); + + if (chld == 0) { + if (ptrace(PTRACE_TRACEME, 0, 0, 0) != 0) + err(1, "PTRACE_TRACEME"); + + printf("\tChild will take a nap until signaled\n"); + setsigign(SIGUSR1, SA_RESTART); + raise(SIGSTOP); + + syscall(SYS_pause, 0, 0, 0, 0, 0, 0); + _exit(0); + } + + int status; + + /* Wait for SIGSTOP. */ + if (waitpid(chld, &status, 0) != chld || !WIFSTOPPED(status)) + err(1, "waitpid"); + + struct user_regs_struct regs; + + printf("[RUN]\tSYSCALL\n"); + if (ptrace(PTRACE_SYSCALL, chld, 0, 0) != 0) + err(1, "PTRACE_SYSCALL"); + wait_trap(chld); + + /* We should be stopped at pause(2) entry. */ + + if (ptrace(PTRACE_GETREGS, chld, 0, ®s) != 0) + err(1, "PTRACE_GETREGS"); + + if (regs.user_syscall_nr != SYS_pause || + regs.user_arg0 != 0 || regs.user_arg1 != 0 || + regs.user_arg2 != 0 || regs.user_arg3 != 0 || + regs.user_arg4 != 0 || regs.user_arg5 != 0) { + printf("[FAIL]\tInitial args are wrong (nr=%lu, args=%lu %lu %lu %lu %lu %lu)\n", (unsigned long)regs.user_syscall_nr, (unsigned long)regs.user_arg0, (unsigned long)regs.user_arg1, (unsigned long)regs.user_arg2, (unsigned long)regs.user_arg3, (unsigned long)regs.user_arg4, (unsigned long)regs.user_arg5); + nerrs++; + } else { + printf("[OK]\tInitial nr and args are correct\n"); + } + + /* Interrupt it. */ + kill(chld, SIGUSR1); + + /* Advance. We should be stopped at exit. 
*/ + printf("[RUN]\tSYSCALL\n"); + if (ptrace(PTRACE_SYSCALL, chld, 0, 0) != 0) + err(1, "PTRACE_SYSCALL"); + wait_trap(chld); + + if (ptrace(PTRACE_GETREGS, chld, 0, ®s) != 0) + err(1, "PTRACE_GETREGS"); + + if (regs.user_syscall_nr != SYS_pause || + regs.user_arg0 != 0 || regs.user_arg1 != 0 || + regs.user_arg2 != 0 || regs.user_arg3 != 0 || + regs.user_arg4 != 0 || regs.user_arg5 != 0) { + printf("[FAIL]\tArgs after SIGUSR1 are wrong (nr=%lu, args=%lu %lu %lu %lu %lu %lu)\n", (unsigned long)regs.user_syscall_nr, (unsigned long)regs.user_arg0, (unsigned long)regs.user_arg1, (unsigned long)regs.user_arg2, (unsigned long)regs.user_arg3, (unsigned long)regs.user_arg4, (unsigned long)regs.user_arg5); + nerrs++; + } else { + printf("[OK]\tArgs after SIGUSR1 are correct (ax = %ld)\n", + (long)regs.user_ax); + } + + /* Poke the regs back in. This must not break anything. */ + if (ptrace(PTRACE_SETREGS, chld, 0, ®s) != 0) + err(1, "PTRACE_SETREGS"); + + /* Catch the (ignored) SIGUSR1. */ + if (ptrace(PTRACE_CONT, chld, 0, 0) != 0) + err(1, "PTRACE_CONT"); + if (waitpid(chld, &status, 0) != chld) + err(1, "waitpid"); + if (!WIFSTOPPED(status)) { + printf("[FAIL]\tChild was stopped for SIGUSR1 (status = 0x%x)\n", status); + nerrs++; + } else { + printf("[OK]\tChild got SIGUSR1\n"); + } + + /* The next event should be pause(2) again. */ + printf("[RUN]\tStep again\n"); + if (ptrace(PTRACE_SYSCALL, chld, 0, 0) != 0) + err(1, "PTRACE_SYSCALL"); + wait_trap(chld); + + /* We should be stopped at pause(2) entry. */ + + if (ptrace(PTRACE_GETREGS, chld, 0, ®s) != 0) + err(1, "PTRACE_GETREGS"); + + if (regs.user_syscall_nr != SYS_pause || + regs.user_arg0 != 0 || regs.user_arg1 != 0 || + regs.user_arg2 != 0 || regs.user_arg3 != 0 || + regs.user_arg4 != 0 || regs.user_arg5 != 0) { + printf("[FAIL]\tpause did not restart (nr=%lu, args=%lu %lu %lu %lu %lu %lu)\n", (unsigned long)regs.user_syscall_nr, (unsigned long)regs.user_arg0, (unsigned long)regs.user_arg1, (unsigned long)regs.user_arg2, (unsigned long)regs.user_arg3, (unsigned long)regs.user_arg4, (unsigned long)regs.user_arg5); + nerrs++; + } else { + printf("[OK]\tpause(2) restarted correctly\n"); + } + + /* Kill it. */ + kill(chld, SIGKILL); + if (waitpid(chld, &status, 0) != chld) + err(1, "waitpid"); +} + int main() { printf("[RUN]\tCheck int80 return regs\n"); @@ -290,5 +414,7 @@ int main() test_ptrace_syscall_restart(); + test_restart_under_ptrace(); + return 0; } diff --git a/tools/testing/selftests/x86/sigreturn.c b/tools/testing/selftests/x86/sigreturn.c index b5aa1bab7416..8a577e7070c6 100644 --- a/tools/testing/selftests/x86/sigreturn.c +++ b/tools/testing/selftests/x86/sigreturn.c @@ -54,6 +54,37 @@ #include #include +/* Pull in AR_xyz defines. */ +typedef unsigned int u32; +typedef unsigned short u16; +#include "../../../../arch/x86/include/asm/desc_defs.h" + +/* + * Copied from asm/ucontext.h, as asm/ucontext.h conflicts badly with the glibc + * headers. + */ +#ifdef __x86_64__ +/* + * UC_SIGCONTEXT_SS will be set when delivering 64-bit or x32 signals on + * kernels that save SS in the sigcontext. All kernels that set + * UC_SIGCONTEXT_SS will correctly restore at least the low 32 bits of esp + * regardless of SS (i.e. they implement espfix). + * + * Kernels that set UC_SIGCONTEXT_SS will also set UC_STRICT_RESTORE_SS + * when delivering a signal that came from 64-bit code. 
+ * + * Sigreturn restores SS as follows: + * + * if (saved SS is valid || UC_STRICT_RESTORE_SS is set || + * saved CS is not 64-bit) + * new SS = saved SS (will fail IRET and signal if invalid) + * else + * new SS = a flat 32-bit data segment + */ +#define UC_SIGCONTEXT_SS 0x2 +#define UC_STRICT_RESTORE_SS 0x4 +#endif + /* * In principle, this test can run on Linux emulation layers (e.g. * Illumos "LX branded zones"). Solaris-based kernels reserve LDT @@ -267,6 +298,9 @@ static gregset_t initial_regs, requested_regs, resulting_regs; /* Instructions for the SIGUSR1 handler. */ static volatile unsigned short sig_cs, sig_ss; static volatile sig_atomic_t sig_trapped, sig_err, sig_trapno; +#ifdef __x86_64__ +static volatile sig_atomic_t sig_corrupt_final_ss; +#endif /* Abstractions for some 32-bit vs 64-bit differences. */ #ifdef __x86_64__ @@ -305,62 +339,6 @@ static greg_t *csptr(ucontext_t *ctx) } #endif -/* Number of errors in the current test case. */ -static volatile sig_atomic_t nerrs; - -/* - * SIGUSR1 handler. Sets CS and SS as requested and points IP to the - * int3 trampoline. Sets SP to a large known value so that we can see - * whether the value round-trips back to user mode correctly. - */ -static void sigusr1(int sig, siginfo_t *info, void *ctx_void) -{ - ucontext_t *ctx = (ucontext_t*)ctx_void; - - memcpy(&initial_regs, &ctx->uc_mcontext.gregs, sizeof(gregset_t)); - - *csptr(ctx) = sig_cs; - *ssptr(ctx) = sig_ss; - - ctx->uc_mcontext.gregs[REG_IP] = - sig_cs == code16_sel ? 0 : (unsigned long)&int3; - ctx->uc_mcontext.gregs[REG_SP] = (unsigned long)0x8badf00d5aadc0deULL; - ctx->uc_mcontext.gregs[REG_AX] = 0; - - memcpy(&requested_regs, &ctx->uc_mcontext.gregs, sizeof(gregset_t)); - requested_regs[REG_AX] = *ssptr(ctx); /* The asm code does this. */ - - return; -} - -/* - * Called after a successful sigreturn. Restores our state so that - * the original raise(SIGUSR1) returns. - */ -static void sigtrap(int sig, siginfo_t *info, void *ctx_void) -{ - ucontext_t *ctx = (ucontext_t*)ctx_void; - - sig_err = ctx->uc_mcontext.gregs[REG_ERR]; - sig_trapno = ctx->uc_mcontext.gregs[REG_TRAPNO]; - - unsigned short ss; - asm ("mov %%ss,%0" : "=r" (ss)); - - greg_t asm_ss = ctx->uc_mcontext.gregs[REG_AX]; - if (asm_ss != sig_ss && sig == SIGTRAP) { - /* Sanity check failure. */ - printf("[FAIL]\tSIGTRAP: ss = %hx, frame ss = %hx, ax = %llx\n", - ss, *ssptr(ctx), (unsigned long long)asm_ss); - nerrs++; - } - - memcpy(&resulting_regs, &ctx->uc_mcontext.gregs, sizeof(gregset_t)); - memcpy(&ctx->uc_mcontext.gregs, &initial_regs, sizeof(gregset_t)); - - sig_trapped = sig; -} - /* * Checks a given selector for its code bitness or returns -1 if it's not * a usable code segment selector. @@ -394,6 +372,184 @@ int cs_bitness(unsigned short cs) return -1; /* Unknown bitness. */ } +/* + * Checks a given selector for its code bitness or returns -1 if it's not + * a usable code segment selector. + */ +bool is_valid_ss(unsigned short cs) +{ + uint32_t valid = 0, ar; + asm ("lar %[cs], %[ar]\n\t" + "jnz 1f\n\t" + "mov $1, %[valid]\n\t" + "1:" + : [ar] "=r" (ar), [valid] "+rm" (valid) + : [cs] "r" (cs)); + + if (!valid) + return false; + + if ((ar & AR_TYPE_MASK) != AR_TYPE_RWDATA && + (ar & AR_TYPE_MASK) != AR_TYPE_RWDATA_EXPDOWN) + return false; + + return (ar & AR_P); +} + +/* Number of errors in the current test case. 
*/ +static volatile sig_atomic_t nerrs; + +static void validate_signal_ss(int sig, ucontext_t *ctx) +{ +#ifdef __x86_64__ + bool was_64bit = (cs_bitness(*csptr(ctx)) == 64); + + if (!(ctx->uc_flags & UC_SIGCONTEXT_SS)) { + printf("[FAIL]\tUC_SIGCONTEXT_SS was not set\n"); + nerrs++; + + /* + * This happens on Linux 4.1. The rest will fail, too, so + * return now to reduce the noise. + */ + return; + } + + /* UC_STRICT_RESTORE_SS is set iff we came from 64-bit mode. */ + if (!!(ctx->uc_flags & UC_STRICT_RESTORE_SS) != was_64bit) { + printf("[FAIL]\tUC_STRICT_RESTORE_SS was wrong in signal %d\n", + sig); + nerrs++; + } + + if (is_valid_ss(*ssptr(ctx))) { + /* + * DOSEMU was written before 64-bit sigcontext had SS, and + * it tries to figure out the signal source SS by looking at + * the physical register. Make sure that keeps working. + */ + unsigned short hw_ss; + asm ("mov %%ss, %0" : "=rm" (hw_ss)); + if (hw_ss != *ssptr(ctx)) { + printf("[FAIL]\tHW SS didn't match saved SS\n"); + nerrs++; + } + } +#endif +} + +/* + * SIGUSR1 handler. Sets CS and SS as requested and points IP to the + * int3 trampoline. Sets SP to a large known value so that we can see + * whether the value round-trips back to user mode correctly. + */ +static void sigusr1(int sig, siginfo_t *info, void *ctx_void) +{ + ucontext_t *ctx = (ucontext_t*)ctx_void; + + validate_signal_ss(sig, ctx); + + memcpy(&initial_regs, &ctx->uc_mcontext.gregs, sizeof(gregset_t)); + + *csptr(ctx) = sig_cs; + *ssptr(ctx) = sig_ss; + + ctx->uc_mcontext.gregs[REG_IP] = + sig_cs == code16_sel ? 0 : (unsigned long)&int3; + ctx->uc_mcontext.gregs[REG_SP] = (unsigned long)0x8badf00d5aadc0deULL; + ctx->uc_mcontext.gregs[REG_AX] = 0; + + memcpy(&requested_regs, &ctx->uc_mcontext.gregs, sizeof(gregset_t)); + requested_regs[REG_AX] = *ssptr(ctx); /* The asm code does this. */ + + return; +} + +/* + * Called after a successful sigreturn (via int3) or from a failed + * sigreturn (directly by kernel). Restores our state so that the + * original raise(SIGUSR1) returns. + */ +static void sigtrap(int sig, siginfo_t *info, void *ctx_void) +{ + ucontext_t *ctx = (ucontext_t*)ctx_void; + + validate_signal_ss(sig, ctx); + + sig_err = ctx->uc_mcontext.gregs[REG_ERR]; + sig_trapno = ctx->uc_mcontext.gregs[REG_TRAPNO]; + + unsigned short ss; + asm ("mov %%ss,%0" : "=r" (ss)); + + greg_t asm_ss = ctx->uc_mcontext.gregs[REG_AX]; + if (asm_ss != sig_ss && sig == SIGTRAP) { + /* Sanity check failure. */ + printf("[FAIL]\tSIGTRAP: ss = %hx, frame ss = %hx, ax = %llx\n", + ss, *ssptr(ctx), (unsigned long long)asm_ss); + nerrs++; + } + + memcpy(&resulting_regs, &ctx->uc_mcontext.gregs, sizeof(gregset_t)); + memcpy(&ctx->uc_mcontext.gregs, &initial_regs, sizeof(gregset_t)); + +#ifdef __x86_64__ + if (sig_corrupt_final_ss) { + if (ctx->uc_flags & UC_STRICT_RESTORE_SS) { + printf("[FAIL]\tUC_STRICT_RESTORE_SS was set inappropriately\n"); + nerrs++; + } else { + /* + * DOSEMU transitions from 32-bit to 64-bit mode by + * adjusting sigcontext, and it requires that this work + * even if the saved SS is bogus. + */ + printf("\tCorrupting SS on return to 64-bit mode\n"); + *ssptr(ctx) = 0; + } + } +#endif + + sig_trapped = sig; +} + +#ifdef __x86_64__ +/* Tests recovery if !UC_STRICT_RESTORE_SS */ +static void sigusr2(int sig, siginfo_t *info, void *ctx_void) +{ + ucontext_t *ctx = (ucontext_t*)ctx_void; + + if (!(ctx->uc_flags & UC_STRICT_RESTORE_SS)) { + printf("[FAIL]\traise(2) didn't set UC_STRICT_RESTORE_SS\n"); + nerrs++; + return; /* We can't do the rest. 
*/ + } + + ctx->uc_flags &= ~UC_STRICT_RESTORE_SS; + *ssptr(ctx) = 0; + + /* Return. The kernel should recover without sending another signal. */ +} + +static int test_nonstrict_ss(void) +{ + clearhandler(SIGUSR1); + clearhandler(SIGTRAP); + clearhandler(SIGSEGV); + clearhandler(SIGILL); + sethandler(SIGUSR2, sigusr2, 0); + + nerrs = 0; + + printf("[RUN]\tClear UC_STRICT_RESTORE_SS and corrupt SS\n"); + raise(SIGUSR2); + if (!nerrs) + printf("[OK]\tIt worked\n"); + + return nerrs; +} +#endif + /* Finds a usable code segment of the requested bitness. */ int find_cs(int bitness) { @@ -576,6 +732,12 @@ static int test_bad_iret(int cs_bits, unsigned short ss, int force_cs) errdesc, strsignal(sig_trapped)); return 0; } else { + /* + * This also implicitly tests UC_STRICT_RESTORE_SS: + * We check that these signals set UC_STRICT_RESTORE_SS and, + * if UC_STRICT_RESTORE_SS doesn't cause strict behavior, + * then we won't get SIGSEGV. + */ printf("[FAIL]\tDid not get SIGSEGV\n"); return 1; } @@ -632,6 +794,14 @@ int main() GDT3(gdt_data16_idx)); } +#ifdef __x86_64__ + /* Nasty ABI case: check SS corruption handling. */ + sig_corrupt_final_ss = 1; + total_nerrs += test_valid_sigreturn(32, false, -1); + total_nerrs += test_valid_sigreturn(32, true, -1); + sig_corrupt_final_ss = 0; +#endif + /* * We're done testing valid sigreturn cases. Now we test states * for which sigreturn itself will succeed but the subsequent @@ -680,5 +850,9 @@ int main() if (gdt_npdata32_idx) test_bad_iret(32, GDT3(gdt_npdata32_idx), -1); +#ifdef __x86_64__ + total_nerrs += test_nonstrict_ss(); +#endif + return total_nerrs ? 1 : 0; } diff --git a/tools/testing/selftests/x86/syscall_nt.c b/tools/testing/selftests/x86/syscall_nt.c index 60c06af4646a..43fcab367fb0 100644 --- a/tools/testing/selftests/x86/syscall_nt.c +++ b/tools/testing/selftests/x86/syscall_nt.c @@ -17,6 +17,9 @@ #include #include +#include +#include +#include #include #include @@ -26,6 +29,8 @@ # define WIDTH "l" #endif +static unsigned int nerrs; + static unsigned long get_eflags(void) { unsigned long eflags; @@ -39,16 +44,52 @@ static void set_eflags(unsigned long eflags) : : "rm" (eflags) : "flags"); } -int main() +static void sethandler(int sig, void (*handler)(int, siginfo_t *, void *), + int flags) { - printf("[RUN]\tSet NT and issue a syscall\n"); - set_eflags(get_eflags() | X86_EFLAGS_NT); + struct sigaction sa; + memset(&sa, 0, sizeof(sa)); + sa.sa_sigaction = handler; + sa.sa_flags = SA_SIGINFO | flags; + sigemptyset(&sa.sa_mask); + if (sigaction(sig, &sa, 0)) + err(1, "sigaction"); +} + +static void sigtrap(int sig, siginfo_t *si, void *ctx_void) +{ +} + +static void do_it(unsigned long extraflags) +{ + unsigned long flags; + + set_eflags(get_eflags() | extraflags); syscall(SYS_getpid); - if (get_eflags() & X86_EFLAGS_NT) { - printf("[OK]\tThe syscall worked and NT is still set\n"); - return 0; + flags = get_eflags(); + if ((flags & extraflags) == extraflags) { + printf("[OK]\tThe syscall worked and flags are still set\n"); } else { - printf("[FAIL]\tThe syscall worked but NT was cleared\n"); - return 1; + printf("[FAIL]\tThe syscall worked but flags were cleared (flags = 0x%lx but expected 0x%lx set)\n", + flags, extraflags); + nerrs++; } } + +int main(void) +{ + printf("[RUN]\tSet NT and issue a syscall\n"); + do_it(X86_EFLAGS_NT); + + /* + * Now try it again with TF set -- TF forces returns via IRET in all + * cases except non-ptregs-using 64-bit full fast path syscalls. 
+ */ + + sethandler(SIGTRAP, sigtrap, 0); + + printf("[RUN]\tSet NT|TF and issue a syscall\n"); + do_it(X86_EFLAGS_NT | X86_EFLAGS_TF); + + return nerrs == 0 ? 0 : 1; +}
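
Example (illustrative only): a minimal sketch of how a driver might call the get_device_system_crosststamp() interface added by this series. The struct my_pch_dev and the my_read_*() helpers are hypothetical placeholders for driver-specific code, and the header providing the declarations is an assumption; this sketch is not part of the patch itself.

#include <linux/timekeeping.h>	/* assumed location of the new crosststamp declarations */
#include <linux/printk.h>

struct my_pch_dev;	/* hypothetical driver state */
extern ktime_t my_read_device_clock(struct my_pch_dev *dev);		/* hypothetical */
extern struct system_counterval_t my_read_system_counterval(struct my_pch_dev *dev); /* hypothetical */

static int my_get_time_fn(ktime_t *device_time,
			  struct system_counterval_t *sys_counterval,
			  void *ctx)
{
	struct my_pch_dev *dev = ctx;

	/* Latch the device clock and the matching system counter value. */
	*device_time = my_read_device_clock(dev);
	*sys_counterval = my_read_system_counterval(dev);
	return 0;
}

static int my_report_crosststamp(struct my_pch_dev *dev)
{
	struct system_device_crosststamp xtstamp;
	int ret;

	/*
	 * No history snapshot is passed, so a counter value that falls
	 * outside the current timekeeping interval returns -EINVAL
	 * instead of being interpolated.
	 */
	ret = get_device_system_crosststamp(my_get_time_fn, dev, NULL, &xtstamp);
	if (ret)
		return ret;

	pr_info("device %lld ns, realtime %lld ns, monoraw %lld ns\n",
		(long long)ktime_to_ns(xtstamp.device),
		(long long)ktime_to_ns(xtstamp.sys_realtime),
		(long long)ktime_to_ns(xtstamp.sys_monoraw));
	return 0;
}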